diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt
index cb2220e9fbc12..a5920d2a9a718 100644
--- a/sycl/CMakeLists.txt
+++ b/sycl/CMakeLists.txt
@@ -249,7 +249,6 @@ install(FILES
 file(GLOB_RECURSE HEADERS_IN_SYCL_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/sycl/*")
 file(GLOB_RECURSE HEADERS_IN_CL_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/CL/*")
 file(GLOB_RECURSE HEADERS_IN_STD_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/std/*")
-file(GLOB_RECURSE HEADERS_IN_SYCLCOMPAT_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/syclcompat/*" "${sycl_inc_dir}/syclcompat.hpp")
 
 string(REPLACE "${sycl_inc_dir}" "${SYCL_INCLUDE_BUILD_DIR}"
   OUT_HEADERS_IN_SYCL_DIR "${HEADERS_IN_SYCL_DIR}")
@@ -293,8 +292,6 @@ add_custom_command(
   COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/sycl ${SYCL_INCLUDE_BUILD_DIR}/sycl
   COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/CL ${SYCL_INCLUDE_BUILD_DIR}/CL
   COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/std ${SYCL_INCLUDE_BUILD_DIR}/std
-  COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/syclcompat ${SYCL_INCLUDE_BUILD_DIR}/syclcompat
-  COMMAND ${CMAKE_COMMAND} -E copy ${sycl_inc_dir}/syclcompat.hpp ${SYCL_INCLUDE_BUILD_DIR}/syclcompat.hpp
   COMMAND ${CMAKE_COMMAND} -E copy ${UR_HEADERS_TO_COPY} ${SYCL_INCLUDE_BUILD_DIR}
   COMMENT "Copying SYCL headers ...")
 
@@ -302,8 +299,6 @@ add_custom_command(
 install(DIRECTORY "${sycl_inc_dir}/sycl" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers)
 install(DIRECTORY "${sycl_inc_dir}/CL" DESTINATION ${SYCL_INCLUDE_DIR}/ COMPONENT sycl-headers)
 install(DIRECTORY "${sycl_inc_dir}/std" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers)
-install(DIRECTORY "${sycl_inc_dir}/syclcompat" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers)
-install(FILES "${sycl_inc_dir}/syclcompat.hpp" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers)
 install(FILES "${UNIFIED_RUNTIME_INCLUDE_DIR}/ur_api.h" DESTINATION ${SYCL_INCLUDE_DIR}
   COMPONENT sycl-headers)
 install(FILES "${UNIFIED_RUNTIME_INCLUDE_DIR}/ur_api_funcs.def" DESTINATION ${SYCL_INCLUDE_DIR}
diff --git a/sycl/doc/index.rst b/sycl/doc/index.rst
index fe3e1078514a8..fa885e8cdb000 100644
--- a/sycl/doc/index.rst
+++ b/sycl/doc/index.rst
@@ -14,7 +14,6 @@ Using oneAPI DPC++ for Application Development
    PreprocessorMacros
    cuda/contents
    Extensions <https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions>
-   syclcompat/README.md
    FAQ
    EnvironmentVariables
    MultiTileCardWithLevelZero
diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
deleted file mode 100644
index 9835d325a966d..0000000000000
--- a/sycl/doc/syclcompat/README.md
+++ /dev/null
@@ -1,3503 +0,0 @@
-**⚠️ DEPRECATION NOTICE ⚠️**
-
-**SYCLcompat is deprecated and will be removed in a future release. Users are encouraged to migrate to native SYCL APIs or alternative compatibility solutions. The `syclcompat` namespace has been marked with `[[deprecated]]` attribute.**
-
-# SYCLcompat
-
-SYCLcompat is a header-only library that intends to help developers familiar
-with other heterogeneous programming models (such as OpenMP, CUDA or HIP) to
-familiarize themselves with the SYCL programming API while porting their
-existing codes. Compatibility tools can also benefit from the reduced API size
-when converting legacy codebases.
-
-SYCLcompat provides:
-
-* A high-level API that provides closer semantics to other programming models,
-simplifying line by line conversions.
-* Alternative submission APIs that encapsulate SYCL-specific "queue" and
-"event" APIs for easier reference.
-* Ability to gradually introduce other SYCL concepts as the user familiarizes
-themselves with the core SYCL API.
-* Clear distinction between core SYCL API and the compatibility interface via
-separate namespaces.
-
-## Notice
-
-Copyright © 2023-2024 Codeplay Software Limited. All rights reserved.
-
-Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of
-The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by
-permission by Khronos.
-
-## Support
-
-SYCLcompat depends on specific oneAPI DPC++ compiler extensions that may not be
-available to all the SYCL 2020 specification implementations.
-
-Specifically, this library depends on the following SYCL extensions:
-
-* [sycl_ext_oneapi_local_memory](
-    ../extensions/supported/sycl_ext_oneapi_local_memory.asciidoc)
-* [sycl_ext_oneapi_complex](
-    ../extensions/experimental/sycl_ext_oneapi_complex.asciidoc)
-* [sycl_ext_oneapi_free_function_queries](
-    ../extensions/supported/sycl_ext_oneapi_free_function_queries.asciidoc)
-* [sycl_ext_oneapi_assert](
-    ../extensions/supported/sycl_ext_oneapi_assert.asciidoc)
-* [sycl_ext_oneapi_enqueue_barrier](
-    ../extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc)
-* [sycl_ext_oneapi_usm_device_read_only](
-    ../extensions/supported/sycl_ext_oneapi_usm_device_read_only.asciidoc)
-* [sycl_ext_oneapi_properties](
-    ../extensions/experimental/sycl_ext_oneapi_properties.asciidoc)
-* [sycl_ext_oneapi_enqueue_functions](
-    ../extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc)
-* [sycl_ext_oneapi_kernel_properties](
-    ../extensions/experimental/sycl_ext_oneapi_kernel_properties.asciidoc)
-
-If available, the following extensions extend SYCLcompat functionality:
-
-* [sycl_ext_intel_device_info](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_intel_device_info.md) \[Optional\]
-* [sycl_ext_oneapi_bfloat16_math_functions](../extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc) \[Optional\]
-* [sycl_ext_oneapi_max_work_group_query](
-  https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_max_work_group_query.md)
-  \[Optional\]
-
-### Hardware Requirements
-
-Some of the functionalities provided by SYCLcompat rely on Unified Shared Memory (`aspect::usm_device_allocations`), though most of the USM-like memory APIs (malloc*, memcpy*, memset*) support hardware with only buffer/accessor support. See section [Buffer Support](#buffer-support) below.
-
-## Usage
-
-All functionality is available under the `syclcompat::` namespace, imported
-through the main header, `syclcompat.hpp`. Note that `syclcompat.hpp` does not
-import the <sycl/sycl.hpp> header.
-
-``` cpp
-#include <syclcompat.hpp>
-```
-
-This document presents the public API under the [Features](#features) section,
-and provides a working [Sample code](#sample-code) using this library. Refer to
-those to learn to use the library.
-
-## Versioning
-
-SYCLcompat adopts [semantic versioning](https://semver.org/)
-(`major.minor.patch`) in a manner which aligns with oneAPI releases. Each oneAPI
-product release has an associated SYCLcompat release. Between oneAPI releases,
-there will be at most one `major` or `minor` bump. In other words, if a given
-oneAPI release has SYCLcompat version `1.0.0`, the next release will have either
-`1.1.0` or, if breaking changes have been made, `2.0.0`. This guarantee has
-implications for code merged to the `sycl` branch, described below.
-
-Between release cycles, ongoing updates to SYCLcompat (including possibly
-breaking changes) are merged into DPC++ via PRs to the
-[`sycl`](https://github.com/intel/llvm/tree/sycl) branch. If a PR introduces the
-*first* breaking changes since the last release, that PR must bump to the next
-`major` version. Otherwise, if the PR introduces *new functionality* and neither
-the `major` nor `minor` have been bumped since the last release, it must bump to
-the next `minor` release. If a PR introduces important bugfixes to existing
-functionality, `patch` should be bumped, and there are no limits to how many
-`patch` bumps can occur between release cycles.
-
-### Release Process
-
-Once all changes planned for a release have been merged, the release process is
-defined as:
-
-1. Check the `major.minor` version associated with the *previous* release.
-2. Confirm the version bump process outlined above has been followed.
-3. If no version bump has occurred since previous release, bump to next `minor`.
-4. oneAPI release is delivered.
-5. Tag the SYCLcompat release on DPC++ repo: `SYCLcompat-major.minor.0`.
-
-### Deprecation Process/Breaking Changes
-
-As outlined above, SYCLcompat may sometimes make API breaking changes, indicated
-with a `major` version bump. Advanced notice (at least one major oneAPI release)
-will be provided via a deprecation warning on the relevant APIs, indicating to
-the user which alternative API should be used instead.
-
-Note that SYCLcompat is currently in pre-release, and until version `1.0.0` we
-do not consider our API to be stable, and may change it with shorter notice.
-
-### Changelog
-
-Since SYCLcompat releases are aligned with oneAPI product releases, the changelog for SYCLcompat is incorporated into [SYCL's Release Notes](https://github.com/intel/llvm/blob/sycl/sycl/ReleaseNotes.md).
-
-### Experimental Namespace
-
-SYCLcompat provides some new experimental features in the `syclcompat::experimental` namespace. This serves as a testing ground for new features which are expected to migrate to `syclcompat::` in time, but the developers do not guarantee either API stability or continued existence of these features; they may be modified or removed without notice. When features are migrated from `syclcompat::experimental` to `syclcompat::`, this will be treated as a `minor` version bump.
-
-## Features
-
-### dim3
-
-SYCLcompat provides a `dim3` class akin to that of CUDA or HIP programming
-models. `dim3` encapsulates other languages iteration spaces that are
-represented with coordinate letters (x, y, z). In SYCL, the fastest-moving
-dimension is the one with the highest index, e.g. in a SYCL 2D range iteration
-space, there are two dimensions, 0 and 1, and 1 will be the one that "moves
-faster". For CUDA/HIP, the convention is reversed: `x` is the dimension which
-moves fastest. `syclcompat::dim3` follows this convention, so that
-`syclcompat::dim3(32, 4)` is equivalent to `sycl::range<2>(4, 32)`, and
-`syclcompat::dim3(32, 4, 2)` is equivalent to `sycl::range<3>(2, 4, 32)`.
-
-```cpp
-namespace syclcompat {
-
-class dim3 {
-public:
-  unsigned int x, y, z;
-  dim3(const sycl::range<3> &r);
-  dim3(const sycl::range<2> &r);
-  dim3(const sycl::range<1> &r);
-  constexpr dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1);
-
-  constexpr size_t size();
-
-  operator sycl::range<3>();
-  operator sycl::range<2>();
-  operator sycl::range<1>();
-};
-
-// Element-wise operators
-inline dim3 operator*(const dim3 &a, const dim3 &b);
-inline dim3 operator+(const dim3 &a, const dim3 &b);
-inline dim3 operator-(const dim3 &a, const dim3 &b);
-
-} // syclcompat
-```
-
-The compatibility headers for SYCL offer a number of convenience functions that
-help the mapping between xyz-based coordinates to SYCL iteration spaces in the
-different scopes available. In addition to the global range, the following
-helper functions are also provided:
-
-``` c++
-namespace syclcompat {
-
-namespace local_id {
-inline size_t x();
-inline size_t y();
-inline size_t z();
-} // namespace local_id
-
-namespace local_range {
-inline size_t x();
-inline size_t y();
-inline size_t z();
-} // namespace local_range
-
-namespace work_group_id {
-inline size_t x();
-inline size_t y();
-inline size_t z();
-} // namespace work_group_id
-
-namespace work_group_range {
-inline size_t x();
-inline size_t y();
-inline size_t z();
-} // namespace work_group_range
-
-namespace global_range {
-inline size_t x();
-inline size_t y();
-inline size_t z();
-} // namespace global_range
-
-namespace global_id {
-inline size_t x();
-inline size_t y();
-inline size_t z();
-} // namespace global_id
-
-} // syclcompat
-```
-
-These translate any kernel dimensions from one convention to the other. An
-example of an equivalent SYCL call for a 3D kernel using `compat` is
-`syclcompat::global_id::x() == get_global_id(2)`.
-
-### launch<function>
-
-SYCLcompat provides a kernel `launch` interface which accepts a function that
-executes on the device (a.k.a "kernel") instead of a lambda/functor. It can be
-called either by using a pair of "teams"/"blocks" and "threads", from
-OpenMP/CUDA terminology, or using a `sycl::nd_range`. The interface accepts a
-device _function_ with the use of an `auto F` template parameter, and a variadic
-`Args` for the function's arguments.
-
-Various overloads for `launch<function>` exist to permit the user to launch on a
-specific `queue`, or to describe the range as either `nd_range` or `dim3, dim3`.
-
-``` c++
-namespace syclcompat {
-
-template <auto F, typename... Args>
-sycl::event launch(const dim3 &grid, const dim3 &threads, Args... args);
-
-template <auto F, int Dim, typename... Args>
-sycl::event launch(const sycl::nd_range<Dim> &range, Args... args);
-
-template <auto F, int Dim, typename... Args>
-sycl::event launch(const sycl::nd_range<Dim> &range,
-                   sycl::queue q, Args... args);
-
-template <auto F, typename... Args>
-sycl::event launch(const dim3 &grid, const dim3 &threads,
-                   sycl::queue q, Args... args);
-
-} // syclcompat
-```
-
-For example, if the user had an existing function named `vectorAdd` to execute
-on a device such as follows:
-
-``` c++
-void vectorAdd(const float *A, const float *B, float *C, int n);
-```
-
-using SYCLcompat, the user can call it as follows:
-
-``` c++
-syclcompat::launch<vectorAdd>(blocksPerGrid, threadsPerBlock, d_A, d_B, d_C, n);
-```
-
-which would be equivalent to the following call using a `sycl::nd_range`:
-
-``` c++
-auto range = sycl::nd_range<3>{blocksPerGrid * threadsPerBlock,
-                               threadsPerBlock};
-syclcompat::launch<vectorAdd>(range, d_A, d_B, d_C, n);
-```
-
-Note that since `syclcompat::launch` accepts a device function, the kernel
-lambda is constructed by SYCLcompat internally. This means that, for
-example, `sycl::local_accessor`s cannot be declared. Instead, users wishing to
-use local memory should launch with a `launch_policy` object as described below.
-
-#### launch_policy
-
-In addition to the simple `syclcompat::launch` interface described above,
-SYCLcompat provides a more flexible (`experimental`) interface to `launch` a
-kernel with a given `launch_policy`. By constructing and passing a
-`launch_policy`, users can pass `sycl::ext::oneapi::experimental::properties`
-associated with the kernel or launch, as well as request **local memory** for
-the kernel.
-
-In order to disambiguate the variadic constructor of `launch_policy`, the
-following wrapper structs are defined. The `kernel_properties` and
-`launch_properties` wrappers can be constructed *either* with a variadc set of
-properties, or with an existing `sycl_exp::properties` object.
-
-```cpp
-namespace syclcompat::experimental {
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-
-// Wrapper for kernel sycl_exp::properties
-template <typename Properties> struct kernel_properties {
-  using Props = Properties;
-  template <typename... Props>
-  kernel_properties(Props... properties);
-  template <typename... Props>
-  kernel_properties(sycl_exp::properties<Props...> properties)
-  Properties props;
-};
-
-// Wrapper for launch sycl_exp::properties
-template <typename Properties> struct launch_properties {
-  using Props = Properties;
-  template <typename... Props>
-  launch_properties(Props... properties);
-  template <typename... Props>
-  launch_properties(sycl_exp::properties<Props...> properties)
-  Properties props;
-};
-
-// Wrapper for local memory size
-struct local_mem_size {
-  local_mem_size(size_t size = 0);
-  size_t size;
-};
-
-} //namespace syclcompat::experimental
-```
-
-The constructors of `launch_policy` are variadic, accepting any form of range
-(`nd_range`, `range`, `dim3`, `dim3, dim3`), followed by zero or more of
-`local_memory_size`, `kernel_properties`, and `launch_properties`:
-
-``` c++
-namespace syclcompat::experimental {
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-
-// launch_policy is constructed by the user & passed to `compat_exp::launch`
-template <typename Range, typename KProps, typename LProps, bool LocalMem>
-class launch_policy {
-public:
-  using KPropsT = KProps;
-  using LPropsT = LProps;
-  using RangeT = Range;
-  static constexpr bool HasLocalMem = LocalMem;
-
-  template <typename... Ts>
-  launch_policy(Range range, Ts... ts);
-
-  template <typename... Ts>
-  launch_policy(dim3 global_range, Ts... ts);
-
-  template <typename... Ts>
-  launch_policy(dim3 global_range, dim3 local_range, Ts... ts);
-
-  KProps get_kernel_properties();
-  LProps get_launch_properties();
-  size_t get_local_mem_size();
-  Range get_range();
-};
-} //namespace syclcompat::experimental
-```
-
-The `launch` overloads accepting a `launch_policy` are:
-
-```cpp
-namespace syclcompat::experimental {
-
-template <auto F, typename LaunchPolicy, typename... Args>
-sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args);
-
-template <auto F, typename LaunchPolicy, typename... Args>
-sycl::event launch(LaunchPolicy launch_policy, Args... args);
-} //namespace syclcompat::experimental
-
-```
-
-For local memory, `launch<function>` injects a `char *` pointer to the beginning
-of a local accessor of the requested `local_mem_size` as the last argument of
-the kernel function. This `char *` can then be reinterpreted as the datatype
-required by the user within the kernel function.
-
-For example, the previous function named `vectorAdd` can be modified
-with the following signature, which adds a `char *` pointer to access local
-memory inside the kernel:
-
-``` c++
-void vectorAdd(const float *A, const float *B, float *C, int n,
-               char *local_mem);
-```
-
-Then, the new `vectorAdd` can be launched like this:
-
-``` c++
-using syclcompat::experimental;
-launch_policy policy{blocksPerGrid, threadsPerBlock,
-                      local_mem_size(nbytes)};
-launch<vectorAdd>(policy, d_A, d_B, d_C, n);
-```
-
-To request a different cache/local memory split on supported hardware:
-
-```c++
-using syclcompat::experimental;
-namespace sycl_intel_exp = sycl::ext::intel::experimental;
-
-sycl_intel_exp::cache_config cache_config{
-    sycl_intel_exp::large_slm};
-kernel_properties kernel_props{cache_config};
-launch_policy policy{blocksPerGrid, threadsPerBlock,
-                      local_mem_size(nbytes), kernel_props};
-
-launch<vectorAdd>(policy, d_A, d_B, d_C, n);
-```
-
-To request a certain cluster dimension on supported hardware:
-
-```c++
-using syclcompat::experimental;
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-
-sycl_exp::cuda::cluster_size cluster_dims(cluster_range);
-launch_policy policy{blocksPerGrid, threadsPerBlock,
-                                  local_mem_size(nbytes), 
-                                  launch_properties{cluster_dims}};
-
-launch<vectorAdd>(policy, d_A, d_B, d_C, n);
-```
-
-### Utilities
-
-SYCLcompat introduces a set of utility functions designed to streamline the
-usage of the library and its `launch<function>` mechanism.
-
-The first utility function is `syclcompat::wg_barrier()`, which provides a
-concise work-group barrier. `syclcompat::wg_barrier()` uses the
-_SYCL_INTEL_free_function_queries_ extension to provide this functionality.
-
-The second utility function, `syclcompat::compute_nd_range`, ensures that the
-provided global size and work group sizes are appropriate for a given
-dimensionality, and that global size is rounded up to a multiple of the work
-group size in each dimension.
-
-```c++
-namespace syclcompat {
-
-inline void wg_barrier();
-
-template <int Dim>
-inline sycl::nd_range<Dim> compute_nd_range(sycl::range<Dim> global_size_in,
-                                            sycl::range<Dim> work_group_size);
-inline sycl::nd_range<1> compute_nd_range(int global_size_in, 
-                                          int work_group_size);
-
-} // syclcompat
-```
-
-### Queues
-
-The design for this library assumes _in-order_ queues
-(`sycl::property::queue::in_order()`).
-
-Many of the APIs accept an optional `queue` parameter, and this can be an
-out-of-order queue, either created manually or retrieved via a call to
-`syclcompat::create_queue()`, specifying `false` for the `in_order` parameter.
-
-```c++
-namespace syclcompat {
-
-inline sycl::queue create_queue(bool print_on_async_exceptions = false,
-                                bool in_order = true);
-
-} // syclcompat
-```
-
-However, SYCLcompat does not implement any mechanisms to deal with this case.
-The rationale for this is that a user wanting the full power of SYCL's
-dependency management shouldn't be using the this library. As such, support for
-out-of-order queues is very limited. The only way to safely use an out-of-order
-queue at present is to explicitly `q.wait()` or `e.wait()` where `e` is the
-`sycl::event` returned through a `syclcompat::async` API.
-
-To facilitate machine translation from other heterogeneous programming models to
-SYCL, SYCLcompat provides the following pointer aliases for `sycl::event` and
-`sycl::queue`, and the function `destroy_event` which destroys an `event_ptr`
-allocated on the heap.
-
-``` c++
-namespace syclcompat {
-
-using event_ptr = sycl::event *;
-
-using queue_ptr = sycl::queue *;
-
-static void destroy_event(event_ptr event);
-
-} // syclcompat
-```
-
-### Memory Operations
-
-This library provides interfaces to allocate memory to be accessed within kernel
-functions and on the host. The `syclcompat::malloc` function allocates device
-USM memory, the `syclcompat::malloc_host` function allocates host USM memory,
-and the `syclcompat::malloc_shared` function allocates shared USM memory.
-
-In each case we provide a template and non-templated interface for allocating
-memory, taking the number of elements or number of bytes respectively.
-
-The interface includes both synchronous and asynchronous `malloc`, `memcpy`,
-`memset`, `fill`, and `free` operations.
-
-There is a helper class `pointer_attributes` to query allocation type for memory
-pointers using SYCLcompat, through `sycl::usm::alloc` and
-`sycl::get_pointer_device`.
-
-``` c++
-namespace syclcompat {
-
-// Expects number of elements
-template <typename T>
-T *malloc(size_t count, sycl::queue q = get_default_queue());
-template <typename T>
-T *malloc_host(size_t count, sycl::queue q = get_default_queue());
-template <typename T>
-T *malloc_shared(size_t count, sycl::queue q = get_default_queue());
-
-// Expects size of the memory in bytes
-void *malloc(size_t num_bytes, sycl::queue q = get_default_queue());
-void *malloc_host(size_t num_bytes, sycl::queue q = get_default_queue());
-void *malloc_shared(size_t num_bytes, sycl::queue q = get_default_queue());
-
-// 2D, 3D memory allocation wrappers
-void *malloc(size_t &pitch, size_t x, size_t y,
-             sycl::queue q = get_default_queue())
-pitched_data malloc(sycl::range<3> size, sycl::queue q = get_default_queue());
-
-// Blocking memcpy
-void memcpy(void *to_ptr, const void *from_ptr, size_t size,
-            sycl::queue q = get_default_queue());
-void memcpy(T *to_ptr, const T *from_ptr, size_t count,
-            sycl::queue q = get_default_queue());
-void memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
-            size_t from_pitch, size_t x, size_t y,
-            sycl::queue q = get_default_queue()); // 2D matrix
-void memcpy(pitched_data to, sycl::id<3> to_pos,
-            pitched_data from, sycl::id<3> from_pos,
-            sycl::range<3> size,
-            sycl::queue q = get_default_queue()); // 3D matrix
-
-// Non-blocking memcpy
-sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size,
-                         sycl::queue q = get_default_queue());
-template <typename T>
-sycl::event memcpy_async(T *to_ptr, T void *from_ptr, size_t count,
-                         sycl::queue q = get_default_queue());
-sycl::event memcpy_async(void *to_ptr, size_t to_pitch,
-                         const void *from_ptr, size_t from_pitch,
-                         size_t x, size_t y,
-                         sycl::queue q = get_default_queue()); // 2D matrix
-sycl::event memcpy_async(pitched_data to, sycl::id<3> to_pos,
-                         pitched_data from, sycl::id<3> from_pos,
-                         sycl::range<3> size,
-                         sycl::queue q = get_default_queue()); // 3D matrix
-
-// Fill
-template <class T>
-void fill(void *dev_ptr, const T &pattern, size_t count,
-          sycl::queue q = get_default_queue());
-template <typename T>
-sycl::event fill_async(void *dev_ptr, const T &pattern,
-                       size_t count, sycl::queue q = get_default_queue());
-
-// Memset
-void memset(void *dev_ptr, int value, size_t size,
-                   sycl::queue q = get_default_queue());
-void memset(void *ptr, size_t pitch, int val, size_t x, size_t y,
-            sycl::queue q = get_default_queue()); // 2D matrix
-void memset(pitched_data pitch, int val, sycl::range<3> size,
-                          sycl::queue q = get_default_queue()); // 3D matrix
-sycl::event memset_async(void *dev_ptr, int value, size_t size,
-                         sycl::queue q = get_default_queue());
-sycl::event memset_async(void *ptr, size_t pitch, int val,
-                         size_t x, size_t y,
-                         sycl::queue q = get_default_queue()); // 2D matrix
-sycl::event memset_async(pitched_data pitch, int val,
-                         sycl::range<3> size,
-                         sycl::queue q = get_default_queue()); // 3D matrix
-
-// Free
-void wait_and_free(void *ptr, sycl::queue q = get_default_queue());
-void free(void *ptr, sycl::queue q = get_default_queue());
-sycl::event enqueue_free(const std::vector<void *> &pointers,
-                         const std::vector<sycl::event> &events,
-                         sycl::queue q = get_default_queue());
-
-// Queries pointer allocation type
-class pointer_attributes {
-public:
-  void init(const void *ptr, sycl::queue q = get_default_queue());
-  sycl::usm::alloc get_memory_type();
-  const void *get_device_pointer();
-  const void *get_host_pointer();
-  bool is_memory_shared();
-  unsigned int get_device_id();
-};
-
-} // syclcompat
-```
-
-The `syclcompat::experimental` namespace contains currently unsupported `memcpy` overloads which take a `syclcompat::experimental::memcpy_parameter` argument. These are included for forwards compatibility and currently throw a `std::runtime_error`.
-
-```cpp
-namespace syclcompat {
-namespace experimental {
-// Forward declarations for types relating to unsupported memcpy_parameter API:
-
-#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
-class image_mem_wrapper;
-#endif
-class image_matrix;
-
-/// Memory copy parameters for 2D/3D memory data.
-struct memcpy_parameter {
-  struct data_wrapper {
-    pitched_data pitched{};
-    sycl::id<3> pos{};
-#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
-    experimental::image_mem_wrapper *image_bindless{nullptr};
-#endif
-    image_matrix *image{nullptr};
-  };
-  data_wrapper from{};
-  data_wrapper to{};
-  sycl::range<3> size{};
-};
-
-/// [UNSUPPORTED] Synchronously copies 2D/3D memory data specified by \p param .
-/// The function will return after the copy is completed.
-///
-/// \param param Memory copy parameters.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static inline void memcpy(const memcpy_parameter &param,
-                          sycl::queue q = get_default_queue());
-
-/// [UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by \p param
-/// . The return of the function does NOT guarantee the copy is completed.
-///
-/// \param param Memory copy parameters.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static inline void memcpy_async(const memcpy_parameter &param,
-                                sycl::queue q = get_default_queue());
-
-} // namespace experimental
-} // namespace syclcompat
-```
-
-Finally, the class `pitched_data`, which manages memory allocation for 3D
-spaces, padded to avoid uncoalesced memory accesses.
-
-```c++
-namespace syclcompat {
-
-class pitched_data {
-public:
-  pitched_data();
-  pitched_data(void *data, size_t pitch, size_t x, size_t y);
-
-  void *get_data_ptr();
-  size_t get_pitch();
-  size_t get_x();
-  size_t get_y();
-
-  void set_data_ptr(void *data);
-  void set_pitch(size_t pitch);
-  void set_x(size_t x);
-  void set_y(size_t y);
-};
-
-} // syclcompat
-```
-
-There are various helper classes and aliases defined within SYCLcompat to
-encapsulate and define memory operations and objects. These classes and aliases
-are primarily designed to assist with machine translation from other
-heterogeneous programming models.
-
-The wrapper class `device_memory` provides a unified representation for device
-memory in various regions. The class provides methods to allocate memory for the
-object (`init()`) and access the underlying memory in various ways (`get_ptr()`,
-`get_access()`, `operator[]`). Aliases for global and USM shared specializations
-are provided.
-
-The `memory_traits` class is provided as a traits helper for `device_memory`.
-The `accessor` class template provides a 2D or 3D `sycl::accessor`-like wrapper
-around raw pointers.
-
-```c++
-namespace syclcompat {
-
-enum class memory_region {
-  global = 0, // device global memory
-  constant,   // device read-only memory
-  local,      // device local memory
-  usm_shared, // memory which can be accessed by host and device
-};
-
-using byte_t = uint8_t;
-
-template <memory_region Memory, class T = byte_t> class memory_traits {
-public:
-  static constexpr sycl::access::address_space asp =
-      (Memory == memory_region::local)
-          ? sycl::access::address_space::local_space
-          : sycl::access::address_space::global_space;
-  static constexpr sycl::target target =
-      (Memory == memory_region::local)
-          ? sycl::target::local
-          : sycl::target::device;
-  static constexpr sycl::access_mode mode =
-      (Memory == memory_region::constant)
-          ? sycl::access_mode::read
-          : sycl::access_mode::read_write;
-  static constexpr size_t type_size = sizeof(T);
-  using element_t =
-      typename std::conditional_t<Memory == constant, const T, T>;
-  using value_t = typename std::remove_cv_t<T>;
-  template <size_t Dimension = 1>
-  using accessor_t = typename std::conditional_t<
-      target == sycl::target::local,
-      sycl::local_accessor<T, Dimension>,
-      sycl::accessor<T, Dimension, mode>>;
-  using pointer_t = T *;
-};
-
-template <class T, memory_region Memory, size_t Dimension> class device_memory {
-public:
-  using accessor_t =
-      typename memory_traits<Memory, T>::template accessor_t<Dimension>;
-  using value_t = typename memory_traits<Memory, T>::value_t;
-  using syclcompat_accessor_t =
-      syclcompat::accessor<T, Memory, Dimension>;
-
-  device_memory();
-
-  device_memory(const sycl::range<Dimension> &in_range,
-                std::initializer_list<value_t> &&init_list);
-
-  template <size_t D = Dimension>
-  device_memory(
-      const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
-      std::initializer_list<std::initializer_list<value_t>> &&init_list);
-
-  device_memory(const sycl::range<Dimension> &range_in);
-
-  // Variadic constructor taking 1, 2 or 3 integers to be interpreted as a
-  // sycl::range<Dim>.
-  template <class... Args>
-  device_memory(Args... Arguments);
-
-  ~device_memory();
-
-  // Allocate memory with default queue, and init memory if has initial value.
-  void init();
-  // Allocate memory with specified queue, and init memory if has initial
-  // value.
-  void init(sycl::queue q);
-
-  // The variable is assigned to a device pointer.
-  void assign(value_t *src, size_t size);
-
-  // Get memory pointer of the memory object, which is virtual pointer when
-  // usm is not used, and device pointer when usm is used.
-  value_t *get_ptr();
-  // Get memory pointer of the memory object, which is virtual pointer when
-  // usm is not used, and device pointer when usm is used.
-  value_t *get_ptr(sycl::queue q);
-
-  // Get the device memory object size in bytes.
-  size_t get_size();
-
-  template <size_t D = Dimension>
-  typename std::enable_if<D == 1, T>::type &operator[](size_t index);
-
-  // Get accessor with dimension info for the device memory object
-  // when usm is used and dimension is greater than 1.
-  template <size_t D = Dimension>
-  typename std::enable_if<D != 1, syclcompat_accessor_t>::type
-  get_access(sycl::handler &cgh);
-};
-
-
-template <class T, memory_region Memory>
-class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
-public:
-  using base = device_memory<T, Memory, 1>;
-  using value_t = typename base::value_t;
-  using accessor_t =
-      typename memory_traits<Memory, T>::template accessor_t<0>;
-  device_memory(const value_t &val);
-  device_memory();
-};
-
-template <class T, size_t Dimension>
-using global_memory = device_memory<T, memory_region::global, Dimension>;
-template <class T, size_t Dimension>
-using constant_memory = device_memory<T, memory_region::constant, Dimension>;
-template <class T, size_t Dimension>
-using shared_memory = device_memory<T, memory_region::usm_shared, Dimension>;
-
-
-template <class T, memory_region Memory, size_t Dimension> class accessor;
-
-template <class T, memory_region Memory> class accessor<T, Memory, 3> {
-public:
-  using memory_t = memory_traits<Memory, T>;
-  using element_t = typename memory_t::element_t;
-  using pointer_t = typename memory_t::pointer_t;
-  using accessor_t = typename memory_t::template accessor_t<3>;
-
-  accessor(pointer_t data, const sycl::range<3> &in_range);
-  template <memory_region M = Memory>
-  accessor(typename std::enable_if<M != memory_region::local,
-                                   const accessor_t>::type &acc);
-  accessor(const accessor_t &acc, const sycl::range<3> &in_range);
-
-  accessor<T, Memory, 2> operator[](size_t index) const;
-
-  pointer_t get_ptr() const;
-
-};
-
-template <class T, memory_region Memory> class accessor<T, Memory, 2> {
-public:
-  using memory_t = memory_traits<Memory, T>;
-  using element_t = typename memory_t::element_t;
-  using pointer_t = typename memory_t::pointer_t;
-  using accessor_t = typename memory_t::template accessor_t<2>;
-
-  accessor(pointer_t data, const sycl::range<2> &in_range);
-  template <memory_region M = Memory>
-  accessor(typename std::enable_if<M != memory_region::local,
-                                   const accessor_t>::type &acc);
-  accessor(const accessor_t &acc, const sycl::range<2> &in_range);
-
-  pointer_t operator[](size_t index);
-
-  pointer_t get_ptr() const;
-};
-
-} // syclcompat
-```
-
-#### Buffer Support
-
-Although SYCLcompat is primarily designed around the Unified Shared Memory
-model, there is (limited) support for the buffer/accessor model. This can be
-enabled by setting the compiler define `SYCLCOMPAT_USM_LEVEL_NONE`. This macro
-instructs SYCLcompat to effectively provide emulated USM pointers via a Memory
-Manager singleton.
-
-Note that in `SYCLCOMPAT_USM_LEVEL_NONE` mode, the pointers returned by e.g.
-`syclcompat::malloc`, and passed to `syclcompat::memcpy` can *only* interact
-with `syclcompat` APIs. It is legal to perform pointer arithmetic on these
-virtual pointers, but attempting to dereference them, passing them to `sycl`
-APIs, or passing them into kernels will result in an error.
-
-The SYCLcompat tests with the suffix `_usmnone.cpp` provide examples of how to
-use `SYCLCOMPAT_USM_LEVEL_NONE`.
-
-### ptr_to_int
-
-The following cuda backend specific function is introduced in order to
-translate from local memory pointers to `uint32_t` or `size_t` variables that
-contain a byte address to the local (local refers to `.shared` in nvptx) memory
-state space.
-
-``` c++
-namespace syclcompat {
-template <typename T>
-__syclcompat_inline__
-    std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
-                     T>
-    ptr_to_int(void *ptr)
-} // namespace syclcompat
-```
-
-These variables can be used in inline PTX instructions that take address
-operands. Such inline PTX instructions are commonly used in optimized
-libraries. A simplified example usage of the above functions is as follows:
-
-``` c++
-  half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
-  // ...
-  // ...
-  T addr =
-      syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) + (id % 8) * 16);
-  uint32_t fragment;
-#if defined(__NVPTX__)
-  asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
-               : "=r"(fragment)
-               : "r"(addr));
-#endif
-```
-
-### Device Information
-
-`sycl::device` properties are encapsulated using the `device_info` helper class.
-The class is meant to be constructed and used through the extended device
-implemented in SYCLcompat.
-
-This is the synopsis of `device_info`:
-
-```c++
-class device_info {
-public:
-  const char *get_name() const;
-  char *get_name();
-  template <typename WorkItemSizesTy = sycl::range<3>,
-            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::id<3>> ||
-                                 std::is_same_v<WorkItemSizesTy, int *>,
-                             int> = 0>
-  auto get_max_work_item_sizes() const;
-
-  template <typename WorkItemSizesTy = sycl::range<3>,
-          std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::id<3>> ||
-                                std::is_same_v<WorkItemSizesTy, int *>,
-                            int> = 0>
-  auto get_max_work_item_sizes();
-  bool get_host_unified_memory() const;
-  int get_major_version() const;
-  int get_minor_version() const;
-  int get_integrated() const;
-  int get_max_clock_frequency() const;
-  int get_max_compute_units() const;
-  int get_max_work_group_size() const;
-  int get_max_sub_group_size() const;
-  int get_max_work_items_per_compute_unit() const;
-  int get_max_register_size_per_work_group() const;
-  template <typename NDRangeSizeTy = size_t *,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                 std::is_same_v<NDRangeSizeTy, int *>,
-                             int> = 0>
-  auto get_max_nd_range_size() const;
-  template <typename NDRangeSizeTy = size_t *,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                 std::is_same_v<NDRangeSizeTy, int *>,
-                             int> = 0>
-  auto get_max_nd_range_size();
-  size_t get_global_mem_size() const;
-  size_t get_local_mem_size() const;
-
-  unsigned int get_memory_clock_rate() const;
-  unsigned int get_memory_bus_width() const;
-  uint32_t get_device_id() const;
-  std::array<unsigned char, 16> get_uuid() const;
-  unsigned int get_global_mem_cache_size() const;
-  int get_image1d_max() const;
-  auto get_image2d_max() const;
-  auto get_image2d_max();
-  auto get_image3d_max() const;
-  auto get_image3d_max();
-
-  void set_name(const char *name);
-  void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes);
-  [[deprecated]] void
-  set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes);
-  void set_host_unified_memory(bool host_unified_memory);
-  void set_major_version(int major);
-  void set_minor_version(int minor);
-  void set_integrated(int integrated);
-  void set_max_clock_frequency(int frequency);
-  void set_max_compute_units(int max_compute_units);
-  void set_global_mem_size(size_t global_mem_size);
-  void set_local_mem_size(size_t local_mem_size);
-  void set_max_work_group_size(int max_work_group_size);
-  void set_max_sub_group_size(int max_sub_group_size);
-  void
-  set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit);
-  void set_max_nd_range_size(int max_nd_range_size[]);
-  void set_max_nd_range_size(sycl::id<3> max_nd_range_size);
-  void set_memory_clock_rate(unsigned int memory_clock_rate);
-  void set_memory_bus_width(unsigned int memory_bus_width);
-  void 
-  set_max_register_size_per_work_group(int max_register_size_per_work_group);
-  void set_device_id(uint32_t device_id);
-  void set_uuid(std::array<unsigned char, 16> uuid);
-  void set_global_mem_cache_size(unsigned int global_mem_cache_size);
-  void set_image1d_max(size_t image_max_buffer_size);
-  void set_image2d_max(size_t image_max_width_buffer_size,
-                       size_t image_max_height_buffer_size);
-  void set_image3d_max(size_t image_max_width_buffer_size,
-                       size_t image_max_height_buffer_size,
-                       size_t image_max_depth_buffer_size);
-};
-```
-
-### Device Management
-
-Multiple SYCL functionalities are exposed through utility functions to manage
-the current `sycl::device`, `sycl::queue`, and `sycl::context`, exposed as
-follows:
-
-```c++
-namespace syclcompat {
-
-// Util function to create a new queue for the current device
-static inline sycl::queue create_queue(bool print_on_async_exceptions = false,
-                                       bool in_order = true);
-
-// Util function to get the default queue of current device in
-// device manager.
-static inline sycl::queue get_default_queue();
-
-// Util function to set the default queue of the current device in the
-// device manager.
-// If the device extension saved queue is the default queue, 
-// the previous saved queue will be overwritten as well.
-// This function will be blocking if there are submitted kernels in the
-// previous default queue.
-static inline void set_default_queue(const sycl::queue &q);
-
-// Util function to wait for the queued kernels.
-static inline void wait(sycl::queue q = get_default_queue());
-
-// Util function to wait for the queued kernels and throw unhandled errors.
-static inline void wait_and_throw(sycl::queue q = get_default_queue());
-
-// Util function to get the id of current device in
-// device manager.
-static inline unsigned int get_current_device_id();
-
-// Util function to get the current device.
-static inline device_ext &get_current_device();
-
-// Util function to get a device by id.
-static inline device_ext &get_device(unsigned int id);
-
-// Util function to get the context of the default queue of current
-// device in device manager.
-static inline sycl::context get_default_context();
-
-// Util function to get a CPU device.
-static inline device_ext &cpu_device();
-
-/// Filter out devices; only keep the device whose name contains one of the
-/// subname in \p dev_subnames.
-/// May break device id mapping and change current device. It's better to be
-/// called before other SYCLcompat or SYCL APIs.
-static inline void filter_device(const std::vector<std::string> &dev_subnames);
-
-/// Print all the devices (and their IDs) in the dev_mgr
-static inline void list_devices();
-
-// Util function to select a device by its id
-static inline unsigned int select_device(unsigned int id);
-
-// Util function to get the device id from a device
-static inline unsigned int get_device_id(const sycl::device &dev);
-
-// Util function to get the number of available devices
-static inline unsigned int device_count();
-
-// Util function to check whether a device supports some kinds of sycl::aspect.
-static inline void
-has_capability_or_fail(const sycl::device &dev,
-                       const std::initializer_list<sycl::aspect> &props);
-} // syclcompat
-```
-
-The exposed functionalities include creation and destruction of queues, through
-`syclcompat::create_queue` and `syclcompat::destroy_queue`, and providing the
-ability to wait for submitted kernels using `syclcompat::wait` or
-`syclcompat::wait_and_throw`. Any async errors will be output to `stderr` if
-`print_on_async_exceptions` is true; otherwise the default behavior applies,
-which calls `std::terminate`. Synchronous exceptions have to be managed by users
-independently of what is set in this parameter.
-
-Devices are managed through a helper class, `device_ext`. The `device_ext` class
-associates a vector of `sycl::queues` with its `sycl::device`. The `device_ext`
-destructor waits on a set of `sycl::event` which can be added to via
-`add_event`. This is used, for example, to implement `syclcompat::enqueue_free` to
-schedule release of memory after a kernel or `memcpy`. SYCL device properties
-can be queried through `device_ext` as well.
-`device_ext` also provides the `has_capability_or_fail` member function, which
-throws a `sycl::exception` if the device does not have the specified list of
-`sycl::aspect`.
-
-Devices can be listed and filtered using `syclcompat::list_devices()` and
-`syclcompat::filter_device()`. If `SYCLCOMPAT_VERBOSE` is defined at compile
-time, the available SYCL devices are printed to the standard output both at
-initialization time, and when the device list is filtered using
-`syclcompat::filter_device`.
-
-Users can manage queues through the `syclcompat::set_default_queue(sycl::queue
-q)` free function, and the `device_ext` `set_saved_queue`, `set_default_queue`,
-and `get_saved_queue` member functions.
-`set_default_queue` is blocking, and overwrites the previous default queue with
-a user defined one, waiting for any submitted kernels to finish.
-The `device_ext` automatically sets the saved queue to the default queue.
-Therefore, it's important to note that if the previous default queue was the
-device's saved queue, setting a new default queue will update the reference of
-the saved queue to the new default one to keep the state of the class
-consistent.
-
-The class is exposed as follows:
-
-```c++
-namespace syclcompat {
-
-class device_ext : public sycl::device {
-  device_ext();
-  device_ext(const sycl::device &base, bool print_on_async_exceptions = false,
-             bool in_order = true);
-  ~device_ext();
-
-  bool is_native_host_atomic_supported();
-  int get_major_version() const;
-  int get_minor_version() const;
-  int get_max_compute_units() const;
-  int get_max_clock_frequency() const;
-  int get_integrated() const;
-  int get_max_sub_group_size() const;
-  int get_max_register_size_per_work_group() const;
-  int get_max_work_group_size() const;
-  int get_mem_base_addr_align() const;
-  size_t get_global_mem_size() const;
-  size_t get_local_mem_size() const;
-  void get_memory_info(size_t &free_memory, size_t &total_memory) const;
-
-  void get_device_info(device_info &out) const;
-  device_info get_device_info() const;
-  void reset(bool print_on_async_exceptions = false, bool in_order = true);
-
-  sycl::queue *default_queue();
-  void set_default_queue(const sycl::queue &q);
-  void queues_wait_and_throw();
-  sycl::queue *create_queue(bool print_on_async_exceptions = false,
-                            bool in_order = true);
-  void destroy_queue(sycl::queue *&queue);
-  void set_saved_queue(sycl::queue *q);
-  sycl::queue *get_saved_queue();
-  sycl::context get_context();
-
-  void
-  has_capability_or_fail(const std::initializer_list<sycl::aspect> &props) const;
-};
-
-} // syclcompat
-```
-
-Free functions are provided for querying major and minor version directly from a `sycl::device`, equivalent to the methods of `device_ext` described above:
-
-```c++
-static int get_major_version(const sycl::device &dev);
-static int get_minor_version(const sycl::device &dev);
-```
-
-#### Multiple devices
-
-SYCLcompat allows you to manage multiple devices through
-`syclcompat::select_device` and `syclcompat::create_queue`. The library uses the
-default SYCL device (i.e. the device returned by `sycl::default_selector_v`) as
-the default device, and exposes all other devices available on the system
-through the `syclcompat::select_device(unsigned int id)` member function.
-
-The interface uses the `syclcompat::device_ext::get_current_device_id()` to get
-the current CPU thread, and returns the associated device stored internally as a
-map with that thread. The map is constructed using calls to
-`syclcompat::select_device(unsigned int id)`. Any thread which hasn't used this
-member function to select a device will be given the default device. Note that
-this implies multiple threads on a single device by default.
-
-Be aware that targeting multiple devices may lead to unintended behavior caused
-by developers, as SYCLcompat does not implement a mechanism to warn when the
-wrong queue is used as an argument in any of the member functions of the
-`syclcompat` namespace.
-
-#### Atomic Operations
-
-SYCLcompat provides an interface for common atomic operations (`add`, `sub`,
-`and`, `or`, `xor`, `min`, `max`, `inc`, `dec`, `exchange`, `compare_exchange`).
-While SYCL exposes atomic operations through member functions of
-`sycl::atomic_ref`, this library provides access via functions taking a standard
-pointer argument. Template arguments control the `sycl::memory_scope`,
-`sycl::memory_order` and `sycl::access::address_space` of these atomic
-operations. SYCLcompat also exposes overloads for these atomic functions which
-take a runtime memoryScope argument. Every atomic operation is implemented via
-an API function taking a raw pointer as the target. Additional overloads for
-`syclcompat::compare_exchange_strong` are provided which take a
-`sycl::multi_ptr` instead of a raw pointer. The type of the operand for most
-atomic operations is defined as `syclcompat::type_identity_t<T>` to avoid
-template deduction issues when an operand of a different type (e.g. double
-literal) is supplied. Atomic addition and subtraction free functions make use of
-`syclcompat::arith_t<T>` to differentiate between numeric and pointer
-arithmetic.
-
-The available operations are exposed as follows:
-
-``` c++
-namespace syclcompat {
-
-template <class T> struct type_identity {
-  using type = T;
-};
-template <class T> using type_identity_t = typename type_identity<T>::type;
-
-template <typename T> struct arith {
-  using type = std::conditional_t<std::is_pointer_v<T>, std::ptrdiff_t, T>;
-};
-template <typename T> using arith_t = typename arith<T>::type;
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_fetch_add(T *addr, arith_t<T> operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_fetch_sub(T *addr, arith_t<T> operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_fetch_and(T *addr, type_identity_t<T> operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_fetch_or(T *addr, type_identity_t<T> operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_fetch_xor(T *addr, type_identity_t<T> operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_fetch_min(T *addr, type_identity_t<T> operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_fetch_max(T *addr, type_identity_t<T> operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-unsigned int atomic_fetch_compare_inc(unsigned int *addr,
-                                      unsigned int operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-unsigned int atomic_fetch_compare_dec(unsigned int *addr,
-                                      unsigned int operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_exchange(T *addr, type_identity_t<T> operand);
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_compare_exchange_strong(
-    sycl::multi_ptr<T, addressSpace> addr, type_identity_t<T> expected,
-    type_identity_t<T> desired,
-    sycl::memory_order success = sycl::memory_order::relaxed,
-    sycl::memory_order fail = sycl::memory_order::relaxed);
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_compare_exchange_strong(
-    T *addr, T expected, T desired,
-    sycl::memory_order success = sycl::memory_order::relaxed,
-    sycl::memory_order fail = sycl::memory_order::relaxed);
-
-} // namespace syclcompat
-```
-
-SYCLcompat also provides an atomic class with the `store`, `load`, `exchange`,
-`compare_exchange_weak`, `fetch_add`, and `fetch_sub` operations. The atomic
-class wrapper supports int, unsigned int, long, unsigned long, long long,
-unsigned long long, float, double and pointer datatypes.
-
-```cpp
-namespace syclcompat {
-
-template <typename T,
-          sycl::memory_scope DefaultScope = sycl::memory_scope::system,
-          sycl::memory_order DefaultOrder = sycl::memory_order::seq_cst,
-          sycl::access::address_space Space =
-              sycl::access::address_space::generic_space>
-class atomic {
-  static constexpr sycl::memory_order default_read_order =
-      sycl::atomic_ref<T, DefaultOrder, DefaultScope,
-                       Space>::default_read_order;
-  static constexpr sycl::memory_order default_write_order =
-      sycl::atomic_ref<T, DefaultOrder, DefaultScope,
-                       Space>::default_write_order;
-  static constexpr sycl::memory_scope default_scope = DefaultScope;
-  static constexpr sycl::memory_order default_read_modify_write_order =
-      DefaultOrder;
-
-  constexpr atomic() noexcept = default;
-
-  constexpr atomic(T d) noexcept;
-
-  void store(T operand, sycl::memory_order memoryOrder = default_write_order,
-             sycl::memory_scope memoryScope = default_scope) noexcept;
-
-  T load(sycl::memory_order memoryOrder = default_read_order,
-         sycl::memory_scope memoryScope = default_scope) const noexcept;
-
-  T exchange(T operand,
-             sycl::memory_order memoryOrder = default_read_modify_write_order,
-             sycl::memory_scope memoryScope = default_scope) noexcept;
-
-  bool compare_exchange_weak(
-      T &expected, T desired, sycl::memory_order success,
-      sycl::memory_order failure,
-      sycl::memory_scope memoryScope = default_scope) noexcept;
-
-  bool compare_exchange_weak(
-      T &expected, T desired,
-      sycl::memory_order memoryOrder = default_read_modify_write_order,
-      sycl::memory_scope memoryScope = default_scope) noexcept;
-
-  bool compare_exchange_strong(
-      T &expected, T desired, sycl::memory_order success,
-      sycl::memory_order failure,
-      sycl::memory_scope memoryScope = default_scope) noexcept;
-
-  bool compare_exchange_strong(
-      T &expected, T desired,
-      sycl::memory_order memoryOrder = default_read_modify_write_order,
-      sycl::memory_scope memoryScope = default_scope) noexcept;
-
-  T fetch_add(arith_t<T> operand,
-              sycl::memory_order memoryOrder = default_read_modify_write_order,
-              sycl::memory_scope memoryScope = default_scope) noexcept;
-
-  T fetch_sub(arith_t<T> operand,
-              sycl::memory_order memoryOrder = default_read_modify_write_order,
-              sycl::memory_scope memoryScope = default_scope) noexcept;
-};
-
-} // namespace syclcompat
-```
-
-### Compatibility Utilities
-
-This library provides a number of small compatibility utilities which exist to
-facilitate machine translation of code from other programming models to SYCL.
-These functions are part of the public API, but they are not expected to be
-useful to developers writing their own code.
-
-Functionality is provided to represent a pair of integers as a `double`.
-`cast_ints_to_double(int, int)` returns a `double` containing the given integers
-in the high & low 32-bits respectively. `cast_double_to_int` casts the high or
-low 32-bits back into an integer.
-
-`reverse_bits` reverses the bits of a 32-bit unsigned integer, `ffs` returns the
-position of the first least significant set bit in an integer.
-`byte_level_permute` returns a byte-permutation of two input unsigned integers,
-with bytes selected according to a third unsigned integer argument.
-`match_all_over_sub_group` and `match_any_over_sub_group` allow comparison of
-values across work-items within a sub-group.
-
-The function `ternary_logic_op` performs bitwise logical operations on three
-input values `a`, `b` and `c` based on the specified 8-bit truth table `lut`
-and returns the result.
-
-The functions `select_from_sub_group`, `shift_sub_group_left`,
-`shift_sub_group_right` and `permute_sub_group_by_xor` provide equivalent
-functionality to `sycl::select_from_group`, `sycl::shift_group_left`,
-`sycl::shift_group_right` and `sycl::permute_group_by_xor`, respectively.
-However, they provide an optional argument to represent the `logical_group` size
-(default 32).
-
-`int_as_queue_ptr` helps with translation of code by reinterpret casting an
-address to `sycl::queue *`, or returning a pointer to SYCLcompat's default queue
-if the address is <= 2.
-`args_selector` is a helper class for extracting arguments from an array of
-pointers to arguments or buffer of arguments to pass to a kernel function.
-The class allows users to exclude parameters such as `sycl::nd_item`.
-Experimental support for masked versions of `select_from_sub_group`,
-`shift_sub_group_left`, `shift_sub_group_right` and `permute_sub_group_by_xor`
-is provided only for SPIRV or CUDA devices.
-
-As part of the compatibility utilities to facilitate machine translation to
-SYCL, two aliases for errors are provided, `err0` and `err1`.
-
-```c++
-namespace syclcompat {
-
-inline int cast_double_to_int(double d, bool use_high32 = true);
-
-inline double cast_ints_to_double(int high32, int low32);
-
-inline unsigned int byte_level_permute(unsigned int a, unsigned int b,
-                                       unsigned int s);
-
-inline uint32_t ternary_logic_op(uint32_t a, uint32_t b, uint32_t c, uint8_t lut);
-
-template <typename ValueT> inline int ffs(ValueT a);
-
-template <typename T>
-unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask,
-                                      T value);
-
-template <typename T>
-unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask,
-                                      T value, int *pred);
-
-template <typename ValueT>
-ValueT select_from_sub_group(sycl::sub_group g, ValueT x, int remote_local_id,
-                        int logical_sub_group_size = 32);
-
-template <typename ValueT>
-ValueT shift_sub_group_left(sycl::sub_group g, ValueT x, unsigned int delta,
-                       int logical_sub_group_size = 32);
-
-template <typename ValueT>
-ValueT shift_sub_group_right(sycl::sub_group g, ValueT x, unsigned int delta,
-                        int logical_sub_group_size = 32);
-
-template <typename ValueT>
-ValueT permute_sub_group_by_xor(sycl::sub_group g, ValueT x, unsigned int mask,
-                           int logical_sub_group_size = 32);
-
-namespace experimental {
-
-template <typename ValueT>
-ValueT select_from_sub_group(unsigned int member_mask, sycl::sub_group g, ValueT x,
-                             int remote_local_id, int logical_sub_group_size = 32);
-
-template <typename ValueT>
-ValueT shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, ValueT x,
-                            unsigned int delta, int logical_sub_group_size = 32);
-
-template <typename ValueT>
-ValueT shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, ValueT x,
-                             unsigned int delta, int logical_sub_group_size = 32);
-
-template <typename ValueT>
-ValueT permute_sub_group_by_xor(unsigned int member_mask, sycql::sub_group g, ValueT x,
-                                unsigned int mask, int logical_sub_group_size = 32);
-
-} // namespace experimental
-
-inline sycl::queue *int_as_queue_ptr(uintptr_t x);
-
-using err0 = detail::generic_error_type<struct err0_tag, int>;
-using err1 = detail::generic_error_type<struct err1_tag, int>;
-
-template <int n_nondefault_params, int n_default_params, typename T>
-class args_selector;
-
-template <int n_nondefault_params, int n_default_params, typename R,
-          typename... Ts>
-class args_selector<n_nondefault_params, n_default_params, R(Ts...)> {
-public:
-  // Get the type of the ith argument of R(Ts...)
-  template <int i>
-  using arg_type =
-      std::tuple_element_t<account_for_default_params<i>(), std::tuple<Ts...>>;
-
-  // If kernel_params is nonnull, then args_selector will
-  // extract arguments from kernel_params. Otherwise, it
-  // will extract them from extra.
-  args_selector(void **kernel_params, void **extra)
-      : kernel_params(kernel_params), args_buffer(get_args_buffer(extra)) {}
-
-  // Get a reference to the i-th argument extracted from kernel_params
-  // or extra.
-  template <int i> arg_type<i> &get();
-};
-
-} // namespace syclcompat
-```
-
-The function `experimental::nd_range_barrier` synchronizes work items from all
-work groups within a SYCL kernel. This is not officially supported by the SYCL
-spec, and so should be used with caution.
-`experimental::calculate_max_active_wg_per_xecore` and
-`experimental::calculate_max_potential_wg` are used for occupancy calculation.
-There is also an `experimental::logical_group` class which allows
-`sycl::sub_group`s to be further subdivided into 'logical' groups to perform
-sub-group level operations. This class provides methods to get the local & group
-id and range. `experimental::group_type`, `experimental::group` and
-`experimental::group_base` are helper classes to manage the supported group
-types.
-
-```c++
-namespace syclcompat {
-namespace experimental {
-
-#if defined(__AMDGPU__) || defined(__NVPTX__)
-// seq_cst currently not working for AMD nor Nvidia
-constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::acq_rel;
-#else
-constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::seq_cst;
-#endif
-
-template <int dimensions = 3>
-inline void nd_range_barrier(
-    sycl::nd_item<dimensions> item,
-    sycl::atomic_ref<unsigned int, barrier_memory_order,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &counter);
-
-template <>
-inline void nd_range_barrier(
-    sycl::nd_item<1> item,
-    sycl::atomic_ref<unsigned int, barrier_memory_order,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &counter);
-
-template <int dimensions = 3> class logical_group {
-public:
-  logical_group(sycl::nd_item<dimensions> item, sycl::group<dimensions> parent_group,
-                uint32_t size);
-  uint32_t get_local_linear_id() const;
-  uint32_t get_group_linear_id() const;
-  uint32_t get_local_linear_range() const;
-  uint32_t get_group_linear_range() const;
-};
-
-inline int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size,
-                                              int slm_size = 0,
-                                              int sg_size = 32,
-                                              bool used_barrier = false,
-                                              bool used_large_grf = false);
-
-inline int calculate_max_potential_wg(int *num_wg, int *wg_size,
-                                      int max_wg_size_for_device_code,
-                                      int slm_size = 0, int sg_size = 32,
-                                      bool used_barrier = false,
-                                      bool used_large_grf = false);
-// Supported group types
-enum class group_type { work_group, sub_group, logical_group, root_group };
-
-// The group_base will dispatch the function call to the specific interface
-// based on the group type.
-template <int dimensions = 3> class group_base {
-public:
-  group_base(sycl::nd_item<dimensions> item);
-
-  // Returns the number of work-items in the group.
-  size_t get_local_linear_range();
-  // Returns the index of the work-item within the group.
-  size_t get_local_linear_id();
-
-  // Wait for all the elements within the group to complete their execution
-  // before proceeding.
-  void barrier();
-};
-
-// Container type that can store supported group_types.
-template <typename GroupT, int dimensions = 3>
-class group : public group_base<dimensions> {
-public:
-  group(GroupT g, sycl::nd_item<dimensions> item);
-};
-
-} // namespace experimental
-} // namespace syclcompat
-```
-
-SYCLcompat provides a wrapper API `max_active_work_groups_per_cu` providing
-'work-groups per compute unit' semantics. It is templated on the kernel
-functor, and takes a work-group size represented by either `sycl::range<Dim>`
-or `syclcompat::dim3`, the local memory size in bytes, and an optional queue.
-The function returns the maximum number of work-groups which can be executed
-per compute unit. May return *zero* even when below resource limits (i.e.
-returning `0` does not imply the kernel cannot execute).
-```cpp
-namespace syclcompat{
-template <class KernelName>
-size_t max_active_work_groups_per_cu(
-    syclcompat::dim3 wg_dim3, size_t local_mem_size,
-    sycl::queue queue = syclcompat::get_default_queue());
-
-template <class KernelName, int RangeDim>
-size_t max_active_work_groups_per_cu(
-    sycl::range<RangeDim> wg_range, size_t local_mem_size,
-    sycl::queue queue = syclcompat::get_default_queue());
-}
-```
-
-To assist machine translation, helper aliases are provided for inlining and
-alignment attributes. The class template declarations `syclcompat_kernel_name`
-and `syclcompat_kernel_scalar` are used to assist automatic generation of
-kernel names during machine translation.
-
-`get_sycl_language_version` returns an integer representing the version of the
-SYCL spec supported by the current SYCL compiler.
-
-The `SYCLCOMPAT_CHECK_ERROR` macro encapsulates an error-handling mechanism for
-expressions that might throw `sycl::exception` and `std::runtime_error`. If no
-exceptions are thrown, it returns `syclcompat::error_code::success`. If a
-`sycl::exception` is caught, it returns `syclcompat::error_code::backend_error`.
-If a `std::runtime_error` exception is caught,
-`syclcompat::error_code::default_error` is returned instead. For both cases, it
-prints the error message to the standard error stream.
-
-`get_error_string_dummy` is a dummy function introduced to assist auto
-migration. The SYCLomatic user should replace it with a real error-handling 
-function. SYCL reports errors using exceptions and does not use error codes.
-
-``` c++
-namespace syclcompat {
-
-template <class... Args> class syclcompat_kernel_name;
-template <int Arg> class syclcompat_kernel_scalar;
-
-#if defined(_MSC_VER)
-#define __syclcompat_align__(n) __declspec(align(n))
-#define __syclcompat_inline__ __forceinline
-#else
-#define __syclcompat_align__(n) __attribute__((aligned(n)))
-#define __syclcompat_inline__ __inline__ __attribute__((always_inline))
-#endif
-
-#if defined(_MSC_VER)
-#define __syclcompat_noinline__ __declspec(noinline)
-#else
-#define __syclcompat_noinline__ __attribute__((noinline))
-#endif
-
-#define SYCLCOMPAT_COMPATIBILITY_TEMP (900)
-
-#ifdef _WIN32
-#define SYCLCOMPAT_EXPORT __declspec(dllexport)
-#else
-#define SYCLCOMPAT_EXPORT
-#endif
-
-
-namespace syclcompat {
-enum error_code { success = 0, backend_error = 1, default_error = 999 };
-inline const char *get_error_string_dummy(int ec);
-}
-
-#define SYCLCOMPAT_CHECK_ERROR(expr)
-
-int get_sycl_language_version();
-
-} // namespace syclcompat
-```
-
-### Kernel Helper Functions
-
-Kernel helper functions provide a structure `kernel_function_info` to keep SYCL
-kernel information, and provide a utility function `get_kernel_function_info()`
-to get the kernel information. Overloads are provided to allow either returning
-a `kernel_function_info` object, or to return by pointer argument. In the
-current version, `kernel_function_info` describes only maximum work-group size.
-
-SYCLcompat also provides the `kernel_library` and `kernel_function` classes.
-`kernel_library` facilitates the loading and unloading of kernel libraries.
-`kernel_function` represents a specific kernel function within a loaded library
-and can be invoked with specified arguments.
-`load_kernel_library`, `load_kernel_library_mem`, and `unload_kernel_library`
-are free functions to handle the loading and unloading of `kernel_library`
-objects. `get_kernel_function` and `invoke_kernel_function` offer similar
-functionality for `kernel_function` objects.
-
-``` c++
-namespace syclcompat {
-
-struct kernel_function_info {
-  int max_work_group_size = 0;
-};
-
-static void get_kernel_function_info(kernel_function_info *kernel_info,
-                                     const void *function);
-static kernel_function_info get_kernel_function_info(const void *function);
-
-class kernel_library {
-  constexpr kernel_library();
-  constexpr kernel_library(void *ptr);
-  operator void *() const;
-};
-
-static kernel_library load_kernel_library(const std::string &name);
-static kernel_library load_kernel_library_mem(char const *const image);
-static void unload_kernel_library(const kernel_library &library);
-
-class kernel_function {
-    constexpr kernel_function();
-    constexpr kernel_function(kernel_functor ptr);
-    operator void *() const;
-    void operator()(sycl::queue &q, const sycl::nd_range<3> &range,
-                    unsigned int local_mem_size, void **args, void **extra);
-};
-
-static kernel_function get_kernel_function(kernel_library &library,
-                                           const std::string &name);
-static void invoke_kernel_function(kernel_function &function,
-                                   sycl::queue &queue,
-                                   sycl::range<3> group_range,
-                                   sycl::range<3> local_range,
-                                   unsigned int local_mem_size,
-                                   void **kernel_params, void **extra);
-
-} // namespace syclcompat
-```
-
-### Math Functions
-
-The `funnelshift_*` APIs perform a concatenate-shift operation on two 32-bit
-values, and return a 32-bit result. The two unsigned integer arguments (`low`
-and `high`) are concatenated to a 64-bit value which is then shifted left or
-right by `shift` bits. The functions then return either the least- or
-most-significant 32 bits. The `_l*` variants shift *left* and return the *most*
-significant 32 bits, while the `_r*` variants shift *right* and return the
-*least* significant 32 bits. The `_l`/`_r` APIs differ from the `_lc`/`_rc` APIs
-in how they clamp the `shift` argument: `funnelshift_l` and `funnelshift_r`
-shift the result by `shift & 31` bits, whereas `funnelshift_lc` and
-`funnelshift_rc` shift the result by `min(shift, 32)` bits.
-
-`syclcompat::fast_length` provides a wrapper to SYCL's
-`fast_length(sycl::vec<float,N>)` that accepts arguments for a C++ array and a
-length. `syclcompat::length` provides a templated version that wraps over
-`sycl::length`. There are wrappers for `clamp`, `isnan`, `cbrt`, `min`, `max`,
-`fmax_nan`, `fmin_nan`, and `pow`. An implementation of `relu` saturation is
-also provided.
-
-`compare`, `unordered_compare`, `compare_both`, `unordered_compare_both`,
-`compare_mask`, and `unordered_compare_mask`, handle both ordered and unordered
-comparisons.
-
-`vectorized_max` and `vectorized_min` are binary operations returning the
-max/min of two arguments, where each argument is treated as a `sycl::vec` type.
-`vectorized_isgreater` performs elementwise `isgreater`, treating each argument
-as a vector of elements, and returning `0` for vector components for which
-`isgreater` is false, and `-1` when true.
-`vectorized_sum_abs_diff` calculates the absolute difference for two values
-without modulo overflow for vector types.
-
-The functions `cmul`, `cdiv`, `cabs`, `cmul_add`, and `conj` define complex math
-operations which accept `sycl::vec<T,2>` arguments representing complex values.
-
-The `dp4a` function returns the 4-way 8-bit dot product accumulate for unsigned
-and signed 32-bit integer values. The `dp2a_lo` and `dp2a_hi` functions return the
-two-way 16-bit to 8-bit dot product using the second and first 16 bits of the
-second operand, respectively. These three APIs return a single 32-bit value with
-the accumulated result, which is unsigned if both operands are `uint32_t` and
-signed otherwise.
-
-Various maths functions are defined to operate on any floating point type.
-`syclcompat::is_floating_point_v` extends the standard library's
-`std::is_floating_point_v` to include `sycl::half` and, where available,
-`sycl::ext::oneapi::bfloat16`. The current version of SYCLcompat also provides
-a specialization of `std::common_type_t` for `sycl::ext::oneapi::bfloat16`,
-though this will be moved to the `sycl_ext_oneapi_bfloat16` extension in
-future.
-
-```cpp
-namespace std {
-template <> struct common_type<sycl::ext::oneapi::bfloat16> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-
-template <>
-struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-
-template <typename T> struct common_type<sycl::ext::oneapi::bfloat16, T> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-
-template <typename T> struct common_type<T, sycl::ext::oneapi::bfloat16> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-} // namespace std
-```
-
-```cpp
-namespace syclcompat{
-
-// Trait for extended floating point definition
-template <typename T>
-struct is_floating_point : std::is_floating_point<T>{};
-
-template <> struct is_floating_point<sycl::half> : std::true_type {};
-
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-template <> struct is_floating_point<sycl::ext::oneapi::bfloat16> : std::true_type {};
-#endif
-template <typename T>
-
-inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
-
-inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
-                                  unsigned int shift); 
-
-inline unsigned int funnelshift_lc(unsigned int low, unsigned int high,
-                                   unsigned int shift); 
-
-inline unsigned int funnelshift_r(unsigned int low, unsigned int high,
-                                  unsigned int shift);
-
-inline unsigned int funnelshift_rc(unsigned int low, unsigned int high,
-                                   unsigned int shift);
-
-inline float fast_length(const float *a, int len);
-
-template <typename ValueT>
-inline ValueT length(const ValueT *a, const int len);
-
-inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val);
-
-// Determine whether 2 element value is NaN.
-template <typename ValueT>
-inline std::enable_if_t<ValueT::size() == 2, ValueT> isnan(const ValueT a);
-
-// cbrt function wrapper.
-template <typename ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<ValueT, sycl::half>,
-                        ValueT>
-cbrt(ValueT val);
-
-// For floating-point types, `float` or `double` arguments are acceptable.
-// For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
-// `std::int64_t` type arguments are acceptable.
-// sycl::half supported as well.
-template <typename T1, typename T2>
-std::enable_if_t<std::is_integral_v<T1> && std::is_integral_v<T2>,
-                 std::common_type_t<T1, T2>>
-min(T1 a, T2 b);
-template <typename T1, typename T2>
-std::enable_if_t<std::is_floating_point_v<T1> && std::is_floating_point_v<T2>,
-                 std::common_type_t<T1, T2>>
-min(T1 a, T2 b);
-
-sycl::half min(sycl::half a, sycl::half b);
-
-template <typename T1, typename T2>
-std::enable_if_t<std::is_integral_v<T1> && std::is_integral_v<T2>,
-                 std::common_type_t<T1, T2>>
-max(T1 a, T2 b);
-template <typename T1, typename T2>
-std::enable_if_t<std::is_floating_point_v<T1> && std::is_floating_point_v<T2>,
-                 std::common_type_t<T1, T2>>
-max(T1 a, T2 b);
-
-sycl::half max(sycl::half a, sycl::half b);
-
-// Performs 2 elements comparison and returns the bigger one. If either of
-// inputs is NaN, then return NaN.
-template <typename ValueT, typename ValueU>
-inline std::common_type_t<ValueT, ValueU> fmax_nan(const ValueT a,
-                                                   const ValueU b);
-
-template <typename ValueT, typename ValueU>
-inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
-fmax_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b);
-
-template <typename ValueT, typename ValueU>
-inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
-fmax_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b);
-
-// Performs 2 elements comparison and returns the smaller one. If either of
-// inputs is NaN, then return NaN.
-template <typename ValueT, typename ValueU>
-inline std::common_type_t<ValueT, ValueU> fmin_nan(const ValueT a,
-                                                   const ValueU b);
-template <typename ValueT, typename ValueU>
-inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
-fmin_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b);
-
-template <typename ValueT, typename ValueU>
-inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
-fmin_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b);
-
-inline float pow(const float a, const int b) { return sycl::pown(a, b); }
-inline double pow(const double a, const int b) { return sycl::pown(a, b); }
-
-template <typename ValueT, typename ValueU>
-inline typename std::enable_if_t<std::is_floating_point_v<ValueT>, ValueT>
-pow(const ValueT a, const ValueU b);
-
-// Requires aspect::fp64, as it casts to double internally.
-template <typename ValueT, typename ValueU>
-inline typename std::enable_if_t<!std::is_floating_point_v<ValueT>, double>
-pow(const ValueT a, const ValueU b);
-
-template <typename ValueT> inline ValueT relu(const ValueT a);
-
-template <class ValueT, int NumElements>
-inline sycl::vec<ValueT, NumElements>
-relu(const sycl::vec<ValueT, NumElements> a);
-
-template <class ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
-                        sycl::marray<ValueT, 2>>
-relu(const sycl::marray<ValueT, 2> a);
-
-// The following definition is enabled when BinaryOperation(ValueT, ValueT) returns bool
-// std::enable_if_t<std::is_same_v<std::invoke_result_t<BinaryOperation, ValueT, ValueT>, bool>, bool>
-template <typename ValueT, class BinaryOperation>
-inline bool 
-compare(const ValueT a, const ValueT b, const BinaryOperation binary_op);
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, ValueT>
-compare(const ValueT a, const ValueT b, const BinaryOperation binary_op);
-
-// The following definition is enabled when BinaryOperation(ValueT, ValueT) returns bool
-// std::enable_if_t<std::is_same_v<std::invoke_result_t<BinaryOperation, ValueT, ValueT>, bool>, bool>
-template <typename ValueT, class BinaryOperation>
-inline bool
-unordered_compare(const ValueT a, const ValueT b,
-                  const BinaryOperation binary_op);
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, ValueT>
-unordered_compare(const ValueT a, const ValueT b,
-                  const BinaryOperation binary_op);
-
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, bool>
-compare_both(const ValueT a, const ValueT b, const BinaryOperation binary_op);
-template <typename ValueT, class BinaryOperation>
-
-inline std::enable_if_t<ValueT::size() == 2, bool>
-unordered_compare_both(const ValueT a, const ValueT b,
-                       const BinaryOperation binary_op);
-
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, unsigned>
-compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op);
-
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, unsigned>
-unordered_compare_mask(const ValueT a, const ValueT b,
-                       const BinaryOperation binary_op);
-
-template <typename S, typename T> inline T vectorized_max(T a, T b);
-
-template <typename S, typename T> inline T vectorized_min(T a, T b);
-
-template <typename S, typename T> inline T vectorized_isgreater(T a, T b);
-
-template <>
-inline unsigned vectorized_isgreater<sycl::ushort2, unsigned>(unsigned a,
-                                                              unsigned b);
-
-template <typename VecT>
-inline unsigned vectorized_sum_abs_diff(unsigned a, unsigned b);
-
-template <typename T>
-sycl::vec<T, 2> cmul(sycl::vec<T, 2> x, sycl::vec<T, 2> y);
-
-template <typename T>
-sycl::vec<T, 2> cdiv(sycl::vec<T, 2> x, sycl::vec<T, 2> y);
-
-template <typename T> T cabs(sycl::vec<T, 2> x);
-
-template <typename ValueT>
-inline sycl::vec<ValueT, 2> cmul_add(const sycl::vec<ValueT, 2> a,
-                                     const sycl::vec<ValueT, 2> b,
-                                     const sycl::vec<ValueT, 2> c);
-
-template <typename ValueT>
-inline sycl::marray<ValueT, 2> cmul_add(const sycl::marray<ValueT, 2> a,
-                                        const sycl::marray<ValueT, 2> b,
-                                        const sycl::marray<ValueT, 2> c);
-
-template <typename T> sycl::vec<T, 2> conj(sycl::vec<T, 2> x);
-
-template <typename ValueT> inline ValueT reverse_bits(ValueT a);
-
-
-template <typename T1, typename T2>
-using dot_product_acc_t =
-    std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-                       uint32_t, int32_t>;
-
-template <typename T1, typename T2>
-inline dot_product_acc_t<T1, T2> dp2a_lo(T1 a, T2 b,
-                                         dot_product_acc_t<T1, T2> c);
-
-template <typename T1, typename T2>
-inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
-                                         dot_product_acc_t<T1, T2> c);
-
-template <typename T1, typename T2>
-inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b,
-                                      dot_product_acc_t<T1, T2> c);
-} // namespace syclcompat
-```
-
-`vectorized_binary` computes the `BinaryOperation` for two operands,
-with each value treated as a vector type. `vectorized_unary` offers the same
-interface for operations with a single operand. `vectorized_ternary` offers the
-interface for three operands with two `BinaryOperation`.
-The implemented `BinaryOperation`s are `abs_diff`, `add_sat`, `rhadd`, `hadd`,
-`maximum`, `minimum`, and `sub_sat`.
-`vectorized_with_pred` applies the `BinaryOperation` to two operands, and
-also provides the predicates of the high/low halfword operations.
-
-```cpp
-namespace syclcompat {
-  
-template <typename VecT, class UnaryOperation>
-inline unsigned vectorized_unary(unsigned a, const UnaryOperation unary_op);
-
-// A sycl::abs wrapper functor.
-struct abs {
-  template <typename ValueT> auto operator()(const ValueT x) const;
-};
-
-template <typename VecT, class BinaryOperation>
-inline unsigned vectorized_binary(unsigned a, unsigned b,
-                                  const BinaryOperation binary_op,
-                                  bool need_relu = false);
-
-template <typename VecT, typename BinaryOperation1, typename BinaryOperation2>
-inline unsigned vectorized_ternary(unsigned a, unsigned b, unsigned c,
-                                   const BinaryOperation1 binary_op1,
-                                   const BinaryOperation2 binary_op2,
-                                   bool need_relu = false);
-
-template <typename ValueT, typename BinaryOperation>
-inline unsigned vectorized_with_pred(unsigned a, unsigned b,
-                                     const BinaryOperation binary_op,
-                                     bool *pred_hi, bool *pred_lo);
-
-// A sycl::abs_diff wrapper functor.
-struct abs_diff {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const;
-};
-// A sycl::add_sat wrapper functor.
-struct add_sat {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const;
-};
-// A sycl::rhadd wrapper functor.
-struct rhadd {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const;
-};
-// A sycl::hadd wrapper functor.
-struct hadd {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const;
-};
-// A sycl::max wrapper functor.
-struct maximum {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const;
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y, bool *pred) const;
-};
-// A sycl::min wrapper functor.
-struct minimum {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const;
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y, bool *pred) const;
-};
-// A sycl::sub_sat wrapper functor.
-struct sub_sat {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const;
-};
-
-} // namespace syclcompat
-```
-
-`vectorized_binary` also supports comparison operators from the standard library (`std::equal_to`, `std::not_equal_to`, etc) 
-and the semantics can be modified by changing the comparison operator template instantiation. For example:
-
-```cpp
-unsigned int Input1;
-unsigned int Input2;
-// initialize inputs...
-
-// Performs comparison on sycl::ushort2, following sycl::vec semantics
-// Returns unsigned int containing, per vector element, 0xFFFF if true, and 0x0000 if false
-syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::equal_to<>());
-
-// Performs element-wise comparison on unsigned short
-// Returns unsigned int containing, per vector element, 1 if true, and 0 if false
-syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::equal_to<unsigned short>());
-```
-
-The math header provides a set of functions to extend 32-bit operations
-to 33 bit, and handle sign extension internally. There is support for `add`,
-`sub`, `absdiff`, `min` and `max` operations. Each operation provides overloads
-to include a second, separate, `BinaryOperation` after the first, and a `_sat`
-variation which determines whether the returned value is saturated or not.
-
-```cpp
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_add(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_add(AT a, BT b, CT c, BinaryOperation second_op);
-
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_add_sat(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_add_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op);
-
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_sub(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_sub(AT a, BT b, CT c, BinaryOperation second_op);
-
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_sub_sat(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_sub_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op);
-
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_absdiff(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_absdiff(AT a, BT b, CT c,
-                                     BinaryOperation second_op);
-
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_absdiff_sat(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_absdiff_sat(AT a, BT b, CT c,
-                                         BinaryOperation second_op);
-
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_min(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_min(AT a, BT b, CT c, BinaryOperation second_op);
-
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_min_sat(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_min_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op);
-
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_max(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_max(AT a, BT b, CT c, BinaryOperation second_op);
-
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_max_sat(AT a, BT b);
-
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_max_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op);
-```
-
-Another set of vectorized extend 32-bit operations is provided in the math
-header. These APIs treat each of the 32-bit operands as a 2-element vector
-(16 bits each) while handling sign extension to 17 bits internally. There is
-support for `add`, `sub`, `absdiff`, `min`, `max` and `avg` binary operations.
-Each operation has a `_sat` variant which determines if the returning
-value is saturated or not, and an `_add` variant that computes the binary sum
-of the initial operation outputs and a third operand.
-
-```cpp
-/// Compute vectorized addition of \p a and \p b, with each value treated as a
-/// 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd2(AT a, BT b, RetT c);
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized addition of the two
-/// values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c);
-
-/// Compute vectorized addition of \p a and \p b with saturation, with each
-/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub2(AT a, BT b, RetT c);
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 2 elements vector type and extend each element to 17 bit. Then add each
-/// half of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized subtraction of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c);
-
-/// Compute vectorized subtraction of \p a and \p b with saturation, with each
-/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c);
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized abs_diff of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c);
-
-/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
-/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin2(AT a, BT b, RetT c);
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized minimum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c);
-
-/// Compute vectorized minimum of \p a and \p b with saturation, with each value
-/// treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax2(AT a, BT b, RetT c);
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized maximum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c);
-
-/// Compute vectorized maximum of \p a and \p b with saturation, with each value
-/// treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized average of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized average of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c);
-
-/// Compute vectorized average of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized average of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c);
-
-/// Compute vectorized average of \p a and \p b with saturation, with each value
-/// treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized average of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c);
-```
-
-Similarly, a set of vectorized extend 32-bit operations is provided in the math 
-header treating each of the 32-bit operands as 4-elements vector (8-bits each) 
-while handling sign extension to 9-bits internally. There is support for `add`,
-`sub`, `absdiff`, `min`, `max` and `avg` binary operations. 
-Each operation has a `_sat` variant which determines if the returning
-value is saturated or not, and a `_add` variant that computes the binary sum
-of the initial operation outputs and a third operand.
-
-```cpp
-/// Compute vectorized addition of \p a and \p b, with each value treated as a
-/// 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd4(AT a, BT b, RetT c);
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized addition of the two
-/// values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c);
-
-/// Compute vectorized addition of \p a and \p b with saturation, with each
-/// value treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub4(AT a, BT b, RetT c);
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 4 elements vector type and extend each element to 9 bit. Then add each
-/// half of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized subtraction of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c);
-
-/// Compute vectorized subtraction of \p a and \p b with saturation, with each
-/// value treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c);
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized abs_diff of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c);
-
-/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
-/// value treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin4(AT a, BT b, RetT c);
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized minimum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c);
-
-/// Compute vectorized minimum of \p a and \p b with saturation, with each value
-/// treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax4(AT a, BT b, RetT c);
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized maximum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c);
-
-/// Compute vectorized maximum of \p a and \p b with saturation, with each value
-/// treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c);
-
-/// Compute vectorized average of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized average of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg4(AT a, BT b, RetT c);
-
-/// Compute vectorized average of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized average of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg4_add(AT a, BT b, RetT c);
-
-/// Compute vectorized average of \p a and \p b with saturation, with each value
-/// treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized average of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg4_sat(AT a, BT b, RetT c);
-```
-
-Vectorized comparison APIs also provided in the math header behave similarly 
-and support a `std` comparison operator parameter which can be `greater`, 
-`less`, `greater_equal`, `less_equal`, `equal_to` or `not_equal_to`. These APIs 
-cover both the 2-elements *(16-bits each)* and 4-elements *(8-bits each)* 
-variants, as well as an additional `_add` variant that computes the sum of the 
-2/4 output elements.
-
-```cpp
-/// Extend \p a and \p b to 33 bit and vectorized compare input values using
-/// specified comparison \p cmp .
-///
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the compare operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] cmp The comparison operator
-/// \returns The comparison result of the two extended values.
-template <typename AT, typename BT, typename BinaryOperation>
-inline constexpr unsigned extend_vcompare2(AT a, BT b, BinaryOperation cmp);
-
-/// Extend Inputs to 33 bit, and vectorized compare input values using specified
-/// comparison \p cmp , then add the result with \p c .
-///
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the compare operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] cmp The comparison operator
-/// \returns The comparison result of the two extended values, and add the
-/// result with \p c .
-template <typename AT, typename BT, typename BinaryOperation>
-inline constexpr unsigned extend_vcompare2_add(AT a, BT b, unsigned c,
-                                               BinaryOperation cmp);
-
-/// Extend \p a and \p b to 33 bit and vectorized compare input values using
-/// specified comparison \p cmp .
-///
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the compare operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] cmp The comparison operator
-/// \returns The comparison result of the two extended values.
-template <typename AT, typename BT, typename BinaryOperation>
-inline constexpr unsigned extend_vcompare4(AT a, BT b, BinaryOperation cmp);
-
-/// Extend Inputs to 33 bit, and vectorized compare input values using specified
-/// comparison \p cmp , then add the result with \p c .
-///
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the compare operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] cmp The comparison operator
-/// \returns The comparison result of the two extended values, and add the
-/// result with \p c .
-template <typename AT, typename BT, typename BinaryOperation>
-inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c,
-                                               BinaryOperation cmp);
-```
-
-The math header file provides APIs for bit-field insertion (`bfi_safe`) and
-bit-field extraction (`bfe_safe`). These are bounds-checked variants of
-underlying `detail` APIs (`detail::bfi`, `detail::bfe`) which, in future
-releases, will be exposed to the user.
-
-```c++
-
-/// Bitfield-insert with boundary checking.
-///
-/// Align and insert a bit field from \param x into \param y . Source \param
-/// bit_start gives the starting bit position for the insertion, and source
-/// \param num_bits gives the bit field length in bits.
-///
-/// \tparam T The type of \param x and \param y , must be an unsigned integer.
-/// \param x The source of the bitfield.
-/// \param y The source where bitfield is inserted.
-/// \param bit_start The position to start insertion.
-/// \param num_bits The number of bits to insert.
-template <typename T>
-inline T bfi_safe(const T x, const T y, const uint32_t bit_start,
-                  const uint32_t num_bits);
-
-/// Bitfield-extract with boundary checking.
-///
-/// Extract bit field from \param source and return the zero or sign-extended
-/// result. Source \param bit_start gives the bit field starting bit position,
-/// and source \param num_bits gives the bit field length in bits.
-///
-/// The result is padded with the sign bit of the extracted field. If `num_bits`
-/// is zero, the result is zero. If the start position is beyond the msb of the
-/// input, the result is filled with the replicated sign bit of the extracted
-/// field.
-///
-/// \tparam T The type of \param source value, must be an integer.
-/// \param source The source value to extract from.
-/// \param bit_start The position to start extracting.
-/// \param num_bits The number of bits to extract.
-template <typename T>
-inline T bfe_safe(const T source, const uint32_t bit_start,
-                  const uint32_t num_bits);
-```
-
-### Group Utilities
-
-Group utility functions and classes optimize data movement,
-processing, and communication within work-groups. The `exchange` class
-facilitates rearranging data between blocked and striped layouts, improving
-memory access patterns. The `group_radix_sort` class implements an efficient
-radix sort for distributed data, supporting both ascending and descending
-order. The `group_load` and `group_store` classes manage structured data
-movement between linear memory and work-group arrangements, supporting
-blocked and striped formats with optional range-guarding. The `group_shuffle`
-class enables efficient inter-work-item communication through selective data
-exchanges, shifting operations, and group-wide data movement. These utilities
-collectively enhance parallel performance by improving memory efficiency,
-load balancing, and computational throughput.
-
-``` c++
-namespace syclcompat {
-/// Rearranging data partitioned across a work-group.
-///
-/// \tparam T The type of the data elements.
-/// \tparam ElementsPerWorkItem The number of data elements assigned to a
-/// work-item.
-template <typename T, size_t ElementsPerWorkItem> class exchange {
-public:
-  exchange(uint8_t *local_memory);
-
-  static size_t get_local_memory_size(size_t group_threads);
-
-  /// Inplace rearrange elements from blocked order to striped order.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// blocked \p input across the work-group is:
-  ///
-  ///   {[0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511]}.
-  ///
-  /// The striped order output is:
-  ///
-  ///   {[0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511]}.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem]);
-
-  /// Inplace rearrange elements from striped order to blocked order.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// striped \p input across the work-group is:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// The blocked order output is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem]);
-
-  /// Rearrange elements from blocked order to striped order.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// blocked \p input across the work-group is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// The striped order output is:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param output The corresponding output data of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem],
-                     T (&output)[ElementsPerWorkItem]);
-
-  /// Rearrange elements from striped order to blocked order.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// striped \p input across the work-group is:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// The blocked order output is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param output The corresponding output data of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem],
-                     T (&output)[ElementsPerWorkItem]);
-
-  /// Inplace exchanges data items annotated by rank into blocked arrangement.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// striped \p input across the work-group is:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// The rank across the work-group is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// The blocked order output is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param ranks The corresponding rank annotation of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  scatter_to_blocked(Item item, T (&input)[ElementsPerWorkItem],
-                     int (&ranks)[ElementsPerWorkItem]);
-
-  /// Inplace exchanges data items annotated by rank into striped arrangement.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// blocked \p input across the work-group is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// The rank across the work-group is:
-  ///
-  ///   { [16, 20, 24, 28], [32, 36, 40, 44], ..., [499, 503, 507, 511] }.
-  ///
-  /// The striped order output of each work-item will be:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param ranks The corresponding rank annotation of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  scatter_to_striped(Item item, T (&input)[ElementsPerWorkItem],
-                     int (&ranks)[ElementsPerWorkItem]);
-};
-
-/// The work-group wide radix sort to sort integer data elements
-/// assigned to all work-items in the work-group.
-///
-/// \tparam T The type of the data elements.
-/// \tparam ElementsPerWorkItem The number of data elements assigned to
-/// a work-item.
-/// \tparam RADIX_BITS The number of radix bits per digit place.
-template <typename T, int ElementsPerWorkItem, int RADIX_BITS = 4>
-class group_radix_sort {
-public:
-  group_radix_sort(uint8_t *local_memory);
-
-  static size_t get_local_memory_size(size_t group_threads);
-
-  /// Performs an ascending work-group wide radix sort over a blocked
-  /// arrangement of input elements.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }.
-  ///
-  /// The ascending order output is:
-  ///
-  ///   { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param begin_bit The beginning (least-significant) bit index needed for
-  /// key comparison.
-  /// \param end_bit The past-the-end (most-significant) bit
-  /// index needed for key comparison.
-  template <typename Item>
-  __syclcompat_inline__ void
-  sort(const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0,
-       int end_bit = 8 * sizeof(T));
-
-  /// Performs a descending work-group wide radix sort over a blocked
-  /// arrangement of input elements.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }.
-  ///
-  /// The descending order output is:
-  ///
-  ///   { [511,510,509,508], [507,506,505,504], ..., [3,2,1,0] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param begin_bit The beginning (least-significant) bit index needed for
-  /// key comparison.
-  /// \param end_bit The past-the-end (most-significant) bit
-  /// index needed for key comparison.
-  template <typename Item>
-  __syclcompat_inline__ void
-  sort_descending(const Item &item, T (&input)[ElementsPerWorkItem],
-                  int begin_bit = 0, int end_bit = 8 * sizeof(T));
-
-  /// Performs an ascending radix sort across a blocked arrangement of input
-  /// elements, leaving them in a striped arrangement.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }.
-  ///
-  /// The corresponding output of each work-item will be:
-  ///
-  ///   { [0,128,256,384], [1,129,257,385], [2,130,258,386], ...,
-  ///   [127,255,383,511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param begin_bit The beginning (least-significant) bit index needed for
-  /// key comparison.
-  /// \param end_bit The past-the-end (most-significant) bit
-  /// index needed for key comparison.
-  template <typename Item>
-  __syclcompat_inline__ void
-  sort_blocked_to_striped(const Item &item, T (&input)[ElementsPerWorkItem],
-                          int begin_bit = 0, int end_bit = 8 * sizeof(T));
-
-  /// Performs a descending radix sort across a blocked arrangement of input
-  /// elements, leaving them in a striped arrangement.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }.
-  ///
-  /// The descending striped order output is:
-  ///
-  ///   { [511,383,255,127], [510,382,254,126], [509,381,253,125], ...,
-  ///   [384,256,128,0] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param begin_bit The beginning (least-significant) bit index needed for
-  /// key comparison.
-  /// \param end_bit The past-the-end (most-significant) bit
-  /// index needed for key comparison.
-  template <typename Item>
-  __syclcompat_inline__ void sort_descending_blocked_to_striped(
-      const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0,
-      int end_bit = 8 * sizeof(T));
-};
-
-/// Load linear segment items into block format across threads
-/// Helper for Block Load
-enum load_algorithm {
-  BLOCK_LOAD_DIRECT,
-  BLOCK_LOAD_STRIPED,
-};
-
-/// Load a linear segment of elements into a blocked arrangement across the
-/// work-group.
-///
-/// \tparam T The data type to load.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam InputIteratorT The random-access input iterator type.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param input_iter The work-group's base input iterator for loading from.
-/// \param data Data to load.
-template <typename T, size_t ElementsPerWorkItem, typename InputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void load_direct_blocked(const ItemT &item,
-                                               InputIteratorT input_iter,
-                                               T (&data)[ElementsPerWorkItem]);
-
-/// Load a linear segment of elements into a striped arrangement across the
-/// work-group.
-///
-/// \tparam T The data type to load.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam InputIteratorT The random-access input iterator type.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param input_iter The work-group's base input iterator for loading from.
-/// \param data Data to load.
-template <typename T, int ElementsPerWorkItem, typename InputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void load_direct_striped(const ItemT &item,
-                                               InputIteratorT input_iter,
-                                               T (&data)[ElementsPerWorkItem]);
-
-/// Load a linear segment of elements into a blocked arrangement across the
-/// work-group, guarded by range.
-///
-/// \tparam T The data type to load.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam InputIteratorT The random-access input iterator type.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param input_iter The work-group's base input iterator for loading from.
-/// \param data Data to load.
-/// \param valid_items Number of valid items to load
-template <typename T, size_t ElementsPerWorkItem, typename InputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-load_direct_blocked(const ItemT &item, InputIteratorT input_iter,
-                    T (&data)[ElementsPerWorkItem], int valid_items);
-
-/// Load a linear segment of elements into a striped arrangement across the
-/// work-group, guarded by range.
-///
-/// \tparam T The data type to load.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam InputIteratorT The random-access input iterator type.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param input_iter The work-group's base input iterator for loading from.
-/// \param data Data to load.
-/// \param valid_items Number of valid items to load
-template <typename T, int ElementsPerWorkItem, typename InputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-load_direct_striped(const ItemT &item, InputIteratorT input_iter,
-                    T (&data)[ElementsPerWorkItem], int valid_items);
-
-/// Store a blocked arrangement of items across a work-group into a linear
-/// segment of items.
-///
-/// \tparam T The data type to store.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam OutputIteratorT The random-access iterator type used for
-/// output.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param output_iter The work-group's base output iterator for writing.
-/// \param data Data to store.
-template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-store_direct_blocked(const ItemT &item, OutputIteratorT output_iter,
-                     T (&data)[ElementsPerWorkItem]);
-
-/// Store a striped arrangement of items across a work-group into a linear
-/// segment of items.
-///
-/// \tparam T The data type to store.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam OutputIteratorT The random-access iterator type used for
-/// output.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param output_iter The work-group's base output iterator for writing.
-/// \param data Data to store.
-template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-store_direct_striped(const ItemT &item, OutputIteratorT output_iter,
-                     T (&data)[ElementsPerWorkItem]);
-
-/// Store a blocked arrangement of items across a work-group into a linear
-/// segment of items, guarded by range.
-///
-/// \tparam T The data type to store.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam OutputIteratorT The random-access iterator type used for
-/// output.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param output_iter The work-group's base output iterator for writing.
-/// \param data Data to store.
-/// \param valid_items Number of valid items to store
-template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-store_direct_blocked(const ItemT &item, OutputIteratorT output_iter,
-                     T (&data)[ElementsPerWorkItem], size_t valid_items);
-
-/// Store a striped arrangement of items across a work-group into a linear
-/// segment of items, guarded by range.
-///
-/// \tparam T The data type to store.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam OutputIteratorT The random-access iterator type used for
-/// output.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param output_iter The work-group's base output iterator for writing.
-/// \param data Data to store.
-/// \param valid_items Number of valid items to store
-template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-store_direct_striped(const ItemT &item, OutputIteratorT output_iter,
-                     T (&data)[ElementsPerWorkItem], size_t valid_items);
-
-/// Enumerates alternative algorithms for syclcompat::group::group_load to read
-/// a linear segment of data from memory into a blocked arrangement across a
-/// work-group.
-enum class group_load_algorithm {
-  /// A blocked arrangement of data is read directly from memory.
-  blocked,
-
-  /// A striped arrangement of data is read directly from memory.
-  striped
-};
-
-/// Provide methods for loading a linear segment of items from memory into a
-/// blocked arrangement across a work-group.
-///
-/// \tparam T The input data type.
-/// \tparam ElementsPerWorkItem The number of data elements assigned to a
-/// work-item.
-/// \tparam LoadAlgorithm The data movement strategy, default is blocked.
-template <typename T, size_t ElementsPerWorkItem,
-          group_load_algorithm LoadAlgorithm = group_load_algorithm::blocked>
-class group_load {
-public:
-  group_load(uint8_t *);
-
-  static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size);
-
-  /// Load a linear segment of items from memory.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   0, 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511.
-  ///
-  /// The blocked order \p data of each work-item will be:
-  ///
-  ///   {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}.
-  ///
-  /// The striped order \p data of each work-item will be:
-  ///
-  ///   {[0,128,256,384], [1,129,257,385], ..., [127,255,383,511]}.
-  ///
-  /// \tparam ItemT The sycl::nd_item index space class.
-  /// \tparam InputIteratorT The random-access iterator type used for
-  /// input.
-  /// \param item The work-item identifier.
-  /// \param input_iter The work-group's base input iterator for loading from.
-  /// \param data The data to load.
-  template <typename ItemT, typename InputIteratorT>
-  __syclcompat_inline__ void load(const ItemT &item, InputIteratorT input_iter,
-                                  T (&data)[ElementsPerWorkItem]);
-
-  /// Load a linear segment of items from memory, guarded by range.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and
-  /// valid_items is 5, the \p input across the work-group is:
-  ///
-  ///   0, 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511.
-  ///
-  /// The blocked order \p data of each work-item will be:
-  ///
-  ///   {[0,1,2,3], [4,?,?,?], ..., [?,?,?,?]}.
-  ///
-  /// The striped order \p data of each work-item will be:
-  ///
-  ///   {[0,?,?,?], [1,?,?,?], [2,?,?,?], [3,?,?,?] ..., [?,?,?,?]}.
-  ///
-  /// \tparam ItemT The sycl::nd_item index space class.
-  /// \tparam InputIteratorT The random-access iterator type for input
-  /// \iterator.
-  /// \param item The work-item identifier.
-  /// \param input_iter The work-group's base input iterator for loading from.
-  /// \param data The data to load.
-  /// \param valid_items Number of valid items to load
-  template <typename ItemT, typename InputIteratorT>
-  __syclcompat_inline__ void load(const ItemT &item, InputIteratorT input_iter,
-                                  T (&data)[ElementsPerWorkItem],
-                                  int valid_items);
-};
-
-/// Enumerates alternative algorithms for syclcompat::group::group_store to
-/// write a blocked arrangement of items across a work-group to a linear
-/// segment of memory.
-enum class group_store_algorithm {
-  /// A blocked arrangement of data is written directly to memory.
-  blocked,
-
-  /// A striped arrangement of data is written directly to memory.
-  striped,
-};
-
-/// Provide methods for writing a blocked arrangement of elements partitioned
-/// across a work-group to a linear segment of memory.
-///
-/// \tparam T The output data type.
-/// \tparam ElementsPerWorkItem The number of data elements assigned to a
-/// work-item.
-/// \tparam StoreAlgorithm The data movement strategy, default is blocked.
-template <typename T, size_t ElementsPerWorkItem,
-          group_store_algorithm StoreAlgorithm = group_store_algorithm::blocked>
-class group_store {
-public:
-  group_store(uint8_t *);
-
-  static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size);
-
-  /// Store items into a linear segment of memory.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}.
-  ///
-  /// The blocked order \p output will be:
-  ///
-  ///   0, 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511.
-  ///
-  /// The striped order \p output will be:
-  ///
-  ///   0, 128, 256, 384, 1, 129, 257, 385, ..., 127, 255, 383, 511.
-  ///
-  /// \tparam ItemT The sycl::nd_item index space class.
-  /// \tparam OutputIteratorT The random-access iterator type for \p output
-  /// iterator.
-  /// \param item The work-item identifier.
-  /// \param output_iter The work-group's base output iterator for writing.
-  /// \param data The data to store.
-  template <typename ItemT, typename OutputIteratorT>
-  __syclcompat_inline__ void store(const ItemT &item,
-                                   OutputIteratorT output_iter,
-                                   T (&data)[ElementsPerWorkItem]);
-
-  /// Store items into a linear segment of memory, guarded by range.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and
-  /// \p valid_items is 5, the \p output across the work-group is:
-  ///
-  ///   {[0,0,0,0], [0,0,0,0], ..., [0,0,0,0]}.
-  ///
-  /// The blocked order \p output will be:
-  ///
-  ///   0, 1, 2, 3, 4, 0, 0, 0, ..., 0, 0, 0, 0.
-  ///
-  /// The striped order \p output will be:
-  ///
-  ///   0, 4, 8, 12, 16, 0, 0, 0, ..., 0, 0, 0, 0.
-  ///
-  /// \tparam ItemT The sycl::nd_item index space class.
-  /// \tparam OutputIteratorT The random-access iterator type for \p output
-  /// iterator.
-  /// \param item The work-item identifier.
-  /// \param output_iter The work-group's base output iterator for writing.
-  /// \param data The data to store.
-  /// \param valid_items Number of valid items to store
-  template <typename ItemT, typename OutputIteratorT>
-  __syclcompat_inline__ void
-  store(const ItemT &item, OutputIteratorT output_iter,
-        T (&data)[ElementsPerWorkItem], size_t valid_items);
-};
-
-/// The work-group wide shuffle operations that allow work-items to exchange
-/// data elements with other work-items within the same work-group.
-///
-/// \tparam T The type of the data elements.
-/// \tparam group_dim_0 The first dimension size of the work-group.
-/// \tparam group_dim_1 The second dimension size of the work-group.
-/// \tparam group_dim_2 The third dimension size of the work-group.
-template <typename T, int group_dim_0, int group_dim_1 = 1, int group_dim_2 = 1>
-class group_shuffle {
-public:
-  group_shuffle(uint8_t *local_memory);
-
-  static constexpr size_t get_local_memory_size(size_t work_group_size);
-
-  /// Selects a value from a work-item at a given distance in the work-group
-  /// and stores the value in the output.
-  ///
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input from the calling work-item.
-  /// \param output The output where the selected data will be stored.
-  /// \param distance The distance of work-items to look ahead or behind in the
-  /// work-group.
-  template <typename ItemT>
-  __syclcompat_inline__ void select(const ItemT &item, T input, T &output,
-                                    int distance = 1);
-  /// Selects a value from a work-item at a given distance in the work-group
-  /// and stores the value in the output, using a wrapped index to handle
-  /// overflow.
-  ///
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be selected.
-  /// \param output The output where the selected data will be stored.
-  /// \param distance The number of work-items to look ahead in the
-  /// work-group.
-  template <typename ItemT>
-  __syclcompat_inline__ void select2(const ItemT &item, T input, T &output,
-                                     unsigned int distance = 1);
-  /// Performs a shuffle operation to move data to the right across the
-  /// work-items, shifting elements in a work-item array by one position to the
-  /// right.
-  ///
-  /// \tparam ElementsPerWorkItem The number of data elements per work-item.
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be shuffled.
-  /// \param output The array that will store the shuffle result.
-  template <int ElementsPerWorkItem, typename ItemT>
-  __syclcompat_inline__ void shuffle_right(const ItemT &item,
-                                           T (&input)[ElementsPerWorkItem],
-                                           T (&output)[ElementsPerWorkItem]);
-
-  /// Performs a shuffle operation to move data to the right across the
-  /// work-items, storing the suffix of the group after the shuffle operation.
-  ///
-  /// \tparam ElementsPerWorkItem The number of data elements per work-item.
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be shuffled.
-  /// \param output The array that will store the shuffle result.
-  /// \param group_suffix The suffix of the group after the shuffle.
-  template <int ElementsPerWorkItem, typename ItemT>
-  __syclcompat_inline__ void
-  shuffle_right(const ItemT &item, T (&input)[ElementsPerWorkItem],
-                T (&output)[ElementsPerWorkItem], T &group_suffix);
-
-  /// Performs a shuffle operation to move data to the left across the
-  /// work-items, shifting elements in a work-item array by one position to the
-  /// left.
-  ///
-  /// \tparam ElementsPerWorkItem The number of data elements per work-item.
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be shuffled.
-  /// \param output The array that will store the shuffle result.
-  template <int ElementsPerWorkItem, typename ItemT>
-  __syclcompat_inline__ void shuffle_left(const ItemT &item,
-                                          T (&input)[ElementsPerWorkItem],
-                                          T (&output)[ElementsPerWorkItem]);
-
-  /// Performs a shuffle operation to move data to the left across the
-  /// work-items, storing the prefix of the group before the shuffle operation.
-  ///
-  /// \tparam ElementsPerWorkItem The number of data elements per work-item.
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be shuffled.
-  /// \param output The array that will store the shuffle result.
-  /// \param group_prefix The prefix of the group before the shuffle.
-  template <int ElementsPerWorkItem, typename ItemT>
-  __syclcompat_inline__ void
-  shuffle_left(const ItemT &item, T (&input)[ElementsPerWorkItem],
-               T (&output)[ElementsPerWorkItem], T &group_prefix);
-};
-} // namespace syclcompat
-```
-
-## Sample Code
-
-The file [helloworld.cpp](../../test-e2e/syclcompat/helloworld.cpp) contains
-a simple example which computes `y = mx + b` implemented using this library.
-
-## Maintainers
-
-To report problems with this library, please open a new issue with the [COMPAT]
-tag at:
-
-<https://github.com/intel/llvm/issues>
-
-## Contributors
-
-Alberto Cabrera, Codeplay \
-Gordon Brown, Codeplay \
-Joe Todd, Codeplay \
-Pietro Ghiglio, Codeplay \
-Ruyman Reyes, Codeplay/Intel
-
-## Contributions
-
-This library is licensed under the Apache 2.0 license. If you have an idea for a
-new sample, different build system integration or even a fix for something that
-is broken, please get in contact.
diff --git a/sycl/include/syclcompat.hpp b/sycl/include/syclcompat.hpp
deleted file mode 100644
index c12ad8ef0cf89..0000000000000
--- a/sycl/include/syclcompat.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  syclcompat.hpp
- *
- *  Description:
- *    Main include header for SYCLcompat
- **************************************************************************/
-
-#pragma once
-
-#include <syclcompat/syclcompat.hpp>
diff --git a/sycl/include/syclcompat/atomic.hpp b/sycl/include/syclcompat/atomic.hpp
deleted file mode 100644
index 85f5dab65f7f1..0000000000000
--- a/sycl/include/syclcompat/atomic.hpp
+++ /dev/null
@@ -1,473 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  atomic.hpp
- *
- *  Description:
- *    Atomic functionality for the SYCL compatibility extension
- **************************************************************************/
-
-// The original source was under the license below:
-//==---- atomic.hpp -------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cassert>
-
-#include <sycl/access/access.hpp>
-#include <sycl/atomic_ref.hpp>
-#include <sycl/memory_enums.hpp>
-#include <sycl/multi_ptr.hpp>
-
-#include <syclcompat/traits.hpp>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-
-/// Atomically add the value operand to the value at the addr and assign the
-/// result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to add to the value at \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-inline T atomic_fetch_add(T *addr, arith_t<T> operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_add(operand);
-}
-
-/// Atomically subtract the value operand from the value at the addr and
-/// assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to subtract from the value at \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-inline T atomic_fetch_sub(T *addr, arith_t<T> operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_sub(operand);
-}
-
-/// Atomically perform a bitwise AND between the value operand and the value
-/// at the addr and assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to use in bitwise AND operation with the value at
-/// the \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-inline T atomic_fetch_and(T *addr, type_identity_t<T> operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_and(operand);
-}
-
-/// Atomically or the value at the addr with the value operand, and assign
-/// the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to use in bitwise OR operation with the value at
-/// the \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-inline T atomic_fetch_or(T *addr, type_identity_t<T> operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_or(operand);
-}
-
-/// Atomically xor the value at the addr with the value operand, and assign
-/// the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to use in bitwise XOR operation with the value at
-/// the \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-inline T atomic_fetch_xor(T *addr, type_identity_t<T> operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_xor(operand);
-}
-
-/// Atomically calculate the minimum of the value at addr and the value
-/// operand and assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand. \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-inline T atomic_fetch_min(T *addr, type_identity_t<T> operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_min(operand);
-}
-
-/// Atomically calculate the maximum of the value at addr and the value
-/// operand and assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-inline T atomic_fetch_max(T *addr, type_identity_t<T> operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_max(operand);
-}
-
-/// Atomically set \p operand to the value stored in \p addr, if old value
-/// stored in \p addr is equal to zero or greater than \p operand, else decrease
-/// the value stored in \p addr. \param [in, out] addr The pointer to the data.
-/// \param operand The threshold value.
-/// \param memoryOrder The memory ordering used.
-/// \returns The old value stored in \p addr.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-unsigned int atomic_fetch_compare_dec(unsigned int *addr,
-                                      unsigned int operand) {
-  auto atm =
-      sycl::atomic_ref<unsigned int, memoryOrder, memoryScope, addressSpace>(
-          addr[0]);
-  unsigned int old;
-
-  while (true) {
-    old = atm.load();
-    if (old == 0 || old > operand) {
-      if (atm.compare_exchange_strong(old, operand))
-        break;
-    } else if (atm.compare_exchange_strong(old, old - 1))
-      break;
-  }
-
-  return old;
-}
-
-/// Atomically increment the value stored in \p addr if old value stored in \p
-/// addr is less than \p operand, else set 0 to the value stored in \p addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The threshold value.
-/// \param memoryOrder The memory ordering used.
-/// \returns The old value stored in \p addr.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline unsigned int atomic_fetch_compare_inc(unsigned int *addr,
-                                             unsigned int operand) {
-  auto atm =
-      sycl::atomic_ref<unsigned int, memoryOrder, memoryScope, addressSpace>(
-          addr[0]);
-  unsigned int old;
-  while (true) {
-    old = atm.load();
-    if (old >= operand) {
-      if (atm.compare_exchange_strong(old, 0))
-        break;
-    } else if (atm.compare_exchange_strong(old, old + 1))
-      break;
-  }
-  return old;
-}
-
-/// Atomically exchange the value at the address addr with the value operand.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to be exchanged with the value pointed by \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-inline T atomic_exchange(T *addr, type_identity_t<T> operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.exchange(operand);
-}
-
-/// Atomically compare the value at \p addr to the value expected and exchange
-/// with the value desired if the value at \p addr is equal to the value
-/// expected. Returns the value at the \p addr before the call.
-/// \param [in, out] addr Multi_ptr.
-/// \param expected The value to compare against the value at \p addr.
-/// \param desired The value to assign to \p addr if the value at \p addr
-/// is expected.
-/// \param success The memory ordering used when comparison succeeds.
-/// \param fail The memory ordering used when comparison fails.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_compare_exchange_strong(
-    sycl::multi_ptr<T, addressSpace> addr, type_identity_t<T> expected,
-    type_identity_t<T> desired,
-    sycl::memory_order success = sycl::memory_order::relaxed,
-    sycl::memory_order fail = sycl::memory_order::relaxed) {
-  auto atm = sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(*addr);
-
-  atm.compare_exchange_strong(expected, desired, success, fail);
-  return expected;
-}
-
-/// Atomically compare the value at \p addr to the value expected and exchange
-/// with the value desired if the value at \p addr is equal to the value
-/// expected. Returns the value at the \p addr before the call.
-/// \param [in] addr The pointer to the data.
-/// \param expected The value to compare against the value at \p addr.
-/// \param desired The value to assign to \p addr if the value at \p addr is
-/// expected.
-/// \param success The memory ordering used when comparison succeeds.
-/// \param fail The memory ordering used when comparison fails.
-/// \returns The value at the \p addr before the call.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::generic_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T>
-T atomic_compare_exchange_strong(
-    T *addr, type_identity_t<T> expected, type_identity_t<T> desired,
-    sycl::memory_order success = sycl::memory_order::relaxed,
-    sycl::memory_order fail = sycl::memory_order::relaxed) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  atm.compare_exchange_strong(expected, desired, success, fail);
-  return expected;
-}
-
-/// Atomic extension to implement standard APIs in std::atomic
-namespace detail {
-template <typename T> struct IsValidAtomicType {
-  static constexpr bool value =
-      (std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
-       std::is_same<T, long>::value || std::is_same<T, unsigned long>::value ||
-       std::is_same<T, long long>::value ||
-       std::is_same<T, unsigned long long>::value ||
-       std::is_same<T, float>::value || std::is_same<T, double>::value ||
-       std::is_pointer<T>::value);
-};
-} // namespace detail
-
-template <typename T,
-          sycl::memory_scope DefaultScope = sycl::memory_scope::system,
-          sycl::memory_order DefaultOrder = sycl::memory_order::seq_cst,
-          sycl::access::address_space Space =
-              sycl::access::address_space::generic_space>
-class atomic {
-  static_assert(
-      detail::IsValidAtomicType<T>::value,
-      "Invalid atomic type.  Valid types are int, unsigned int, long, "
-      "unsigned long, long long, unsigned long long, float, double "
-      "and pointer types");
-  T __d;
-
-public:
-  /// default memory synchronization order
-  static constexpr sycl::memory_order default_read_order =
-      sycl::atomic_ref<T, DefaultOrder, DefaultScope,
-                       Space>::default_read_order;
-  static constexpr sycl::memory_order default_write_order =
-      sycl::atomic_ref<T, DefaultOrder, DefaultScope,
-                       Space>::default_write_order;
-  static constexpr sycl::memory_scope default_scope = DefaultScope;
-  static constexpr sycl::memory_order default_read_modify_write_order =
-      DefaultOrder;
-
-  /// Default constructor.
-  constexpr atomic() noexcept = default;
-  /// Constructor with initialize value.
-  constexpr atomic(T d) noexcept : __d(d){};
-
-  /// atomically replaces the value of the referenced object with a non-atomic
-  /// argument
-  /// \param operand The value to replace the pointed value.
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  void store(T operand, sycl::memory_order memoryOrder = default_write_order,
-             sycl::memory_scope memoryScope = default_scope) noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    atm.store(operand, memoryOrder, memoryScope);
-  }
-
-  /// atomically obtains the value of the referenced object
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  /// \returns The value of the referenced object
-  T load(sycl::memory_order memoryOrder = default_read_order,
-         sycl::memory_scope memoryScope = default_scope) const noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(
-        const_cast<T &>(__d));
-    return atm.load(memoryOrder, memoryScope);
-  }
-
-  /// atomically replaces the value of the referenced object and obtains the
-  /// value held previously
-  /// \param operand The value to replace the pointed value.
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  /// \returns The value of the referenced object before the call.
-  T exchange(T operand,
-             sycl::memory_order memoryOrder = default_read_modify_write_order,
-             sycl::memory_scope memoryScope = default_scope) noexcept {
-
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.exchange(operand, memoryOrder, memoryScope);
-  }
-
-  /// atomically compares the value of the referenced object with non-atomic
-  /// argument and performs atomic exchange if equal or atomic load if not
-  /// \param expected The value expected to be found in the object referenced by
-  /// the atomic_ref object
-  /// \param desired  The value to store in the referenced object if it is as
-  /// expected
-  /// \param success The memory models for the read-modify-write
-  /// \param failure The memory models for load operations
-  /// \param memoryScope The memory scope used.
-  /// \returns true if the referenced object was successfully changed, false
-  /// otherwise.
-  bool compare_exchange_weak(
-      T &expected, T desired, sycl::memory_order success,
-      sycl::memory_order failure,
-      sycl::memory_scope memoryScope = default_scope) noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.compare_exchange_weak(expected, desired, success, failure,
-                                     memoryScope);
-  }
-  /// \param expected The value expected to be found in the object referenced by
-  /// the atomic_ref object
-  /// \param desired  The value to store in the referenced
-  /// object if it is as expected
-  /// \param memoryOrder 	The memory synchronization ordering for
-  /// operations
-  /// \param memoryScope The memory scope used.
-  /// \returns true if the referenced object was successfully
-  /// changed, false otherwise.
-  bool compare_exchange_weak(
-      T &expected, T desired,
-      sycl::memory_order memoryOrder = default_read_modify_write_order,
-      sycl::memory_scope memoryScope = default_scope) noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.compare_exchange_weak(expected, desired, memoryOrder,
-                                     memoryScope);
-  }
-
-  /// atomically compares the value of the referenced object with non-atomic
-  /// argument and performs atomic exchange if equal or atomic load if not
-  /// \param expected The value expected to be found in the object referenced by
-  /// the atomic_ref object
-  /// \param desired  The value to store in the referenced
-  /// object if it is as expected
-  /// \param success The memory models for the
-  /// read-modify-write
-  /// \param failure The memory models for load operations
-  /// \param memoryScope The memory scope used.
-  /// \returns true if the referenced object was successfully changed, false
-  /// otherwise.
-  bool compare_exchange_strong(
-      T &expected, T desired, sycl::memory_order success,
-      sycl::memory_order failure,
-      sycl::memory_scope memoryScope = default_scope) noexcept {
-
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.compare_exchange_strong(expected, desired, success, failure,
-                                       memoryScope);
-  }
-  /// \param expected The value expected to be found in the object referenced by
-  /// the atomic_ref object
-  /// \param desired The value to store in the referenced
-  /// object if it is as expected
-  /// \param memoryOrder 	The memory synchronization ordering for
-  /// operations
-  /// \param memoryScope The memory scope used.
-  /// \returns true if the referenced object was successfully changed, false
-  /// otherwise.
-  bool compare_exchange_strong(
-      T &expected, T desired,
-      sycl::memory_order memoryOrder = default_read_modify_write_order,
-      sycl::memory_scope memoryScope = default_scope) noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.compare_exchange_strong(expected, desired, memoryOrder,
-                                       memoryScope);
-  }
-
-  /// atomically adds the argument to the value stored in the atomic object and
-  /// obtains the value held previously
-  /// \param operand 	The other argument of arithmetic addition
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  /// \returns The value of the referenced object before the call.
-  T fetch_add(arith_t<T> operand,
-              sycl::memory_order memoryOrder = default_read_modify_write_order,
-              sycl::memory_scope memoryScope = default_scope) noexcept {
-
-    auto atm = sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>(__d);
-    return atm.fetch_add(operand, memoryOrder, memoryScope);
-  }
-
-  /// atomically subtracts the argument from the value stored in the atomic
-  /// object and obtains the value held previously
-  /// \param operand 	The other argument of arithmetic subtraction
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  /// \returns The value of the referenced object before the call.
-  T fetch_sub(arith_t<T> operand,
-              sycl::memory_order memoryOrder = default_read_modify_write_order,
-              sycl::memory_scope memoryScope = default_scope) noexcept {
-
-    auto atm = sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>(__d);
-    return atm.fetch_sub(operand, memoryOrder, memoryScope);
-  }
-};
-
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/defs.hpp b/sycl/include/syclcompat/defs.hpp
deleted file mode 100644
index 32f0c2197bde7..0000000000000
--- a/sycl/include/syclcompat/defs.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  defs.hpp
- *
- *  Description:
- *    helper aliases and definitions for SYCLcompat
- *
- **************************************************************************/
-
-// The original source was under the license below:
-//==---- defs.hpp ---------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <iostream>
-
-template <class... Args> class syclcompat_kernel_name;
-template <int Arg> class syclcompat_kernel_scalar;
-
-#if defined(_MSC_VER)
-#define __syclcompat_align__(n) __declspec(align(n))
-#define __syclcompat_inline__ __forceinline
-#define __syclcompat_noinline__ __declspec(noinline)
-#else
-#define __syclcompat_align__(n) __attribute__((aligned(n)))
-#define __syclcompat_inline__ __inline__ __attribute__((always_inline))
-#define __syclcompat_noinline__ __attribute__((noinline))
-#endif
-
-#define SYCLCOMPAT_COMPATIBILITY_TEMP (900)
-
-#ifdef _WIN32
-#define SYCLCOMPAT_EXPORT __declspec(dllexport)
-#else
-#define SYCLCOMPAT_EXPORT
-#endif
-
-#define SYCLCOMPAT_MAJOR_VERSION 0
-#define SYCLCOMPAT_MINOR_VERSION 2
-#define SYCLCOMPAT_PATCH_VERSION 0
-
-#define SYCLCOMPAT_MAKE_VERSION(_major, _minor, _patch)                        \
-  ((1E6 * _major) + (1E3 * _minor) + _patch)
-
-#define SYCLCOMPAT_VERSION                                                     \
-  SYCLCOMPAT_MAKE_VERSION(SYCLCOMPAT_MAJOR_VERSION, SYCLCOMPAT_MINOR_VERSION,  \
-                          SYCLCOMPAT_PATCH_VERSION)
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-enum error_code { success = 0, backend_error = 1, default_error = 999 };
-/// A dummy function introduced to assist auto migration.
-/// The SYCLomatic user should replace it with a real error-handling function.
-/// SYCL reports errors using exceptions and does not use error codes.
-inline const char *get_error_string_dummy(int ec) {
-  (void)ec;
-  return "<FIXME: Placeholder>"; // Return the error string for the error code
-                                 // ec.
-}
-} // namespace syclcompat
-
-#define SYCLCOMPAT_CHECK_ERROR(expr)                                           \
-  [&]() {                                                                      \
-    try {                                                                      \
-      expr;                                                                    \
-      return syclcompat::error_code::success;                                  \
-    } catch (sycl::exception const &e) {                                       \
-      std::cerr << e.what() << std::endl;                                      \
-      return syclcompat::error_code::backend_error;                            \
-    } catch (std::runtime_error const &e) {                                    \
-      std::cerr << e.what() << std::endl;                                      \
-      return syclcompat::error_code::default_error;                            \
-    }                                                                          \
-  }()
diff --git a/sycl/include/syclcompat/device.hpp b/sycl/include/syclcompat/device.hpp
deleted file mode 100644
index 5951b4fc6492c..0000000000000
--- a/sycl/include/syclcompat/device.hpp
+++ /dev/null
@@ -1,954 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  device.hpp
- *
- *  Description:
- *    Device functionality for the SYCL compatibility extension
- **************************************************************************/
-
-// The original source was under the license below:
-//==---- device.hpp -------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <algorithm>
-#include <cstring>
-#include <iostream>
-#include <map>
-#include <mutex>
-#include <set>
-#include <sstream>
-#include <thread>
-#include <vector>
-#if defined(__linux__)
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#endif
-
-#include <sycl/detail/defines_elementary.hpp>
-#include <sycl/exception_list.hpp>
-#include <sycl/properties/queue_properties.hpp>
-#include <sycl/queue.hpp>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-
-namespace detail {
-static void parse_version_string(const std::string &ver, int &major,
-                                 int &minor) {
-  // Version string has the following format:
-  // a. OpenCL<space><major.minor><space><vendor-specific-information>
-  // b. <major.minor>
-  // c. <AmdGcnArchName> e.g gfx1030
-  std::string::size_type i = 0;
-  while (i < ver.size()) {
-    if (isdigit(ver[i]))
-      break;
-    i++;
-  }
-  if (i < ver.size())
-    major = std::stoi(&(ver[i]));
-  else
-    major = 0;
-  while (i < ver.size()) {
-    if (ver[i] == '.')
-      break;
-    i++;
-  }
-  i++;
-  if (i < ver.size())
-    minor = std::stoi(&(ver[i]));
-  else
-    minor = 0;
-}
-
-static void get_version(const sycl::device &dev, int &major, int &minor) {
-  std::string ver = dev.get_info<sycl::info::device::version>();
-  parse_version_string(ver, major, minor);
-}
-
-/// SYCL default exception handler
-inline auto exception_handler = [](sycl::exception_list exceptions) {
-  for (std::exception_ptr const &e : exceptions) {
-    try {
-      std::rethrow_exception(e);
-    } catch (sycl::exception const &e) {
-      std::cerr << "[SYCLcompat] Caught asynchronous SYCL exception:"
-                << std::endl
-                << e.what() << std::endl
-                << "Exception caught at file:" << __FILE__
-                << ", line:" << __LINE__ << std::endl;
-    }
-  }
-};
-
-} // namespace detail
-
-using event_ptr = sycl::event *;
-
-using queue_ptr = sycl::queue *;
-
-using device_ptr = char *;
-
-/// Destroy \p event pointed memory.
-///
-/// \param event Pointer to the sycl::event address.
-static void destroy_event(event_ptr event) { delete event; }
-
-class device_info {
-public:
-  // get interface
-  const char *get_name() const { return _name; }
-  char *get_name() { return _name; }
-  template <typename WorkItemSizesTy = sycl::range<3>,
-            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                 std::is_same_v<WorkItemSizesTy, int *>,
-                             int> = 0>
-  auto get_max_work_item_sizes() const {
-    if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-      return _max_work_item_sizes;
-    else
-      return _max_work_item_sizes_i;
-  }
-  template <typename WorkItemSizesTy = sycl::range<3>,
-            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                 std::is_same_v<WorkItemSizesTy, int *>,
-                             int> = 0>
-  auto get_max_work_item_sizes() {
-    if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-      return _max_work_item_sizes;
-    else
-      return _max_work_item_sizes_i;
-  }
-  bool get_host_unified_memory() const { return _host_unified_memory; }
-  int get_major_version() const { return _major; }
-  int get_minor_version() const { return _minor; }
-  int get_integrated() const { return _integrated; }
-  int get_max_clock_frequency() const { return _frequency; }
-  int get_max_compute_units() const { return _max_compute_units; }
-  int get_max_work_group_size() const { return _max_work_group_size; }
-  int get_max_sub_group_size() const { return _max_sub_group_size; }
-  int get_max_work_items_per_compute_unit() const {
-    return _max_work_items_per_compute_unit;
-  }
-  int get_max_register_size_per_work_group() const {
-    return _max_register_size_per_work_group;
-  }
-  template <typename NDRangeSizeTy = size_t *,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                 std::is_same_v<NDRangeSizeTy, int *>,
-                             int> = 0>
-  auto get_max_nd_range_size() const {
-    if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-      return _max_nd_range_size;
-    else
-      return _max_nd_range_size_i;
-  }
-  template <typename NDRangeSizeTy = size_t *,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                 std::is_same_v<NDRangeSizeTy, int *>,
-                             int> = 0>
-  auto get_max_nd_range_size() {
-    if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-      return _max_nd_range_size;
-    else
-      return _max_nd_range_size_i;
-  }
-  size_t get_global_mem_size() const { return _global_mem_size; }
-  size_t get_local_mem_size() const { return _local_mem_size; }
-  /// Returns the maximum clock rate of device's global memory in kHz. If
-  /// compiler does not support this API then returns default value 3200000 kHz.
-  unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
-  /// Returns the maximum bus width between device and memory in bits. If
-  /// compiler does not support this API then returns default value 64 bits.
-  unsigned int get_memory_bus_width() const { return _memory_bus_width; }
-  uint32_t get_device_id() const { return _device_id; }
-  std::array<unsigned char, 16> get_uuid() const { return _uuid; }
-  /// Returns global memory cache size in bytes.
-  unsigned int get_global_mem_cache_size() const {
-    return _global_mem_cache_size;
-  }
-  int get_image1d_max() const { return _image1d_max; }
-  auto get_image2d_max() const { return _image2d_max; }
-  auto get_image2d_max() { return _image2d_max; }
-  auto get_image3d_max() const { return _image3d_max; }
-  auto get_image3d_max() { return _image3d_max; }
-
-  // set interface
-  void set_name(const char *name) {
-    size_t length = strlen(name);
-    if (length < device_info::NAME_BUFFER_SIZE) {
-      std::memcpy(_name, name, length + 1);
-    } else {
-      std::memcpy(_name, name, device_info::NAME_BUFFER_SIZE - 1);
-      _name[255] = '\0';
-    }
-  }
-  void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) {
-    _max_work_item_sizes = max_work_item_sizes;
-    for (int i = 0; i < 3; ++i)
-      _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-  }
-  [[deprecated]] void
-  set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) {
-    for (int i = 0; i < 3; ++i) {
-      _max_work_item_sizes[i] = max_work_item_sizes[i];
-      _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-    }
-  }
-  void set_host_unified_memory(bool host_unified_memory) {
-    _host_unified_memory = host_unified_memory;
-  }
-  void set_major_version(int major) { _major = major; }
-  void set_minor_version(int minor) { _minor = minor; }
-  void set_integrated(int integrated) { _integrated = integrated; }
-  void set_max_clock_frequency(int frequency) { _frequency = frequency; }
-  void set_max_compute_units(int max_compute_units) {
-    _max_compute_units = max_compute_units;
-  }
-  void set_global_mem_size(size_t global_mem_size) {
-    _global_mem_size = global_mem_size;
-  }
-  void set_local_mem_size(size_t local_mem_size) {
-    _local_mem_size = local_mem_size;
-  }
-  void set_max_work_group_size(int max_work_group_size) {
-    _max_work_group_size = max_work_group_size;
-  }
-  void set_max_sub_group_size(int max_sub_group_size) {
-    _max_sub_group_size = max_sub_group_size;
-  }
-  void
-  set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) {
-    _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
-  }
-  void set_max_nd_range_size(int max_nd_range_size[]) {
-    for (int i = 0; i < 3; i++) {
-      _max_nd_range_size[i] = max_nd_range_size[i];
-      _max_nd_range_size_i[i] = max_nd_range_size[i];
-    }
-  }
-  void set_max_nd_range_size(sycl::id<3> max_nd_range_size) {
-    for (int i = 0; i < 3; i++) {
-      _max_nd_range_size[i] = max_nd_range_size[i];
-      _max_nd_range_size_i[i] = max_nd_range_size[i];
-    }
-  }
-  void set_memory_clock_rate(unsigned int memory_clock_rate) {
-    _memory_clock_rate = memory_clock_rate;
-  }
-  void set_memory_bus_width(unsigned int memory_bus_width) {
-    _memory_bus_width = memory_bus_width;
-  }
-  void
-  set_max_register_size_per_work_group(int max_register_size_per_work_group) {
-    _max_register_size_per_work_group = max_register_size_per_work_group;
-  }
-  void set_device_id(uint32_t device_id) { _device_id = device_id; }
-  void set_uuid(std::array<unsigned char, 16> uuid) { _uuid = std::move(uuid); }
-  void set_global_mem_cache_size(unsigned int global_mem_cache_size) {
-    _global_mem_cache_size = global_mem_cache_size;
-  }
-  void set_image1d_max(size_t image_max_buffer_size) {
-    _image1d_max = image_max_buffer_size;
-  }
-  void set_image2d_max(size_t image_max_width_buffer_size,
-                       size_t image_max_height_buffer_size) {
-    _image2d_max[0] = image_max_width_buffer_size;
-    _image2d_max[1] = image_max_height_buffer_size;
-  }
-  void set_image3d_max(size_t image_max_width_buffer_size,
-                       size_t image_max_height_buffer_size,
-                       size_t image_max_depth_buffer_size) {
-    _image3d_max[0] = image_max_width_buffer_size;
-    _image3d_max[1] = image_max_height_buffer_size;
-    _image3d_max[2] = image_max_depth_buffer_size;
-  }
-
-private:
-  constexpr static size_t NAME_BUFFER_SIZE = 256;
-
-  char _name[device_info::NAME_BUFFER_SIZE];
-  sycl::range<3> _max_work_item_sizes;
-  int _max_work_item_sizes_i[3];
-  bool _host_unified_memory = false;
-  int _major;
-  int _minor;
-  int _integrated = 0;
-  int _frequency;
-  // Set estimated value 3200000 kHz as default value.
-  unsigned int _memory_clock_rate = 3200000;
-  // Set estimated value 64 bits as default value.
-  unsigned int _memory_bus_width = 64;
-  unsigned int _global_mem_cache_size;
-  int _max_compute_units;
-  int _max_work_group_size;
-  int _max_sub_group_size;
-  int _max_work_items_per_compute_unit;
-  int _max_register_size_per_work_group;
-  size_t _global_mem_size;
-  size_t _local_mem_size;
-  size_t _max_nd_range_size[3];
-  int _max_nd_range_size_i[3];
-  uint32_t _device_id;
-  std::array<unsigned char, 16> _uuid;
-  int _image1d_max;
-  int _image2d_max[2];
-  int _image3d_max[3];
-};
-
-static int get_major_version(const sycl::device &dev) {
-  int major, minor;
-  detail::get_version(dev, major, minor);
-  return major;
-}
-
-static int get_minor_version(const sycl::device &dev) {
-  int major, minor;
-  detail::get_version(dev, major, minor);
-  return minor;
-}
-
-static inline void
-has_capability_or_fail(const sycl::device &dev,
-                       const std::initializer_list<sycl::aspect> &props) {
-  for (const auto &it : props) {
-    if (dev.has(it))
-      continue;
-    switch (it) {
-    case sycl::aspect::fp64:
-      throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
-                            "[SYCLcompat] 'double' is not supported in '" +
-                                dev.get_info<sycl::info::device::name>() +
-                                "' device");
-      break;
-    case sycl::aspect::fp16:
-      throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
-                            "[SYCLcompat] 'half' is not supported in '" +
-                                dev.get_info<sycl::info::device::name>() +
-                                "' device");
-      break;
-    default:
-#define __SYCL_ASPECT(ASPECT, ID)                                              \
-  case sycl::aspect::ASPECT:                                                   \
-    return #ASPECT;
-#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
-#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
-      auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string {
-        switch (AspectNum) {
-#include <sycl/info/aspects.def>
-#include <sycl/info/aspects_deprecated.def>
-        default:
-          return "unknown aspect";
-        }
-      };
-#undef __SYCL_ASPECT_DEPRECATED_ALIAS
-#undef __SYCL_ASPECT_DEPRECATED
-#undef __SYCL_ASPECT
-      throw sycl::exception(
-          sycl::make_error_code(sycl::errc::runtime),
-          "[SYCLcompat] '" + getAspectNameStr(it) + "' is not supported in '" +
-              dev.get_info<sycl::info::device::name>() + "' device");
-    }
-    break;
-  }
-}
-
-/// device extension
-class device_ext : public sycl::device {
-public:
-  device_ext() : sycl::device(), _ctx(*this) {}
-  ~device_ext() {
-    try {
-      std::lock_guard<std::mutex> lock(m_mutex);
-      sycl::event::wait(_events);
-      _queues.clear();
-    } catch (std::exception &e) {
-      __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~device_ext", e);
-    }
-  }
-  device_ext(const sycl::device &base, bool print_on_async_exceptions = false,
-             bool in_order = true)
-      : sycl::device(base), _ctx(*this) {
-    if (!this->has(sycl::aspect::usm_device_allocations)) {
-      throw std::invalid_argument(
-          "Device does not support device USM allocations");
-    }
-    // calls create_queue since we don't have a locked m_mutex
-    _default_queue = create_queue(print_on_async_exceptions, in_order);
-    _saved_queue = _default_queue;
-  }
-
-  bool is_native_host_atomic_supported() { return false; }
-  int get_major_version() const { return syclcompat::get_major_version(*this); }
-
-  int get_minor_version() const { return syclcompat::get_minor_version(*this); }
-
-  int get_max_compute_units() const {
-    return get_device_info().get_max_compute_units();
-  }
-
-  /// Return the maximum clock frequency of this device in KHz.
-  int get_max_clock_frequency() const {
-    return get_device_info().get_max_clock_frequency();
-  }
-
-  int get_integrated() const { return get_device_info().get_integrated(); }
-
-  int get_max_sub_group_size() const {
-    return get_device_info().get_max_sub_group_size();
-  }
-
-  int get_max_register_size_per_work_group() const {
-    return get_device_info().get_max_register_size_per_work_group();
-  }
-
-  int get_max_work_group_size() const {
-    return get_device_info().get_max_work_group_size();
-  }
-
-  int get_mem_base_addr_align() const {
-    return get_info<sycl::info::device::mem_base_addr_align>();
-  }
-
-  size_t get_global_mem_size() const {
-    return get_device_info().get_global_mem_size();
-  }
-
-  size_t get_local_mem_size() const {
-    return get_device_info().get_local_mem_size();
-  }
-
-  /// Get the number of bytes of free and total memory on the SYCL device.
-  /// \param [out] free_memory The number of bytes of free memory on the SYCL
-  /// device.
-  /// \param [out] total_memory The number of bytes of total memory on the SYCL
-  /// device.
-  void get_memory_info(size_t &free_memory, size_t &total_memory) const {
-    if (!has(sycl::aspect::ext_intel_free_memory)) {
-      std::cerr << "[SYCLCompat] get_memory_info: ext_intel_free_memory is not "
-                   "supported."
-                << std::endl;
-      free_memory = 0;
-    } else {
-      free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
-    }
-    total_memory = get_device_info().get_global_mem_size();
-  }
-
-  void get_device_info(device_info &out) const {
-    if (_dev_info) {
-      out = *_dev_info;
-      return;
-    }
-
-    std::lock_guard<std::mutex> lock(m_mutex);
-    device_info prop;
-    prop.set_name(get_info<sycl::info::device::name>().c_str());
-
-    int major, minor;
-    get_version(major, minor);
-    prop.set_major_version(major);
-    prop.set_minor_version(minor);
-
-    prop.set_max_work_item_sizes(
-        // SYCL 2020-conformant code, max_work_item_sizes is a struct
-        // templated by an int
-        get_info<sycl::info::device::max_work_item_sizes<3>>());
-
-    prop.set_host_unified_memory(has(sycl::aspect::usm_host_allocations));
-
-    prop.set_max_clock_frequency(
-        get_info<sycl::info::device::max_clock_frequency>());
-    prop.set_max_compute_units(
-        get_info<sycl::info::device::max_compute_units>());
-    prop.set_max_work_group_size(
-        get_info<sycl::info::device::max_work_group_size>());
-    prop.set_global_mem_size(get_info<sycl::info::device::global_mem_size>());
-    prop.set_local_mem_size(get_info<sycl::info::device::local_mem_size>());
-
-#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
-    if (has(sycl::aspect::ext_intel_memory_clock_rate)) {
-      unsigned int tmp =
-          get_info<sycl::ext::intel::info::device::memory_clock_rate>();
-      if (tmp != 0)
-        prop.set_memory_clock_rate(1000 * tmp);
-    }
-    if (has(sycl::aspect::ext_intel_memory_bus_width)) {
-      prop.set_memory_bus_width(
-          get_info<sycl::ext::intel::info::device::memory_bus_width>());
-    }
-    if (has(sycl::aspect::ext_intel_device_id)) {
-      prop.set_device_id(get_info<sycl::ext::intel::info::device::device_id>());
-    }
-    if (has(sycl::aspect::ext_intel_device_info_uuid)) {
-      prop.set_uuid(get_info<sycl::ext::intel::info::device::uuid>());
-    }
-#elif defined(_MSC_VER) && !defined(__clang__)
-#pragma message("get_device_info: querying memory_clock_rate and \
-memory_bus_width are not supported by the compiler used. \
-Use 3200000 kHz as memory_clock_rate default value. \
-Use 64 bits as memory_bus_width default value.")
-#else
-#warning "get_device_info: querying memory_clock_rate and \
-memory_bus_width are not supported by the compiler used. \
-Use 3200000 kHz as memory_clock_rate default value. \
-Use 64 bits as memory_bus_width default value."
-#endif
-
-    size_t max_sub_group_size = 1;
-    std::vector<size_t> sub_group_sizes =
-        get_info<sycl::info::device::sub_group_sizes>();
-
-    for (const auto &sub_group_size : sub_group_sizes) {
-      if (max_sub_group_size < sub_group_size)
-        max_sub_group_size = sub_group_size;
-    }
-
-    prop.set_max_sub_group_size(max_sub_group_size);
-
-    prop.set_max_work_items_per_compute_unit(
-        get_info<sycl::info::device::max_work_group_size>());
-#ifdef SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY
-    prop.set_max_nd_range_size(
-        get_info<sycl::ext::oneapi::experimental::info::device::max_work_groups<
-            3>>());
-#else
-#if defined(_MSC_VER) && !defined(__clang__)
-#pragma message("get_device_info: querying the maximum number \
-    of work groups is not supported.")
-#else
-#warning "get_device_info: querying the maximum number of \
-    work groups is not supported."
-#endif
-    int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
-    prop.set_max_nd_range_size(max_nd_range_size);
-#endif
-
-    // Estimates max register size per work group, feel free to update the
-    // value according to device properties.
-    prop.set_max_register_size_per_work_group(65536);
-
-    prop.set_global_mem_cache_size(
-        get_info<sycl::info::device::global_mem_cache_size>());
-
-    prop.set_image1d_max(get_info<sycl::info::device::image_max_buffer_size>());
-    prop.set_image1d_max(get_info<sycl::info::device::image_max_buffer_size>());
-    prop.set_image2d_max(get_info<sycl::info::device::image2d_max_width>(),
-                         get_info<sycl::info::device::image2d_max_height>());
-    prop.set_image3d_max(get_info<sycl::info::device::image3d_max_width>(),
-                         get_info<sycl::info::device::image3d_max_height>(),
-                         get_info<sycl::info::device::image3d_max_height>());
-
-    _dev_info = prop;
-    out = prop;
-  }
-
-  device_info get_device_info() const {
-    if (!_dev_info) {
-      this->get_device_info(*_dev_info);
-    }
-    return _dev_info.value();
-  }
-
-  void reset(bool print_on_async_exceptions = false, bool in_order = true) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    // The queues are shared_ptrs and the ref counts of the shared_ptrs increase
-    // only in wait_and_throw(). If there is no other thread calling
-    // wait_and_throw(), the queues will be destructed. The destructor waits for
-    // all commands executing on the queue to complete. It isn't possible to
-    // destroy a queue immediately. This is a synchronization point in SYCL.
-    _queues.clear();
-    // create new default queue
-    // calls create_queue_impl since we already have a locked m_mutex
-
-    _saved_queue = _default_queue =
-        in_order ? create_queue_impl(print_on_async_exceptions,
-                                     sycl::property::queue::in_order())
-                 : create_queue_impl(print_on_async_exceptions);
-  }
-
-  void set_default_queue(const sycl::queue &q) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    _queues.front().get()->wait_and_throw();
-    _queues[0] = std::make_shared<sycl::queue>(q);
-    if (_saved_queue == _default_queue)
-      _saved_queue = _queues.front().get();
-    _default_queue = _queues.front().get();
-  }
-
-  queue_ptr default_queue() { return _default_queue; }
-
-  void queues_wait_and_throw() {
-    std::unique_lock<std::mutex> lock(m_mutex);
-    std::vector<std::shared_ptr<sycl::queue>> current_queues(_queues);
-    lock.unlock();
-    for (const auto &q : current_queues) {
-      q->wait_and_throw();
-    }
-    // Guard the destruct of current_queues to make sure the ref count is safe.
-    lock.lock();
-  }
-  queue_ptr create_queue(bool print_on_async_exceptions = false,
-                         bool in_order = true) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    return in_order ? create_queue_impl(print_on_async_exceptions,
-                                        sycl::property::queue::in_order())
-                    : create_queue_impl(print_on_async_exceptions);
-  }
-  void destroy_queue(queue_ptr &queue) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    _queues.erase(
-        std::remove_if(_queues.begin(), _queues.end(),
-                       [=](const std::shared_ptr<sycl::queue> &q) -> bool {
-                         return q.get() == queue;
-                       }),
-        _queues.end());
-    queue = nullptr;
-  }
-  void set_saved_queue(queue_ptr q) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    _saved_queue = q;
-  }
-  queue_ptr get_saved_queue() const {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    return _saved_queue;
-  }
-  sycl::context get_context() const { return _ctx; }
-
-  /// Util function to check whether a device supports some kinds of
-  /// sycl::aspect.
-  void has_capability_or_fail(
-      const std::initializer_list<sycl::aspect> &props) const {
-    ::syclcompat::has_capability_or_fail(*this, props);
-  }
-
-private:
-  /// Caller should only be done from functions where the resource \p m_mutex
-  /// has been acquired.
-  template <typename... PropertiesT>
-  queue_ptr create_queue_impl(bool print_on_async_exceptions = false,
-                              PropertiesT... properties) {
-    sycl::property_list prop = sycl::property_list(
-#ifdef SYCLCOMPAT_PROFILING_ENABLED
-        sycl::property::queue::enable_profiling(),
-#endif
-        properties...);
-    if (print_on_async_exceptions) {
-      _queues.push_back(std::make_shared<sycl::queue>(
-          _ctx, *this, detail::exception_handler, prop));
-    } else {
-      _queues.push_back(std::make_shared<sycl::queue>(_ctx, *this, prop));
-    }
-    return _queues.back().get();
-  }
-
-  void get_version(int &major, int &minor) const {
-    detail::get_version(*this, major, minor);
-  }
-  void add_event(sycl::event event) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    _events.push_back(event);
-  }
-  friend sycl::event enqueue_free(const std::vector<void *> &,
-                                  const std::vector<sycl::event> &,
-                                  sycl::queue);
-  queue_ptr _default_queue;
-  queue_ptr _saved_queue;
-  sycl::context _ctx;
-  std::vector<std::shared_ptr<sycl::queue>> _queues;
-  mutable std::mutex m_mutex;
-  std::vector<sycl::event> _events;
-  mutable std::optional<device_info> _dev_info;
-};
-
-namespace detail {
-
-static inline unsigned int get_tid() {
-#if defined(__linux__)
-  return syscall(SYS_gettid);
-#elif defined(_WIN64)
-  return GetCurrentThreadId();
-#else
-#error "Only support Windows and Linux."
-#endif
-}
-
-/// device manager
-class dev_mgr {
-public:
-  device_ext &current_device() {
-    unsigned int dev_id = current_device_id();
-    check_id(dev_id);
-    return *_devs[dev_id];
-  }
-  device_ext &cpu_device() const {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    if (_cpu_device == -1) {
-      throw std::runtime_error("[SYCLcompat] No valid cpu device");
-    } else {
-      return *_devs[_cpu_device];
-    }
-  }
-  device_ext &get_device(unsigned int id) const {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    check_id(id);
-    return *_devs[id];
-  }
-  unsigned int current_device_id() const {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto it = _thread2dev_map.find(get_tid());
-    if (it != _thread2dev_map.end())
-      return it->second;
-    return _default_device_id;
-  }
-
-  /// Select device with a device ID.
-  /// \param [in] id The id of the device which can
-  /// be obtained through get_device_id(const sycl::device).
-  void select_device(unsigned int id) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    check_id(id);
-    _thread2dev_map[get_tid()] = id;
-  }
-  unsigned int device_count() { return _devs.size(); }
-
-  unsigned int get_device_id(const sycl::device &dev) {
-    if (!_devs.size()) {
-      throw std::runtime_error(
-          "[SYCLcompat] No SYCL devices found in the device list. Device list "
-          "may have been filtered by syclcompat::filter_device");
-    }
-    unsigned int id = 0;
-    for (auto dev_item : _devs) {
-      if (*dev_item == dev) {
-        return id;
-      }
-      id++;
-    }
-    throw std::runtime_error("[SYCLcompat] The device[" +
-                             dev.get_info<sycl::info::device::name>() +
-                             "] is filtered out by syclcompat::filter_device "
-                             "in current device list!");
-  }
-
-  /// List all the devices with its id in dev_mgr.
-  void list_devices() const {
-    for (size_t i = 0; i < _devs.size(); ++i) {
-      std::cout << "Device " << i << ": "
-                << _devs[i]->get_info<sycl::info::device::name>() << std::endl;
-    }
-  }
-
-  /// Filter out devices; only keep the device whose name contains one of the
-  /// subname in \p dev_subnames.
-  /// May break device id mapping and change current device. It's better to be
-  /// called before other SYCLcompat/SYCL APIs.
-  void filter(const std::vector<std::string> &dev_subnames) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto iter = _devs.begin();
-    while (iter != _devs.end()) {
-      std::string dev_name = (*iter)->get_info<sycl::info::device::name>();
-      bool matched = false;
-      for (const auto &name : dev_subnames) {
-        if (dev_name.find(name) != std::string::npos) {
-          matched = true;
-          break;
-        }
-      }
-      if (matched)
-        ++iter;
-      else
-        iter = _devs.erase(iter);
-    }
-    _cpu_device = -1;
-    for (unsigned i = 0; i < _devs.size(); ++i) {
-      if (_devs[i]->is_cpu()) {
-        _cpu_device = i;
-        break;
-      }
-    }
-    _thread2dev_map.clear();
-#ifdef SYCLCOMPAT_VERBOSE
-    list_devices();
-#endif
-  }
-
-  /// Select device with a Device Selector
-  /// \param selector device selector to get the device id from. Defaults to
-  /// sycl::gpu_selector_v
-  template <class DeviceSelector>
-  std::enable_if_t<
-      std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
-  select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
-    sycl::device selected_device = sycl::device(selector);
-    unsigned int selected_device_id = get_device_id(selected_device);
-    select_device(selected_device_id);
-  }
-
-  /// Returns the instance of device manager singleton.
-  static dev_mgr &instance() {
-    static dev_mgr d_m;
-    return d_m;
-  }
-  dev_mgr(const dev_mgr &) = delete;
-  dev_mgr &operator=(const dev_mgr &) = delete;
-  dev_mgr(dev_mgr &&) = delete;
-  dev_mgr &operator=(dev_mgr &&) = delete;
-
-private:
-  mutable std::mutex m_mutex;
-
-  dev_mgr() {
-    sycl::device default_device = sycl::device(sycl::default_selector_v);
-    _devs.push_back(std::make_shared<device_ext>(default_device));
-
-    std::vector<sycl::device> sycl_all_devs =
-        sycl::device::get_devices(sycl::info::device_type::all);
-    // Collect other devices except for the default device.
-    if (default_device.is_cpu())
-      _cpu_device = 0;
-    for (auto &dev : sycl_all_devs) {
-      if (dev == default_device) {
-        continue;
-      }
-      _devs.push_back(std::make_shared<device_ext>(dev));
-      if (_cpu_device == -1 && dev.is_cpu()) {
-        _cpu_device = _devs.size() - 1;
-      }
-    }
-#ifdef SYCLCOMPAT_VERBOSE
-    list_devices();
-#endif
-  }
-  void check_id(unsigned int id) const {
-    if (id >= _devs.size()) {
-      throw std::runtime_error("invalid device id");
-    }
-  }
-  std::vector<std::shared_ptr<device_ext>> _devs;
-  /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
-  /// thread id in _thread2dev_map, which means default device should be used
-  /// for the current thread.
-  const unsigned int _default_device_id = 0;
-  /// thread-id to device-id map.
-  std::map<unsigned int, unsigned int> _thread2dev_map;
-  int _cpu_device = -1;
-};
-
-} // namespace detail
-
-static inline sycl::queue create_queue(bool print_on_async_exceptions = false,
-                                       bool in_order = true) {
-  return *detail::dev_mgr::instance().current_device().create_queue(
-      print_on_async_exceptions, in_order);
-}
-
-/// Util function to get the default queue of current device in
-/// device manager.
-static inline sycl::queue get_default_queue() {
-  return *detail::dev_mgr::instance().current_device().default_queue();
-}
-
-/// Util function to change the default queue of the current device in the
-/// device manager
-/// If the device extension saved queue is the default queue,
-/// the previous saved queue will be overwritten as well.
-/// This function will be blocking if there are submitted kernels in the
-/// previous default queue.
-/// @param q New user-defined queue
-static inline void set_default_queue(const sycl::queue &q) {
-  detail::dev_mgr::instance().current_device().set_default_queue(q);
-}
-
-static inline void wait(sycl::queue q = get_default_queue()) { q.wait(); }
-
-static inline void wait_and_throw(sycl::queue q = get_default_queue()) {
-  q.wait_and_throw();
-}
-
-/// Util function to get the id of current device in
-/// device manager.
-static inline unsigned int get_current_device_id() {
-  return detail::dev_mgr::instance().current_device_id();
-}
-
-/// Util function to get the current device.
-static inline device_ext &get_current_device() {
-  return detail::dev_mgr::instance().current_device();
-}
-
-/// Util function to get a device by id.
-static inline device_ext &get_device(unsigned int id) {
-  return detail::dev_mgr::instance().get_device(id);
-}
-
-/// Util function to get the context of the default queue of current
-/// device in device manager.
-static inline sycl::context get_default_context() {
-  return get_current_device().get_context();
-}
-
-/// Util function to get a CPU device.
-static inline device_ext &cpu_device() {
-  return detail::dev_mgr::instance().cpu_device();
-}
-
-/// Filter out devices; only keep the device whose name contains one of the
-/// subname in \p dev_subnames.
-/// May break device id mapping and change current device. It's better to be
-/// called before other SYCLcompat or SYCL APIs.
-static inline void filter_device(const std::vector<std::string> &dev_subnames) {
-  detail::dev_mgr::instance().filter(dev_subnames);
-}
-
-/// List all the devices with its id in dev_mgr.
-static inline void list_devices() {
-  detail::dev_mgr::instance().list_devices();
-}
-
-static inline unsigned int select_device(unsigned int id) {
-  detail::dev_mgr::instance().select_device(id);
-  return id;
-}
-
-template <class DeviceSelector>
-static inline std::enable_if_t<
-    std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
-select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
-  detail::dev_mgr::instance().select_device(selector);
-}
-
-static inline unsigned int get_device_id(const sycl::device &dev) {
-  return detail::dev_mgr::instance().get_device_id(dev);
-}
-
-static inline unsigned int device_count() {
-  return detail::dev_mgr::instance().device_count();
-}
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/dims.hpp b/sycl/include/syclcompat/dims.hpp
deleted file mode 100644
index 3af6c15f96d2a..0000000000000
--- a/sycl/include/syclcompat/dims.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  dims.hpp
- *
- *  Description:
- *    dim3 functionality for SYCLcompat
- **************************************************************************/
-
-#pragma once
-
-#include <stdexcept>
-#include <tuple>
-
-#include <sycl/range.hpp>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-
-class dim3 {
-public:
-  unsigned int x, y, z;
-
-  dim3(const sycl::range<3> &r) : x(r[2]), y(r[1]), z(r[0]) {}
-
-  dim3(const sycl::range<2> &r) : x(r[1]), y(r[0]), z(1) {}
-
-  dim3(const sycl::range<1> &r) : x(r[0]), y(1), z(1) {}
-
-  constexpr dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1)
-      : x(x), y(y), z(z) {}
-
-  constexpr size_t size() const { return x * y * z; }
-
-  operator sycl::range<3>() const { return sycl::range<3>(z, y, x); }
-  operator sycl::range<2>() const {
-    if (z != 1)
-      throw std::invalid_argument(
-          "Attempting to convert a 3D dim3 into sycl::range<2>");
-    return sycl::range<2>(y, x);
-  }
-  operator sycl::range<1>() const {
-    if (z != 1 || y != 1)
-      throw std::invalid_argument(
-          "Attempting to convert a 2D or 3D dim3 into sycl::range<1>");
-    return sycl::range<1>(x);
-  }
-}; // namespace dim3
-
-inline dim3 operator*(const dim3 &a, const dim3 &b) {
-  return dim3{a.x * b.x, a.y * b.y, a.z * b.z};
-}
-
-inline dim3 operator+(const dim3 &a, const dim3 &b) {
-  return dim3{a.x + b.x, a.y + b.y, a.z + b.z};
-}
-
-inline dim3 operator-(const dim3 &a, const dim3 &b) {
-  return dim3{a.x - b.x, a.y - b.y, a.z - b.z};
-}
-
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/group_utils.hpp b/sycl/include/syclcompat/group_utils.hpp
deleted file mode 100644
index 52376fe7b45d7..0000000000000
--- a/sycl/include/syclcompat/group_utils.hpp
+++ /dev/null
@@ -1,1269 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  group_utils.hpp
- *
- *  Description:
- *    Group util functionality for the SYCL compatibility extension
- **************************************************************************/
-
-// The original source was under the license below:
-//==---- group_utils.hpp ------------------*- C++ -*--------------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===------------------------------------------------------------------===//
-
-#pragma once
-
-#include <iterator>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-
-#include <syclcompat/defs.hpp>
-#include <syclcompat/math.hpp>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-namespace group {
-namespace detail {
-
-template <typename... _Args>
-constexpr auto __reduce_over_group(_Args... __args) {
-  return sycl::reduce_over_group(__args...);
-}
-
-template <typename... _Args> constexpr auto __group_broadcast(_Args... __args) {
-  return sycl::group_broadcast(__args...);
-}
-
-template <typename... _Args>
-constexpr auto __exclusive_scan_over_group(_Args... __args) {
-  return sycl::exclusive_scan_over_group(__args...);
-}
-
-template <typename... _Args>
-constexpr auto __inclusive_scan_over_group(_Args... __args) {
-  return sycl::inclusive_scan_over_group(__args...);
-}
-
-template <typename Item, typename T, class BinaryOperation,
-          class GroupPrefixCallbackOperation>
-__syclcompat_inline__ T
-exclusive_scan(const Item &item, T input, BinaryOperation binary_op,
-               GroupPrefixCallbackOperation &prefix_callback_op) {
-  T group_aggregate;
-
-  T output =
-      detail::__exclusive_scan_over_group(item.get_group(), input, binary_op);
-  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
-    group_aggregate = binary_op(output, input);
-  }
-
-  group_aggregate = detail::__group_broadcast(
-      item.get_group(), group_aggregate, item.get_local_range().size() - 1);
-
-  T group_prefix = prefix_callback_op(group_aggregate);
-  if (item.get_local_linear_id() == 0) {
-    output = group_prefix;
-  } else {
-    output = binary_op(group_prefix, output);
-  }
-
-  return output;
-}
-
-typedef uint16_t digit_counter_type;
-typedef uint32_t packed_counter_type;
-
-template <int N, int CURRENT_VAL = N, int COUNT = 0> struct log2 {
-  enum { VALUE = log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };
-};
-
-template <int N, int COUNT> struct log2<N, 0, COUNT> {
-  enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 };
-};
-
-template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
-  }
-
-  radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  template <typename Item, int VALUES_PER_THREAD>
-  __syclcompat_inline__ void
-  rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
-            int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {
-
-    digit_counter_type thread_prefixes[VALUES_PER_THREAD];
-    digit_counter_type *digit_counters[VALUES_PER_THREAD];
-    digit_counter_type *buffer =
-        reinterpret_cast<digit_counter_type *>(_local_memory);
-    auto g = item.get_group();
-    reset_local_memory(item);
-
-    sycl::group_barrier(g, sycl::memory_scope::work_group);
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      uint32_t digit =
-          ::syclcompat::detail::bfe(keys[i], current_bit, num_bits);
-      uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
-      uint32_t counter_lane = digit & (COUNTER_LANES - 1);
-
-      if (DESCENDING) {
-        sub_counter = PACKING_RATIO - 1 - sub_counter;
-        counter_lane = COUNTER_LANES - 1 - counter_lane;
-      }
-
-      digit_counters[i] =
-          &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
-                  item.get_local_linear_id() * PACKING_RATIO + sub_counter];
-      thread_prefixes[i] = *digit_counters[i];
-      *digit_counters[i] = thread_prefixes[i] + 1;
-    }
-
-    sycl::group_barrier(g, sycl::memory_scope::work_group);
-
-    scan_counters(item);
-
-    sycl::group_barrier(g, sycl::memory_scope::work_group);
-
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      ranks[i] = thread_prefixes[i] + *digit_counters[i];
-    }
-  }
-
-private:
-  template <typename Item>
-  __syclcompat_inline__ void reset_local_memory(const Item &item) {
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
-    }
-  }
-
-  template <typename Item>
-  __syclcompat_inline__ packed_counter_type upsweep(const Item &item) {
-    packed_counter_type sum = 0;
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
-      cached_segment[i] =
-          ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
-    }
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      sum += cached_segment[i];
-    }
-
-    return sum;
-  }
-
-  template <typename Item>
-  __syclcompat_inline__ void
-  exclusive_downsweep(const Item &item, packed_counter_type raking_partial) {
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-    packed_counter_type sum = raking_partial;
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      packed_counter_type value = cached_segment[i];
-      cached_segment[i] = sum;
-      sum += value;
-    }
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
-          cached_segment[i];
-    }
-  }
-
-  struct prefix_callback {
-    __syclcompat_inline__ packed_counter_type
-    operator()(packed_counter_type block_aggregate) {
-      packed_counter_type block_prefix = 0;
-
-#pragma unroll
-      for (int packed = 1; packed < PACKING_RATIO; packed++) {
-        block_prefix += block_aggregate
-                        << (sizeof(digit_counter_type) * 8 * packed);
-      }
-
-      return block_prefix;
-    }
-  };
-
-  template <typename Item>
-  __syclcompat_inline__ void scan_counters(const Item &item) {
-    packed_counter_type raking_partial = upsweep(item);
-
-    prefix_callback callback;
-    packed_counter_type exclusive_partial = exclusive_scan(
-        item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
-        callback);
-
-    exclusive_downsweep(item, exclusive_partial);
-  }
-
-private:
-  static constexpr int PACKING_RATIO =
-      sizeof(packed_counter_type) / sizeof(digit_counter_type);
-  static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
-  static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
-  static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
-  static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;
-
-  packed_counter_type cached_segment[PADDED_COUNTER_LANES];
-  uint8_t *_local_memory;
-};
-
-template <typename T, typename U> struct base_traits {
-
-  static __syclcompat_inline__ U twiddle_in(U key) {
-    throw std::runtime_error("Not implemented");
-  }
-  static __syclcompat_inline__ U twiddle_out(U key) {
-    throw std::runtime_error("Not implemented");
-  }
-};
-
-template <typename U> struct base_traits<uint32_t, U> {
-  static __syclcompat_inline__ U twiddle_in(U key) { return key; }
-  static __syclcompat_inline__ U twiddle_out(U key) { return key; }
-};
-
-template <typename U> struct base_traits<int, U> {
-  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
-  static __syclcompat_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
-  static __syclcompat_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
-};
-
-template <typename U> struct base_traits<float, U> {
-  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
-  static __syclcompat_inline__ U twiddle_in(U key) {
-    U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
-    return key ^ mask;
-  }
-  static __syclcompat_inline__ U twiddle_out(U key) {
-    U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
-    return key ^ mask;
-  }
-};
-
-template <typename T> struct traits : base_traits<T, T> {};
-template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
-template <> struct traits<int> : base_traits<int, uint32_t> {};
-template <> struct traits<float> : base_traits<float, uint32_t> {};
-
-template <int N> struct power_of_two {
-  enum { VALUE = ((N & (N - 1)) == 0) };
-};
-
-__syclcompat_inline__ uint32_t shr_add(uint32_t x, uint32_t shift,
-                                       uint32_t addend) {
-  return (x >> shift) + addend;
-}
-
-} // namespace detail
-
-/// Rearranging data partitioned across a work-group.
-///
-/// \tparam T The type of the data elements.
-/// \tparam ElementsPerWorkItem The number of data elements assigned to a
-/// work-item.
-template <typename T, size_t ElementsPerWorkItem> class exchange {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    size_t padding_values =
-        (INSERT_PADDING)
-            ? ((group_threads * ElementsPerWorkItem) >> LOG_LOCAL_MEMORY_BANKS)
-            : 0;
-    return (group_threads * ElementsPerWorkItem + padding_values) * sizeof(T);
-  }
-
-  exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  // TODO: Investigate if padding is required for performance,
-  // and if specializations are required for specific target hardware.
-  static size_t adjust_by_padding(size_t offset) {
-
-    if constexpr (INSERT_PADDING) {
-      offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
-    }
-    return offset;
-  }
-
-  struct blocked_offset {
-    template <typename Item> size_t operator()(Item item, size_t i) {
-      size_t offset = item.get_local_linear_id() * ElementsPerWorkItem + i;
-      return adjust_by_padding(offset);
-    }
-  };
-
-  struct striped_offset {
-    template <typename Item> size_t operator()(Item item, size_t i) {
-      size_t offset = i * item.get_local_range(2) * item.get_local_range(1) *
-                          item.get_local_range(0) +
-                      item.get_local_linear_id();
-      return adjust_by_padding(offset);
-    }
-  };
-
-  template <typename Iterator> struct scatter_offset {
-    Iterator begin;
-    scatter_offset(const int (&ranks)[ElementsPerWorkItem]) {
-      begin = std::begin(ranks);
-    }
-    template <typename Item> size_t operator()(Item item, size_t i) const {
-      // iterator i is expected to be within bounds [0,VALUES_PER_THREAD)
-      return adjust_by_padding(begin[i]);
-    }
-  };
-
-  /// Inplace rearrange elements from blocked order to striped order.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// blocked \p input across the work-group is:
-  ///
-  ///   {[0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511]}.
-  ///
-  /// The striped order output is:
-  ///
-  ///   {[0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511]}.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem]) {
-    striped_offset get_striped_offset;
-    blocked_offset get_blocked_offset;
-    helper_exchange(item, input, input, get_blocked_offset, get_striped_offset);
-  }
-
-  /// Inplace rearrange elements from striped order to blocked order.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// striped \p input across the work-group is:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// The blocked order output is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem]) {
-    blocked_offset get_blocked_offset;
-    striped_offset get_striped_offset;
-    helper_exchange(item, input, input, get_striped_offset, get_blocked_offset);
-  }
-
-  /// Rearrange elements from blocked order to striped order.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// blocked \p input across the work-group is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// The striped order output is:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param output The corresponding output data of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  blocked_to_striped(Item item, T (&input)[ElementsPerWorkItem],
-                     T (&output)[ElementsPerWorkItem]) {
-    striped_offset get_striped_offset;
-    blocked_offset get_blocked_offset;
-    helper_exchange(item, input, output, get_blocked_offset,
-                    get_striped_offset);
-  }
-
-  /// Rearrange elements from striped order to blocked order.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// striped \p input across the work-group is:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// The blocked order output is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param output The corresponding output data of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  striped_to_blocked(Item item, T (&input)[ElementsPerWorkItem],
-                     T (&output)[ElementsPerWorkItem]) {
-    blocked_offset get_blocked_offset;
-    striped_offset get_striped_offset;
-    helper_exchange(item, input, output, get_striped_offset,
-                    get_blocked_offset);
-  }
-
-  /// Inplace exchanges data items annotated by rank into blocked arrangement.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// striped \p input across the work-group is:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// The rank across the work-group is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// The blocked order output is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param ranks The corresponding rank annotation of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  scatter_to_blocked(Item item, T (&input)[ElementsPerWorkItem],
-                     int (&ranks)[ElementsPerWorkItem]) {
-    scatter_offset<const int *> get_scatter_offset(ranks);
-    blocked_offset get_blocked_offset;
-    helper_exchange(item, input, input, get_scatter_offset, get_blocked_offset);
-  }
-
-  /// Inplace exchanges data items annotated by rank into striped arrangement.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// blocked \p input across the work-group is:
-  ///
-  ///   { [0, 1, 2, 3], [4, 5, 6, 7], ..., [508, 509, 510, 511] }.
-  ///
-  /// The rank across the work-group is:
-  ///
-  ///   { [16, 20, 24, 28], [32, 36, 40, 44], ..., [499, 503, 507, 511] }.
-  ///
-  /// The striped order output of each work-item will be:
-  ///
-  ///   { [0, 128, 256, 384], [1, 129, 257, 385], ..., [127, 255, 383, 511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param ranks The corresponding rank annotation of each work-item.
-  template <typename Item>
-  __syclcompat_inline__ void
-  scatter_to_striped(Item item, T (&input)[ElementsPerWorkItem],
-                     int (&ranks)[ElementsPerWorkItem]) {
-    scatter_offset<const int *> get_scatter_offset(ranks);
-    striped_offset get_striped_offset;
-    helper_exchange(item, input, input, get_scatter_offset, get_striped_offset);
-  }
-
-private:
-  template <typename Item, typename offsetFunctorTypeFW,
-            typename offsetFunctorTypeRV>
-  __syclcompat_inline__ void
-  helper_exchange(Item item, T (&input)[ElementsPerWorkItem],
-                  T (&output)[ElementsPerWorkItem],
-                  offsetFunctorTypeFW &offset_functor_fw,
-                  offsetFunctorTypeRV &offset_functor_rv) {
-    T *buffer = reinterpret_cast<T *>(_local_memory);
-#pragma unroll
-    for (size_t i = 0; i < ElementsPerWorkItem; i++) {
-      size_t offset = offset_functor_fw(item, i);
-      buffer[offset] = input[i];
-    }
-    sycl::group_barrier(item.get_group());
-#pragma unroll
-    for (size_t i = 0; i < ElementsPerWorkItem; i++) {
-      size_t offset = offset_functor_rv(item, i);
-      output[i] = buffer[offset];
-    }
-  }
-
-  static constexpr int LOG_LOCAL_MEMORY_BANKS = 4;
-  static constexpr bool INSERT_PADDING =
-      (ElementsPerWorkItem > 4) &&
-      (detail::power_of_two<ElementsPerWorkItem>::VALUE);
-
-  uint8_t *_local_memory;
-};
-
-/// The work-group wide radix sort to sort integer data elements
-/// assigned to all work-items in the work-group.
-///
-/// \tparam T The type of the data elements.
-/// \tparam ElementsPerWorkItem The number of data elements assigned to
-/// a work-item.
-/// \tparam RADIX_BITS The number of radix bits per digit place.
-template <typename T, int ElementsPerWorkItem, int RADIX_BITS = 4>
-class group_radix_sort {
-  uint8_t *_local_memory;
-
-public:
-  group_radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  static size_t get_local_memory_size(size_t group_threads) {
-    size_t ranks_size =
-        detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
-    size_t exchange_size =
-        exchange<T, ElementsPerWorkItem>::get_local_memory_size(group_threads);
-    return sycl::max(ranks_size, exchange_size);
-  }
-
-private:
-  template <typename Item, bool DESCENDING>
-  __syclcompat_inline__ void
-  helper_sort(const Item &item, T (&keys)[ElementsPerWorkItem],
-              int begin_bit = 0, int end_bit = 8 * sizeof(T),
-              bool is_striped = false) {
-
-    uint32_t(&unsigned_keys)[ElementsPerWorkItem] =
-        reinterpret_cast<uint32_t(&)[ElementsPerWorkItem]>(keys);
-
-#pragma unroll
-    for (int i = 0; i < ElementsPerWorkItem; ++i) {
-      unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
-    }
-
-    for (int i = begin_bit; i < end_bit; i += RADIX_BITS) {
-      int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);
-
-      int ranks[ElementsPerWorkItem];
-      detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
-          .template rank_keys<Item, ElementsPerWorkItem>(item, unsigned_keys,
-                                                         ranks, i, pass_bits);
-
-      sycl::group_barrier(item.get_group());
-
-      bool last_iter = i + RADIX_BITS >= end_bit;
-      if (last_iter && is_striped) {
-        exchange<T, ElementsPerWorkItem>(_local_memory)
-            .scatter_to_striped(item, keys, ranks);
-
-      } else {
-        exchange<T, ElementsPerWorkItem>(_local_memory)
-            .scatter_to_blocked(item, keys, ranks);
-      }
-
-      sycl::group_barrier(item.get_group());
-    }
-
-#pragma unroll
-    for (int i = 0; i < ElementsPerWorkItem; ++i) {
-      unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
-    }
-  }
-
-public:
-  /// Performs an ascending work-group wide radix sort over a blocked
-  /// arrangement of input elements.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }.
-  ///
-  /// The ascending order output is:
-  ///
-  ///   { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param begin_bit The beginning (least-significant) bit index needed for
-  /// key comparison.
-  /// \param end_bit The past-the-end (most-significant) bit
-  /// index needed for key comparison.
-  template <typename Item>
-  __syclcompat_inline__ void
-  sort(const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0,
-       int end_bit = 8 * sizeof(T)) {
-    helper_sort<Item, /*DESCENDING=*/false>(item, input, begin_bit, end_bit);
-  }
-
-  /// Performs an descending work-group wide radix sort over a blocked
-  /// arrangement of input elements.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }.
-  ///
-  /// The descending order output is:
-  ///
-  ///   { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param begin_bit The beginning (least-significant) bit index needed for
-  /// key comparison.
-  /// \param end_bit The past-the-end (most-significant) bit
-  /// index needed for key comparison.
-  template <typename Item>
-  __syclcompat_inline__ void
-  sort_descending(const Item &item, T (&input)[ElementsPerWorkItem],
-                  int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
-    helper_sort<Item, /*DESCENDING=*/true>(item, input, begin_bit, end_bit);
-  }
-
-  /// Performs an ascending radix sort across a blocked arrangement of input
-  /// elements, leaving them in a striped arrangement.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }.
-  ///
-  /// The corresponding output of each work-item will be:
-  ///
-  ///   { [0,128,256,384], [1,129,257,385], [2,130,258,386], ...,
-  ///   [127,255,383,511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param begin_bit The beginning (least-significant) bit index needed for
-  /// key comparison.
-  /// \param end_bit The past-the-end (most-significant) bit
-  /// index needed for key comparison.
-  template <typename Item>
-  __syclcompat_inline__ void
-  sort_blocked_to_striped(const Item &item, T (&input)[ElementsPerWorkItem],
-                          int begin_bit = 0, int end_bit = 8 * sizeof(T)) {
-    helper_sort<Item, /*DESCENDING=*/false>(item, input, begin_bit, end_bit,
-                                            /*is_striped=*/true);
-  }
-
-  /// Performs an descending radix sort across a blocked arrangement of input
-  /// elements, leaving them in a striped arrangement.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }.
-  ///
-  /// The descending striped order output is:
-  ///
-  ///   { [0,128,256,384], [1,129,257,385], [2,130,258,386], ...,
-  ///   [127,255,383,511] }.
-  ///
-  /// \tparam Item The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param begin_bit The beginning (least-significant) bit index needed for
-  /// key comparison.
-  /// \param end_bit The past-the-end (most-significant) bit
-  /// index needed for key comparison.
-  template <typename Item>
-  __syclcompat_inline__ void sort_descending_blocked_to_striped(
-      const Item &item, T (&input)[ElementsPerWorkItem], int begin_bit = 0,
-      int end_bit = 8 * sizeof(T)) {
-    helper_sort<Item, /*DESCENDING=*/true>(item, input, begin_bit, end_bit,
-                                           /*is_striped=*/true);
-  }
-};
-
-/// Load linear segment items into block format across threads
-/// Helper for Block Load
-enum load_algorithm {
-  BLOCK_LOAD_DIRECT,
-  BLOCK_LOAD_STRIPED,
-};
-
-/// Load a linear segment of elements into a blocked arrangement across the
-/// work-group.
-///
-/// \tparam T The data type to load.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam InputIteratorT  The random-access iterator type for input \iterator.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param input_iter The work-group's base input iterator for loading from.
-/// \param data Data to load.
-template <typename T, size_t ElementsPerWorkItem, typename InputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void load_direct_blocked(const ItemT &item,
-                                               InputIteratorT input_iter,
-                                               T (&data)[ElementsPerWorkItem]) {
-  size_t work_item_id = item.get_local_linear_id();
-#pragma unroll
-  for (size_t i = 0; i < ElementsPerWorkItem; i++)
-    data[i] = input_iter[(work_item_id * ElementsPerWorkItem) + i];
-}
-
-/// Load a linear segment of elements into a striped arrangement across the
-/// work-group.
-///
-/// \tparam T The data type to load.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam InputIteratorT  The random-access iterator type for input \iterator.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param input_iter The work-group's base input iterator for loading from.
-/// \param data Data to load.
-template <typename T, int ElementsPerWorkItem, typename InputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void load_direct_striped(const ItemT &item,
-                                               InputIteratorT input_iter,
-                                               T (&data)[ElementsPerWorkItem]) {
-  size_t work_group_size = item.get_group().get_local_linear_range();
-  size_t work_item_id = item.get_local_linear_id();
-#pragma unroll
-  for (size_t i = 0; i < ElementsPerWorkItem; i++)
-    data[i] = input_iter[work_item_id + i * work_group_size];
-}
-
-/// Load a linear segment of elements into a blocked arrangement across the
-/// work-group, guarded by range.
-///
-/// \tparam T The data type to load.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam InputIteratorT  The random-access iterator type for input \iterator.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param input_iter The work-group's base input iterator for loading from.
-/// \param data Data to load.
-/// \param valid_items Number of valid items to load
-template <typename T, size_t ElementsPerWorkItem, typename InputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-load_direct_blocked(const ItemT &item, InputIteratorT input_iter,
-                    T (&data)[ElementsPerWorkItem], int valid_items) {
-  size_t work_item_id = item.get_local_linear_id();
-#pragma unroll
-  for (size_t i = 0; i < ElementsPerWorkItem; i++)
-    if ((work_item_id * ElementsPerWorkItem) + i < valid_items)
-      data[i] = input_iter[(work_item_id * ElementsPerWorkItem) + i];
-}
-
-/// Load a linear segment of elements into a striped arrangement across the
-/// work-group, guarded by range.
-///
-/// \tparam T The data type to load.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam InputIteratorT  The random-access iterator type for input \iterator.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param input_iter The work-group's base input iterator for loading from.
-/// \param data Data to load.
-/// \param valid_items Number of valid items to load
-template <typename T, int ElementsPerWorkItem, typename InputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-load_direct_striped(const ItemT &item, InputIteratorT input_iter,
-                    T (&data)[ElementsPerWorkItem], int valid_items) {
-  size_t work_group_size = item.get_group().get_local_linear_range();
-  size_t work_item_id = item.get_local_linear_id();
-#pragma unroll
-  for (size_t i = 0; i < ElementsPerWorkItem; i++)
-    if (work_item_id + (i * work_group_size) < valid_items)
-      data[i] = input_iter[work_item_id + i * work_group_size];
-}
-
-/// Store a blocked arrangement of items across a work-group into a linear
-/// segment of items.
-///
-/// \tparam T The data type to store.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam OutputIteratorT  The random-access iterator type for output.
-/// \iterator.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param output_iter The work-group's base output iterator for writing.
-/// \param data Data to store.
-template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-store_direct_blocked(const ItemT &item, OutputIteratorT output_iter,
-                     T (&data)[ElementsPerWorkItem]) {
-  size_t work_item_id = item.get_local_linear_id();
-  OutputIteratorT work_item_iter =
-      output_iter + (work_item_id * ElementsPerWorkItem);
-#pragma unroll
-  for (size_t i = 0; i < ElementsPerWorkItem; i++)
-    work_item_iter[i] = data[i];
-}
-
-/// Store a striped arrangement of items across a work-group into a linear
-/// segment of items.
-///
-/// \tparam T The data type to store.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam OutputIteratorT  The random-access iterator type for output.
-/// \iterator.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param output_iter The work-group's base output iterator for writing.
-/// \param items Data to store.
-template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-store_direct_striped(const ItemT &item, OutputIteratorT output_iter,
-                     T (&data)[ElementsPerWorkItem]) {
-  size_t work_group_size = item.get_group().get_local_linear_range();
-  size_t work_item_id = item.get_local_linear_id();
-  OutputIteratorT work_item_iter = output_iter + work_item_id;
-#pragma unroll
-  for (size_t i = 0; i < ElementsPerWorkItem; i++)
-    work_item_iter[i * work_group_size] = data[i];
-}
-
-/// Store a blocked arrangement of items across a work-group into a linear
-/// segment of items, guarded by range.
-///
-/// \tparam T The data type to store.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam OutputIteratorT  The random-access iterator type for output.
-/// \iterator.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param output_iter The work-group's base output iterator for writing.
-/// \param data Data to store.
-/// \param valid_items Number of valid items to load
-template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-store_direct_blocked(const ItemT &item, OutputIteratorT output_iter,
-                     T (&data)[ElementsPerWorkItem], size_t valid_items) {
-  size_t work_item_id = item.get_local_linear_id();
-  OutputIteratorT work_item_iter =
-      output_iter + (work_item_id * ElementsPerWorkItem);
-#pragma unroll
-  for (size_t i = 0; i < ElementsPerWorkItem; i++)
-    if (i + (work_item_id * ElementsPerWorkItem) < valid_items)
-      work_item_iter[i] = data[i];
-}
-
-/// Store a striped arrangement of items across a work-group into a linear
-/// segment of items, guarded by range.
-///
-/// \tparam T The data type to store.
-/// \tparam ElementsPerWorkItem The number of consecutive elements partitioned
-/// onto each work-item.
-/// \tparam OutputIteratorT  The random-access iterator type for output.
-/// \iterator.
-/// \tparam ItemT The sycl::nd_item index space class.
-/// \param item The calling work-item.
-/// \param output_iter The work-group's base output iterator for writing.
-/// \param items Data to store.
-/// \param valid_items Number of valid items to load
-template <typename T, size_t ElementsPerWorkItem, typename OutputIteratorT,
-          typename ItemT>
-__syclcompat_inline__ void
-store_direct_striped(const ItemT &item, OutputIteratorT output_iter,
-                     T (&data)[ElementsPerWorkItem], size_t valid_items) {
-  size_t work_group_size = item.get_group().get_local_linear_range();
-  size_t work_item_id = item.get_local_linear_id();
-  OutputIteratorT work_item_iter = output_iter + work_item_id;
-#pragma unroll
-  for (size_t i = 0; i < ElementsPerWorkItem; i++)
-    if ((i * work_group_size) + work_item_id < valid_items)
-      work_item_iter[i * work_group_size] = data[i];
-}
-
-/// Enumerates alternative algorithms for syclcompat::group::group_load to read
-/// a linear segment of data from memory into a blocked arrangement across a
-/// work-group.
-enum class group_load_algorithm {
-  /// A blocked arrangement of data is read directly from memory.
-  blocked,
-
-  /// A striped arrangement of data is read directly from memory.
-  striped
-};
-
-/// Provide methods for loading a linear segment of items from memory into a
-/// blocked arrangement across a work-group.
-///
-/// \tparam T The input data type.
-/// \tparam ElementsPerWorkItem The number of data elements assigned to a
-/// work-item.
-/// \tparam LoadAlgorithm The data movement strategy, default is blocked.
-template <typename T, size_t ElementsPerWorkItem,
-          group_load_algorithm LoadAlgorithm = group_load_algorithm::blocked>
-class group_load {
-public:
-  static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size) {
-    return 0;
-  }
-  group_load(uint8_t *) {}
-
-  /// Load a linear segment of items from memory.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511.
-  ///
-  /// The blocked order \p data of each work-item will be:
-  ///
-  ///   {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}.
-  ///
-  /// The striped order \p output of each work-item will be:
-  ///
-  ///   {[0,128,256,384], [1,129,257,385], ..., [127,255,383,511]}.
-  ///
-  /// \tparam ItemT The sycl::nd_item index space class.
-  /// \tparam InputIteratorT The random-access iterator type for input
-  /// \iterator.
-  /// \param item The work-item identifier.
-  /// \param input_iter The work-group's base input iterator for loading from.
-  /// \param data The data to load.
-  template <typename ItemT, typename InputIteratorT>
-  __syclcompat_inline__ void load(const ItemT &item, InputIteratorT input_iter,
-                                  T (&data)[ElementsPerWorkItem]) {
-    if constexpr (LoadAlgorithm == group_load_algorithm::blocked) {
-      load_direct_blocked<T, ElementsPerWorkItem, InputIteratorT, ItemT>(
-          item, input_iter, data);
-    } else if constexpr (LoadAlgorithm == group_load_algorithm::striped) {
-      load_direct_striped<T, ElementsPerWorkItem, InputIteratorT, ItemT>(
-          item, input_iter, data);
-    }
-  }
-
-  /// Load a linear segment of items from memory, guarded by range.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and
-  /// valid_items is 5, the \p input across the work-group is:
-  ///
-  ///   0, 1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511.
-  ///
-  /// The blocked order \p data of each work-item will be:
-  ///
-  ///   {[0,1,2,3], [4,?,?,?], ..., [?,?,?,?]}.
-  ///
-  /// The striped order \p output of each work-item will be:
-  ///
-  ///   {[0,?,?,?], [1,?,?,?], [2,?,?,?], [3,?,?,?] ..., [?,?,?,?]}.
-  ///
-  /// \tparam ItemT The sycl::nd_item index space class.
-  /// \tparam InputIteratorT The random-access iterator type for input
-  /// \iterator.
-  /// \param item The work-item identifier.
-  /// \param input_iter The work-group's base input iterator for loading from.
-  /// \param data The data to load.
-  /// \param valid_items Number of valid items to load
-  template <typename ItemT, typename InputIteratorT>
-  __syclcompat_inline__ void load(const ItemT &item, InputIteratorT input_iter,
-                                  T (&data)[ElementsPerWorkItem],
-                                  int valid_items) {
-    if constexpr (LoadAlgorithm == group_load_algorithm::blocked) {
-      load_direct_blocked<T, ElementsPerWorkItem, InputIteratorT, ItemT>(
-          item, input_iter, data, valid_items);
-    } else if constexpr (LoadAlgorithm == group_load_algorithm::striped) {
-      load_direct_striped<T, ElementsPerWorkItem, InputIteratorT, ItemT>(
-          item, input_iter, data, valid_items);
-    }
-  }
-};
-
-/// Enumerates alternative algorithms for syclcompat::group::group_load to write
-/// a blocked arrangement of items across a work-group to a linear segment of
-/// memory.
-enum class group_store_algorithm {
-  /// A blocked arrangement of data is written directly to memory.
-  blocked,
-
-  /// A striped arrangement of data is written directly to memory.
-  striped,
-};
-
-/// Provide methods for writing a blocked arrangement of elements partitioned
-/// across a work-group to a linear segment of memory.
-///
-/// \tparam T The output data type.
-/// \tparam ElementsPerWorkItem The number of data elements assigned to a
-/// work-item.
-/// \tparam StoreAlgorithm The data movement strategy, default is blocked.
-template <typename T, size_t ElementsPerWorkItem,
-          group_store_algorithm StoreAlgorithm = group_store_algorithm::blocked>
-class group_store {
-public:
-  static size_t get_local_memory_size([[maybe_unused]] size_t work_group_size) {
-    return 0;
-  }
-  group_store(uint8_t *) {}
-
-  /// Store items into a linear segment of memory.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and the
-  /// \p input across the work-group is:
-  ///
-  ///   {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}.
-  ///
-  /// The blocked order \p output will be:
-  ///
-  ///   1, 2, 3, 4, 5, 6, 7, ..., 508, 509, 510, 511.
-  ///
-  /// The striped order \p output will be:
-  ///
-  ///   0, 128, 256, 384, 1, 129, 257, 385, ..., 127, 255, 383, 511.
-  ///
-  /// \tparam ItemT The sycl::nd_item index space class.
-  /// \tparam OutputIteratorT The random-access iterator type for \p output
-  /// iterator.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param data The data to store.
-  template <typename ItemT, typename OutputIteratorT>
-  __syclcompat_inline__ void store(const ItemT &item,
-                                   OutputIteratorT output_iter,
-                                   T (&data)[ElementsPerWorkItem]) {
-    if constexpr (StoreAlgorithm == group_store_algorithm::blocked) {
-      store_direct_blocked<T, ElementsPerWorkItem, OutputIteratorT, ItemT>(
-          item, output_iter, data);
-    } else if constexpr (StoreAlgorithm == group_store_algorithm::striped) {
-      store_direct_striped<T, ElementsPerWorkItem, OutputIteratorT, ItemT>(
-          item, output_iter, data);
-    }
-  }
-
-  /// Store items into a linear segment of memory, guarded by range.
-  ///
-  /// Suppose 512 integer data elements partitioned across 128 work-items, where
-  /// each work-item owns 4 ( \p ElementsPerWorkItem ) data elements and
-  /// \p valid_items is 5, the \p output across the work-group is:
-  ///
-  ///   {[0,0,0,0], [0,0,0,0], ..., [0,0,0,0]}.
-  ///
-  /// The blocked order \p output will be:
-  ///
-  ///   0, 1, 2, 3, 4, 5, 0, 0, ..., 0, 0, 0, 0.
-  ///
-  /// The striped order \p output will be:
-  ///
-  ///   0, 4, 8, 12, 16, 0, 0, 0, ..., 0, 0, 0, 0.
-  ///
-  /// \tparam ItemT The sycl::nd_item index space class.
-  /// \tparam OutputIteratorT The random-access iterator type for \p output
-  /// iterator.
-  /// \param item The work-item identifier.
-  /// \param input The input data of each work-item.
-  /// \param data The data to store.
-  /// \param valid_items Number of valid items to load
-  template <typename ItemT, typename OutputIteratorT>
-  __syclcompat_inline__ void
-  store(const ItemT &item, OutputIteratorT output_iter,
-        T (&data)[ElementsPerWorkItem], size_t valid_items) {
-    if constexpr (StoreAlgorithm == group_store_algorithm::blocked) {
-      store_direct_blocked<T, ElementsPerWorkItem, OutputIteratorT, ItemT>(
-          item, output_iter, data, valid_items);
-    } else if constexpr (StoreAlgorithm == group_store_algorithm::striped) {
-      store_direct_striped<T, ElementsPerWorkItem, OutputIteratorT, ItemT>(
-          item, output_iter, data, valid_items);
-    }
-  }
-};
-
-/// The work-group wide shuffle operations that allow work-items to exchange
-/// data elements with other work-items within the same work-group.
-///
-/// \tparam T The type of the data elements.
-/// \tparam group_dim_0 The first dimension size of the work-group.
-/// \tparam group_dim_1 The second dimension size of the work-group.
-/// \tparam group_dim_2 The third dimension size of the work-group.
-template <typename T, int group_dim_0, int group_dim_1 = 1, int group_dim_2 = 1>
-class group_shuffle {
-  T *_local_memory = nullptr;
-  static constexpr size_t group_work_items =
-      group_dim_0 * group_dim_1 * group_dim_2;
-
-public:
-  static constexpr size_t get_local_memory_size(size_t work_group_size) {
-    return sizeof(T) * work_group_size;
-  }
-  group_shuffle(uint8_t *local_memory) : _local_memory((T *)local_memory) {}
-
-  /// Selects a value from a work-item at a given distance in the work-group
-  /// and stores the value in the output.
-  ///
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input from the calling work-item.
-  /// \param output The output where the selected data will be stored.
-  /// \param distance The distance of work-items to look ahead or behind in the
-  /// work-group.
-  template <typename ItemT>
-  __syclcompat_inline__ void select(const ItemT &item, T input, T &output,
-                                    int distance = 1) {
-    auto g = item.get_group();
-    size_t id = g.get_local_linear_id();
-    _local_memory[id] = input;
-
-    sycl::group_barrier(g, sycl::memory_scope::work_group);
-
-    const int target_id = static_cast<int>(id) + distance;
-    if ((target_id >= 0) && (target_id < group_work_items)) {
-      output = _local_memory[static_cast<size_t>(target_id)];
-    }
-  }
-  /// Selects a value from a work-item at a given distance in the work-group
-  /// and stores the value in the output, using a wrapped index to handle
-  /// overflow.
-  ///
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be selected.
-  /// \param output The output where the selected data will be stored.
-  /// \param distance The number of work-items to look ahead in the
-  /// work-group.
-  template <typename ItemT>
-  __syclcompat_inline__ void select2(const ItemT &item, T input, T &output,
-                                     unsigned int distance = 1) {
-    auto g = item.get_group();
-    size_t id = g.get_local_linear_id();
-    _local_memory[id] = input;
-
-    sycl::group_barrier(g, sycl::memory_scope::work_group);
-
-    unsigned int offset = id + distance;
-    if (offset >= group_work_items)
-      offset -= group_work_items;
-
-    output = _local_memory[offset];
-  }
-  /// Performs a shuffle operation to move data to the right across the
-  /// work-items, shifting elements in a work-item array by one position to the
-  /// right.
-  ///
-  /// \tparam ElementsPerWorkItem The number of data elements per work-item.
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be shuffled.
-  /// \param output The array that will store the shuffle result.
-  template <int ElementsPerWorkItem, typename ItemT>
-  __syclcompat_inline__ void shuffle_right(const ItemT &item,
-                                           T (&input)[ElementsPerWorkItem],
-                                           T (&output)[ElementsPerWorkItem]) {
-    auto g = item.get_group();
-    size_t id = g.get_local_linear_id();
-    _local_memory[id] = input[ElementsPerWorkItem - 1];
-
-    sycl::group_barrier(g, sycl::memory_scope::work_group);
-
-#pragma unroll
-    for (int index = ElementsPerWorkItem - 1; index > 0; --index)
-      output[index] = input[index - 1];
-
-    if (id > 0)
-      output[0] = _local_memory[id - 1];
-  }
-  /// Performs a shuffle operation to move data to the right across the
-  /// work-items, storing the suffix of the group after the shuffle operation.
-  ///
-  /// \tparam ElementsPerWorkItem The number of data elements per work-item.
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be shuffled.
-  /// \param output The array that will store the shuffle result.
-  /// \param group_suffix The suffix of the group after the shuffle.
-  template <int ElementsPerWorkItem, typename ItemT>
-  __syclcompat_inline__ void
-  shuffle_right(const ItemT &item, T (&input)[ElementsPerWorkItem],
-                T (&output)[ElementsPerWorkItem], T &group_suffix) {
-    shuffle_right(item, input, output);
-    group_suffix = _local_memory[group_work_items - 1];
-  }
-  /// Performs a shuffle operation to move data to the left across the
-  /// work-items, shifting elements in a work-item array by one position to the
-  /// left.
-  ///
-  /// \tparam ElementsPerWorkItem The number of data elements per work-item.
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be shuffled.
-  /// \param output The array that will store the shuffle result.
-  template <int ElementsPerWorkItem, typename ItemT>
-  __syclcompat_inline__ void shuffle_left(const ItemT &item,
-                                          T (&input)[ElementsPerWorkItem],
-                                          T (&output)[ElementsPerWorkItem]) {
-    auto g = item.get_group();
-    size_t id = g.get_local_linear_id();
-    _local_memory[id] = input[0];
-
-    sycl::group_barrier(g, sycl::memory_scope::work_group);
-
-#pragma unroll
-    for (int index = 0; index < ElementsPerWorkItem - 1; index++)
-      output[index] = input[index + 1];
-
-    if (id < group_work_items - 1)
-      output[ElementsPerWorkItem - 1] = _local_memory[id + 1];
-  }
-  /// Performs a shuffle operation to move data to the left across the
-  /// work-items, storing the prefix of the group before the shuffle operation.
-  ///
-  /// \tparam ElementsPerWorkItem The number of data elements per work-item.
-  /// \tparam ItemT The work-item identifier type.
-  /// \param item The work-item identifier.
-  /// \param input The input data to be shuffled.
-  /// \param output The array that will store the shuffle result.
-  /// \param group_prefix The prefix of the group before the shuffle.
-  template <int ElementsPerWorkItem, typename ItemT>
-  __syclcompat_inline__ void
-  shuffle_left(const ItemT &item, T (&input)[ElementsPerWorkItem],
-               T (&output)[ElementsPerWorkItem], T &group_prefix) {
-    shuffle_left(item, input, output);
-    group_prefix = _local_memory[0];
-  }
-};
-} // namespace group
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/id_query.hpp b/sycl/include/syclcompat/id_query.hpp
deleted file mode 100644
index 2a61ac7c2127f..0000000000000
--- a/sycl/include/syclcompat/id_query.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  id_query.hpp
- *
- *  Description:
- *    id_query functionality for the SYCL compatibility extension
- **************************************************************************/
-
-#pragma once
-
-#include <sycl/ext/oneapi/free_function_queries.hpp>
-#include <sycl/nd_item.hpp>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-
-using sycl::ext::oneapi::this_work_item::get_nd_item;
-
-inline void wg_barrier() { get_nd_item<3>().barrier(); }
-
-namespace local_id {
-inline size_t x() { return get_nd_item<3>().get_local_id(2); }
-inline size_t y() { return get_nd_item<3>().get_local_id(1); }
-inline size_t z() { return get_nd_item<3>().get_local_id(0); }
-} // namespace local_id
-
-namespace local_range {
-inline size_t x() { return get_nd_item<3>().get_local_range(2); }
-inline size_t y() { return get_nd_item<3>().get_local_range(1); }
-inline size_t z() { return get_nd_item<3>().get_local_range(0); }
-} // namespace local_range
-
-namespace work_group_id {
-inline size_t x() { return get_nd_item<3>().get_group(2); }
-inline size_t y() { return get_nd_item<3>().get_group(1); }
-inline size_t z() { return get_nd_item<3>().get_group(0); }
-} // namespace work_group_id
-
-namespace work_group_range {
-inline size_t x() { return get_nd_item<3>().get_group_range(2); }
-inline size_t y() { return get_nd_item<3>().get_group_range(1); }
-inline size_t z() { return get_nd_item<3>().get_group_range(0); }
-} // namespace work_group_range
-
-namespace global_range {
-inline size_t x() { return get_nd_item<3>().get_global_range(2); }
-inline size_t y() { return get_nd_item<3>().get_global_range(1); }
-inline size_t z() { return get_nd_item<3>().get_global_range(0); }
-} // namespace global_range
-
-namespace global_id {
-inline size_t x() { return get_nd_item<3>().get_global_id(2); }
-inline size_t y() { return get_nd_item<3>().get_global_id(1); }
-inline size_t z() { return get_nd_item<3>().get_global_id(0); }
-} // namespace global_id
-
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/kernel.hpp b/sycl/include/syclcompat/kernel.hpp
deleted file mode 100644
index 286761fe343ce..0000000000000
--- a/sycl/include/syclcompat/kernel.hpp
+++ /dev/null
@@ -1,471 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  kernel.hpp
- *
- *  Description:
- *    kernel functionality for the SYCL compatibility extension.
- **************************************************************************/
-
-// The original source was under the license below:
-//==---- kernel.hpp -------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#ifdef _WIN32
-#include <unordered_set>
-#include <windows.h>
-#else
-#include <dlfcn.h>
-#endif
-
-#if defined(__has_include) && __has_include(<filesystem>)
-#include <filesystem>
-#elif defined(__has_include) && __has_include(<experimental/filesystem>)
-#include <experimental/filesystem>
-#else
-#error "SYCLomatic runtime requires C++ filesystem support"
-#endif
-
-#include <fstream>
-#include <random>
-
-#include <sycl/image.hpp>
-#include <sycl/info/info_desc.hpp>
-#include <sycl/nd_range.hpp>
-#include <sycl/queue.hpp>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-
-typedef void (*kernel_functor)(sycl::queue &, const sycl::nd_range<3> &,
-                               unsigned int, void **, void **);
-
-struct kernel_function_info {
-  int max_work_group_size = 0;
-};
-
-static inline void get_kernel_function_info(kernel_function_info *kernel_info,
-                                            const void *function) {
-  kernel_info->max_work_group_size =
-      detail::dev_mgr::instance()
-          .current_device()
-          .get_info<sycl::info::device::max_work_group_size>();
-}
-
-static inline kernel_function_info
-get_kernel_function_info(const void *function) {
-  kernel_function_info kernel_info;
-  kernel_info.max_work_group_size =
-      detail::dev_mgr::instance()
-          .current_device()
-          .get_info<sycl::info::device::max_work_group_size>();
-  return kernel_info;
-}
-
-namespace detail {
-
-#if defined(__has_include) && __has_include(<filesystem>)
-namespace fs = std::filesystem;
-#else
-namespace fs = std::experimental::filesystem;
-#endif
-
-/// Write data to temporary file and return absolute path to temporary file.
-/// Temporary file is created in a temporary directory both of which have random
-/// names with only the user having access permissions.  Only one temporary file
-/// will be created in the temporary directory.
-static inline fs::path write_data_to_file(char const *const data, size_t size) {
-  std::error_code ec;
-
-  if (sizeof(size_t) >= sizeof(std::streamsize) &&
-      size > (std::numeric_limits<std::streamsize>::max)())
-    throw std::runtime_error("[SYCLcompat] data file too large");
-
-  // random number generator
-  std::random_device dev;
-  std::mt19937 prng(dev());
-  std::uniform_int_distribution<uint64_t> rand(0);
-
-  // find temporary directory
-  auto tmp_dir = fs::temp_directory_path(ec);
-  if (ec)
-    throw std::runtime_error("[SYCLcompat] could not find temporary directory");
-
-  // create private directory
-  std::stringstream directory;
-  directory.imbue(std::locale::classic()); // avoid locale issues, like commas
-  fs::path directory_path;
-  constexpr int max_attempts = 5;
-  int i;
-
-  for (i = 0; i < max_attempts; i++) {
-    directory << std::hex << rand(prng);
-    directory_path = tmp_dir / directory.str();
-    if (fs::create_directory(directory_path)) {
-      break;
-    }
-  }
-  if (i == max_attempts)
-    throw std::runtime_error("[SYCLcompat] could not create directory");
-
-  // only allow owner permissions to private directory
-  fs::permissions(directory_path, fs::perms::owner_all, ec);
-  if (ec)
-    throw std::runtime_error(
-        "[SYCLcompat] could not set directory permissions");
-
-  // random filename in private directory
-  std::stringstream filename;
-  filename.imbue(std::locale::classic());
-  filename << std::hex << rand(prng);
-#ifdef _WIN32
-  auto filepath = directory_path / (filename.str() + ".dll");
-#else
-  auto filepath = directory_path / filename.str();
-#endif
-
-  // write data to temporary file
-  auto outfile = std::ofstream(filepath, std::ios::out | std::ios::binary);
-  if (outfile) {
-    // only allow program to write file
-    fs::permissions(filepath, fs::perms::owner_write, ec);
-    if (ec)
-      throw std::runtime_error("[SYCLcompat] could not set permissions");
-
-    outfile.write(data, size);
-    if (!outfile.good())
-      throw std::runtime_error("[SYCLcompat] could not write data");
-    outfile.close();
-
-    // only allow program to read/execute file
-    fs::permissions(filepath, fs::perms::owner_read | fs::perms::owner_exec,
-                    ec);
-    if (ec)
-      throw std::runtime_error("[SYCLcompat] could not set permissions");
-  } else
-    throw std::runtime_error("[SYCLcompat] could not write data");
-
-  // check temporary file contents
-  auto infile = std::ifstream(filepath, std::ios::in | std::ios::binary);
-  if (infile) {
-    bool mismatch = false;
-    size_t cnt = 0;
-
-    while (1) {
-      char c;
-      infile.get(c);
-      if (infile.eof())
-        break;
-      if (c != data[cnt++])
-        mismatch = true;
-    }
-    if (cnt != size || mismatch)
-      throw std::runtime_error(
-          "[SYCLcompat] file contents not written correctly");
-  } else
-    throw std::runtime_error("[SYCLcompat] could not validate file");
-
-  if (!filepath.is_absolute())
-    throw std::runtime_error("[SYCLcompat] temporary filepath is not absolute");
-
-  return filepath;
-}
-
-static inline uint16_t extract16(unsigned char const *const ptr) {
-  uint16_t ret = 0;
-
-  ret |= static_cast<uint16_t>(ptr[0]) << 0;
-  ret |= static_cast<uint16_t>(ptr[1]) << 8;
-
-  return (ret);
-}
-
-static inline uint32_t extract32(unsigned char const *const ptr) {
-  uint32_t ret = 0;
-
-  ret |= static_cast<uint32_t>(ptr[0]) << 0;
-  ret |= static_cast<uint32_t>(ptr[1]) << 8;
-  ret |= static_cast<uint32_t>(ptr[2]) << 16;
-  ret |= static_cast<uint32_t>(ptr[3]) << 24;
-
-  return (ret);
-}
-
-static inline uint64_t extract64(unsigned char const *const ptr) {
-  uint64_t ret = 0;
-
-  ret |= static_cast<uint64_t>(ptr[0]) << 0;
-  ret |= static_cast<uint64_t>(ptr[1]) << 8;
-  ret |= static_cast<uint64_t>(ptr[2]) << 16;
-  ret |= static_cast<uint64_t>(ptr[3]) << 24;
-  ret |= static_cast<uint64_t>(ptr[4]) << 32;
-  ret |= static_cast<uint64_t>(ptr[5]) << 40;
-  ret |= static_cast<uint64_t>(ptr[6]) << 48;
-  ret |= static_cast<uint64_t>(ptr[7]) << 56;
-
-  return (ret);
-}
-
-static inline uint64_t get_lib_size(char const *const blob) {
-#ifdef _WIN32
-  ///////////////////////////////////////////////////////////////////////
-  // Analyze DOS stub
-  unsigned char const *const ublob =
-      reinterpret_cast<unsigned char const *const>(blob);
-  if (ublob[0] != 0x4d || ublob[1] != 0x5a) {
-    throw std::runtime_error("[SYCLcompat] blob is not a Windows DLL.");
-  }
-  uint32_t pe_header_offset = extract32(ublob + 0x3c);
-
-  ///////////////////////////////////////////////////////////////////////
-  // Ananlyze PE-header
-  unsigned char const *const pe_header = ublob + pe_header_offset;
-
-  // signature
-  uint32_t pe_signature = extract32(pe_header + 0);
-  if (pe_signature != 0x00004550) {
-    throw std::runtime_error(
-        "[SYCLcompat] PE-header signature is not 0x00004550");
-  }
-
-  // machine
-  uint16_t machine = extract16(pe_header + 4);
-  if (machine != 0x8664) {
-    throw std::runtime_error("[SYCLcompat] only DLLs for x64 supported");
-  }
-
-  // number of sections
-  uint16_t number_of_sections = extract16(pe_header + 6);
-
-  // sizeof optional header
-  uint16_t sizeof_optional_header = extract16(pe_header + 20);
-
-  // magic
-  uint16_t magic = extract16(pe_header + 24);
-  if (magic != 0x10b && magic != 0x20b) {
-    throw std::runtime_error("[SYCLcompat] MAGIC is not 0x010b or 0x020b");
-  }
-
-  ///////////////////////////////////////////////////////////////////////
-  // Analyze tail of optional header
-  constexpr int coff_header_size = 24;
-
-  unsigned char const *const tail_of_optional_header =
-      pe_header + coff_header_size + sizeof_optional_header;
-  if (extract64(tail_of_optional_header - 8) != 0) {
-    throw std::runtime_error("Optional header not zero-padded");
-  }
-
-  ///////////////////////////////////////////////////////////////////////
-  // Analyze last section header
-  constexpr int section_header_size = 40;
-  unsigned char const *const last_section_header =
-      tail_of_optional_header + section_header_size * (number_of_sections - 1);
-
-  uint32_t sizeof_raw_data = extract32(last_section_header + 16);
-  uint32_t pointer_to_raw_data = extract32(last_section_header + 20);
-
-  return sizeof_raw_data + pointer_to_raw_data;
-#else
-  if (blob[0] != 0x7F || blob[1] != 'E' || blob[2] != 'L' || blob[3] != 'F')
-    throw std::runtime_error("[SYCLcompat] blob is not in ELF format");
-
-  if (blob[4] != 0x02)
-    throw std::runtime_error("[SYCLcompat] only 64-bit headers are supported");
-
-  if (blob[5] != 0x01)
-    throw std::runtime_error(
-        "[SYCLcompat] only little-endian headers are supported");
-
-  unsigned char const *const ublob =
-      reinterpret_cast<unsigned char const *const>(blob);
-  uint64_t e_shoff = extract64(ublob + 0x28);
-  uint16_t e_shentsize = extract16(ublob + 0x3A);
-  uint16_t e_shnum = extract16(ublob + 0x3C);
-
-  return e_shoff + (e_shentsize * e_shnum);
-#endif
-}
-
-#ifdef _WIN32
-class path_lib_record {
-public:
-  void operator=(const path_lib_record &) = delete;
-  ~path_lib_record() {
-    for (auto entry : lib_to_path) {
-      FreeLibrary(static_cast<HMODULE>(entry.first));
-      fs::permissions(entry.second, fs::perms::owner_all);
-      fs::remove_all(entry.second.remove_filename());
-    }
-  }
-  static void record_lib_path(fs::path path, void *library) {
-    lib_to_path[library] = path;
-  }
-  static void remove_lib(void *library) {
-    auto path = lib_to_path[library];
-    std::error_code ec;
-
-    FreeLibrary(static_cast<HMODULE>(library));
-    fs::permissions(path, fs::perms::owner_all);
-    if (fs::remove_all(path.remove_filename(), ec) != 2 || ec)
-      // one directory and one temporary file should have been deleted
-      throw std::runtime_error("[SYCLcompat] directory delete failed");
-
-    lib_to_path.erase(library);
-  }
-
-private:
-  static inline std::unordered_map<void *, fs::path> lib_to_path;
-};
-#endif
-
-} // namespace detail
-
-class kernel_library {
-public:
-  constexpr kernel_library() : ptr{nullptr} {}
-  constexpr kernel_library(void *ptr) : ptr{ptr} {}
-
-  operator void *() const { return ptr; }
-
-private:
-  void *ptr;
-#ifdef _WIN32
-  static inline detail::path_lib_record single_instance_to_trigger_destructor;
-#endif
-};
-
-namespace detail {
-
-static inline kernel_library load_dl_from_data(char const *const data,
-                                               size_t size) {
-  fs::path filename = write_data_to_file(data, size);
-#ifdef _WIN32
-  void *so = LoadLibraryW(filename.wstring().c_str());
-#else
-  void *so = dlopen(filename.c_str(), RTLD_LAZY);
-#endif
-  if (so == nullptr)
-    throw std::runtime_error("[SYCLcompat] failed to load kernel library");
-
-#ifdef _WIN32
-  detail::path_lib_record::record_lib_path(filename, so);
-#else
-  std::error_code ec;
-
-  // Windows DLL cannot be deleted while in use
-  if (fs::remove_all(filename.remove_filename(), ec) != 2 || ec)
-    // one directory and one temporary file should have been deleted
-    throw std::runtime_error("[SYCLcompat] directory delete failed");
-#endif
-
-  return so;
-}
-
-} // namespace detail
-
-/// Load kernel library and return a handle to use the library.
-/// \param [in] name The name of the library.
-static inline kernel_library load_kernel_library(const std::string &name) {
-  std::ifstream ifs;
-  ifs.open(name, std::ios::in | std::ios::binary);
-
-  std::stringstream buffer;
-  buffer << ifs.rdbuf();
-
-  const std::string buffer_string = buffer.str();
-  return detail::load_dl_from_data(buffer_string.c_str(), buffer_string.size());
-}
-
-/// Load kernel library whose image is alreay in memory and return a handle to
-/// use the library.
-/// \param [in] image A pointer to the image in memory.
-static inline kernel_library load_kernel_library_mem(char const *const image) {
-  const size_t size = detail::get_lib_size(image);
-
-  return detail::load_dl_from_data(image, size);
-}
-
-/// Unload kernel library.
-/// \param [in,out] library Handle to the library to be closed.
-static inline void unload_kernel_library(const kernel_library &library) {
-#ifdef _WIN32
-  detail::path_lib_record::remove_lib(library);
-#else
-  dlclose(library);
-#endif
-}
-
-class kernel_function {
-public:
-  constexpr kernel_function() : ptr{nullptr} {}
-  constexpr kernel_function(kernel_functor ptr) : ptr{ptr} {}
-
-  operator void *() const { return ((void *)ptr); }
-
-  void operator()(sycl::queue &q, const sycl::nd_range<3> &range,
-                  unsigned int local_mem_size, void **args, void **extra) {
-    ptr(q, range, local_mem_size, args, extra);
-  }
-
-private:
-  kernel_functor ptr;
-};
-
-/// Find kernel function in a kernel library and return its address.
-/// \param [in] library Handle to the kernel library.
-/// \param [in] name Name of the kernel function.
-static inline kernel_function get_kernel_function(kernel_library &library,
-                                                  const std::string &name) {
-#ifdef _WIN32
-  kernel_functor fn = reinterpret_cast<kernel_functor>(
-      GetProcAddress(static_cast<HMODULE>(static_cast<void *>(library)),
-                     (name + std::string("_wrapper")).c_str()));
-#else
-  kernel_functor fn = reinterpret_cast<kernel_functor>(
-      dlsym(library, (name + std::string("_wrapper")).c_str()));
-#endif
-  if (fn == nullptr)
-    throw std::runtime_error("[SYCLcompat] failed to get function");
-  return fn;
-}
-
-/// Invoke a kernel function.
-/// \param [in] function kernel function.
-/// \param [in] queue SYCL queue used to execute kernel
-/// \param [in] group_range SYCL group range
-/// \param [in] local_range SYCL local range
-/// \param [in] local_mem_size The size of local memory required by the kernel
-///             function.
-/// \param [in] kernel_params Array of pointers to kernel arguments.
-/// \param [in] extra Extra arguments.
-static inline void invoke_kernel_function(kernel_function &function,
-                                          sycl::queue &queue,
-                                          sycl::range<3> group_range,
-                                          sycl::range<3> local_range,
-                                          unsigned int local_mem_size,
-                                          void **kernel_params, void **extra) {
-  function(queue, sycl::nd_range<3>(group_range * local_range, local_range),
-           local_mem_size, kernel_params, extra);
-}
-
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/launch.hpp b/sycl/include/syclcompat/launch.hpp
deleted file mode 100644
index 83234182c8fee..0000000000000
--- a/sycl/include/syclcompat/launch.hpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  launch.hpp
- *
- *  Description:
- *    launch functionality for the SYCL compatibility extension
- **************************************************************************/
-
-#pragma once
-
-#include <sycl/accessor.hpp>
-#include <sycl/event.hpp>
-#include <sycl/nd_range.hpp>
-#include <sycl/queue.hpp>
-#include <sycl/range.hpp>
-#include <sycl/reduction.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/dims.hpp>
-#include <syclcompat/launch_policy.hpp>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-
-namespace detail {
-
-template <typename R, typename... Types>
-constexpr size_t getArgumentCount(R (*f)(Types...)) {
-  return sizeof...(Types);
-}
-
-template <int Dim>
-sycl::nd_range<3> transform_nd_range(const sycl::nd_range<Dim> &range) {
-  sycl::range<Dim> global_range = range.get_global_range();
-  sycl::range<Dim> local_range = range.get_local_range();
-  if constexpr (Dim == 3) {
-    return range;
-  } else if constexpr (Dim == 2) {
-    return sycl::nd_range<3>{{1, global_range[0], global_range[1]},
-                             {1, local_range[0], local_range[1]}};
-  }
-  return sycl::nd_range<3>{{1, 1, global_range[0]}, {1, 1, local_range[0]}};
-}
-
-template <auto F, typename... Args>
-std::enable_if_t<std::is_invocable_v<decltype(F), Args...>, sycl::event>
-launch(const sycl::nd_range<3> &range, sycl::queue q, Args... args) {
-  static_assert(detail::getArgumentCount(F) == sizeof...(args),
-                "Wrong number of arguments to SYCL kernel");
-  static_assert(
-      std::is_same<std::invoke_result_t<decltype(F), Args...>, void>::value,
-      "SYCL kernels should return void");
-
-  return q.parallel_for(
-      range, [=](sycl::nd_item<3>) { [[clang::always_inline]] F(args...); });
-}
-
-} // namespace detail
-
-template <int Dim>
-inline sycl::nd_range<Dim> compute_nd_range(sycl::range<Dim> global_size_in,
-                                            sycl::range<Dim> work_group_size) {
-
-  if (global_size_in.size() == 0 || work_group_size.size() == 0) {
-    throw std::invalid_argument("Global or local size is zero!");
-  }
-  for (size_t i = 0; i < Dim; ++i) {
-    if (global_size_in[i] < work_group_size[i])
-      throw std::invalid_argument("Work group size larger than global size");
-  }
-
-  auto global_size =
-      ((global_size_in + work_group_size - 1) / work_group_size) *
-      work_group_size;
-  return {global_size, work_group_size};
-}
-
-inline sycl::nd_range<1> compute_nd_range(int global_size_in,
-                                          int work_group_size) {
-  return compute_nd_range<1>(global_size_in, work_group_size);
-}
-
-template <auto F, int Dim, typename... Args>
-std::enable_if_t<std::is_invocable_v<decltype(F), Args...>, sycl::event>
-launch(const sycl::nd_range<Dim> &range, sycl::queue q, Args... args) {
-  return detail::launch<F>(detail::transform_nd_range<Dim>(range), q, args...);
-}
-
-template <auto F, int Dim, typename... Args>
-std::enable_if_t<std::is_invocable_v<decltype(F), Args...>, sycl::event>
-launch(const sycl::nd_range<Dim> &range, Args... args) {
-  return launch<F>(range, get_default_queue(), args...);
-}
-
-// Alternative launch through dim3 objects
-template <auto F, typename... Args>
-std::enable_if_t<std::is_invocable_v<decltype(F), Args...>, sycl::event>
-launch(const dim3 &grid, const dim3 &threads, sycl::queue q, Args... args) {
-  return launch<F>(sycl::nd_range<3>{grid * threads, threads}, q, args...);
-}
-
-template <auto F, typename... Args>
-std::enable_if_t<std::is_invocable_v<decltype(F), Args...>, sycl::event>
-launch(const dim3 &grid, const dim3 &threads, Args... args) {
-  return launch<F>(grid, threads, get_default_queue(), args...);
-}
-
-} // namespace syclcompat
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-namespace experimental {
-namespace detail {
-
-template <auto F, typename LaunchPolicy, typename... Args>
-sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) {
-  static_assert(syclcompat::args_compatible<LaunchPolicy, F, Args...>,
-                "Mismatch between device function signature and supplied "
-                "arguments. Have you correctly handled local memory/char*?");
-
-  sycl_exp::launch_config config(launch_policy.get_range(),
-                                 launch_policy.get_launch_properties());
-
-  return sycl_exp::submit_with_event(q, [&](sycl::handler &cgh) {
-    auto KernelFunctor = build_kernel_functor<F>(cgh, launch_policy, args...);
-    if constexpr (syclcompat::detail::is_range_v<
-                      typename LaunchPolicy::RangeT>) {
-      parallel_for(cgh, config, KernelFunctor);
-    } else {
-      static_assert(
-          syclcompat::detail::is_nd_range_v<typename LaunchPolicy::RangeT>);
-      nd_launch(cgh, config, KernelFunctor);
-    }
-  });
-}
-}
-
-
-template <auto F, typename LaunchPolicy, typename... Args>
-sycl::event launch(LaunchPolicy launch_policy, sycl::queue q, Args... args) {
-  static_assert(detail::is_launch_policy_v<LaunchPolicy>);
-  return detail::launch<F>(launch_policy, q, args...);
-}
-
-template <auto F, typename LaunchPolicy, typename... Args>
-sycl::event launch(LaunchPolicy launch_policy, Args... args) {
-  static_assert(detail::is_launch_policy_v<LaunchPolicy>);
-  return launch<F>(launch_policy, get_default_queue(), args...);
-}
-
-} // namespace experimental
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/launch_policy.hpp b/sycl/include/syclcompat/launch_policy.hpp
deleted file mode 100644
index 13980d03c93c9..0000000000000
--- a/sycl/include/syclcompat/launch_policy.hpp
+++ /dev/null
@@ -1,272 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  launch.hpp
- *
- *  Description:
- *    launch functionality for the SYCL compatibility extension
- **************************************************************************/
-
-#pragma once
-
-#include "sycl/ext/oneapi/experimental/enqueue_functions.hpp"
-#include "sycl/ext/oneapi/properties/properties.hpp"
-#include <sycl/event.hpp>
-#include <sycl/nd_range.hpp>
-#include <sycl/queue.hpp>
-#include <sycl/range.hpp>
-
-#include <syclcompat/defs.hpp>
-#include <syclcompat/device.hpp>
-#include <syclcompat/dims.hpp>
-#include <syclcompat/traits.hpp>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-namespace experimental {
-
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-
-// Wrapper for kernel sycl_exp::properties
-template <typename Properties> struct kernel_properties {
-  static_assert(sycl_exp::is_property_list_v<Properties>);
-  using Props = Properties;
-
-  template <typename... Props>
-  kernel_properties(Props... properties) : props{properties...} {}
-
-  template <typename... Props>
-  kernel_properties(sycl_exp::properties<Props...> properties)
-      : props{properties} {}
-
-  Properties props;
-};
-
-template <typename... Props, typename = std::enable_if_t<detail::are_all_props<Props...>::value, void>>
-kernel_properties(Props... props)
-    -> kernel_properties<decltype(sycl_exp::properties(props...))>;
-
-template <typename... Props>
-kernel_properties(sycl_exp::properties<Props...> props)
-    -> kernel_properties<sycl_exp::properties<Props...>>;
-
-// Wrapper for launch sycl_exp::properties
-template <typename Properties> struct launch_properties {
-  static_assert(sycl_exp::is_property_list_v<Properties>);
-  using Props = Properties;
-
-  template <typename... Props>
-  launch_properties(Props... properties) : props{properties...} {}
-
-  template <typename... Props>
-  launch_properties(sycl_exp::properties<Props...> properties)
-      : props{properties} {}
-
-  Properties props;
-};
-
-template <typename... Props, typename = std::enable_if_t<detail::are_all_props<Props...>::value, void>>
-launch_properties(Props... props)
-    -> launch_properties<decltype(sycl_exp::properties(props...))>;
-
-template <typename... Props>
-launch_properties(sycl_exp::properties<Props...> props)
-    -> launch_properties<sycl_exp::properties<Props...>>;
-
-// Wrapper for local memory size
-struct local_mem_size {
-  local_mem_size(size_t size = 0) : size{size} {};
-  size_t size;
-};
-
-// launch_policy is constructed by the user & passed to `compat_exp::launch`
-template <typename Range, typename KProps, typename LProps, bool LocalMem>
-class launch_policy {
-  static_assert(sycl_exp::is_property_list_v<KProps>);
-  static_assert(sycl_exp::is_property_list_v<LProps>);
-  static_assert(syclcompat::detail::is_range_or_nd_range_v<Range>);
-  static_assert(syclcompat::detail::is_nd_range_v<Range> || !LocalMem,
-                "sycl::range kernel launches are incompatible with local "
-                "memory usage!");
-
-public:
-  using KPropsT = KProps;
-  using LPropsT = LProps;
-  using RangeT = Range;
-  static constexpr bool HasLocalMem = LocalMem;
-
-private:
-  launch_policy() = default;
-
-  template <typename... Ts>
-  launch_policy(Ts... ts)
-      : _kernel_properties{detail::property_getter<
-            kernel_properties, kernel_properties<KPropsT>, std::tuple<Ts...>>()(
-            std::tuple<Ts...>(ts...))},
-        _launch_properties{detail::property_getter<
-            launch_properties, launch_properties<LPropsT>, std::tuple<Ts...>>()(
-            std::tuple<Ts...>(ts...))},
-        _local_mem_size{
-            detail::local_mem_getter<local_mem_size, std::tuple<Ts...>>()(
-                std::tuple<Ts...>(ts...))} {
-    check_variadic_args(ts...);
-  }
-
-  template <typename... Ts> void check_variadic_args(Ts...) {
-    static_assert(
-        std::conjunction_v<std::disjunction<detail::is_kernel_properties<Ts>,
-                                            detail::is_launch_properties<Ts>,
-                                            detail::is_local_mem_size<Ts>>...>,
-        "Received an unexpected argument to ctor. Did you forget to wrap "
-        "in "
-        "compat::kernel_properties, launch_properties, local_mem_size?");
-  }
-
-public:
-  template <typename... Ts>
-  launch_policy(Range range, Ts... ts) : launch_policy(ts...) {
-    _range = range;
-    check_variadic_args(ts...);
-  }
-
-  template <typename... Ts>
-  launch_policy(dim3 global_range, Ts... ts) : launch_policy(ts...) {
-    _range = Range{global_range};
-    check_variadic_args(ts...);
-  }
-
-  template <typename... Ts>
-  launch_policy(dim3 global_range, dim3 local_range, Ts... ts)
-      : launch_policy(ts...) {
-    _range = Range{global_range * local_range, local_range};
-    check_variadic_args(ts...);
-  }
-
-  KProps get_kernel_properties() { return _kernel_properties.props; }
-  LProps get_launch_properties() { return _launch_properties.props; }
-  size_t get_local_mem_size() { return _local_mem_size.size; }
-  Range get_range() { return _range; }
-
-private:
-  Range _range;
-  kernel_properties<KProps> _kernel_properties;
-  launch_properties<LProps> _launch_properties;
-  local_mem_size _local_mem_size;
-};
-
-// Deduction guides for launch_policy
-template <typename Range, typename... Ts>
-launch_policy(Range, Ts...) -> launch_policy<
-    Range, detail::properties_or_empty<kernel_properties, Ts...>,
-    detail::properties_or_empty<launch_properties, Ts...>,
-    detail::has_type<local_mem_size, std::tuple<Ts...>>::value>;
-
-template <int Dim, typename... Ts>
-launch_policy(sycl::range<Dim>, sycl::range<Dim>, Ts...) -> launch_policy<
-    sycl::nd_range<Dim>, detail::properties_or_empty<kernel_properties, Ts...>,
-    detail::properties_or_empty<launch_properties, Ts...>,
-    detail::has_type<local_mem_size, std::tuple<Ts...>>::value>;
-
-template <typename... Ts>
-launch_policy(dim3, Ts...) -> launch_policy<
-    sycl::range<3>, detail::properties_or_empty<kernel_properties, Ts...>,
-    detail::properties_or_empty<launch_properties, Ts...>,
-    detail::has_type<local_mem_size, std::tuple<Ts...>>::value>;
-
-template <typename... Ts>
-launch_policy(dim3, dim3, Ts...) -> launch_policy<
-    sycl::nd_range<3>, detail::properties_or_empty<kernel_properties, Ts...>,
-    detail::properties_or_empty<launch_properties, Ts...>,
-    detail::has_type<local_mem_size, std::tuple<Ts...>>::value>;
-
-namespace detail {
-// Custom std::apply helpers to enable inlining
-template <class F, class Tuple, size_t... Is>
-__syclcompat_inline__ constexpr void apply_expand(F &&f, Tuple &&t,
-                                                  std::index_sequence<Is...>) {
-  [[clang::always_inline]] std::forward<F>(f)(
-      get<Is>(std::forward<Tuple>(t))...);
-}
-
-template <class F, class Tuple>
-__syclcompat_inline__ constexpr void apply_helper(F &&f, Tuple &&t) {
-  apply_expand(
-      std::forward<F>(f), std::forward<Tuple>(t),
-      std::make_index_sequence<std::tuple_size_v<std::decay_t<Tuple>>>{});
-}
-
-template <auto F, typename Range, typename KProps, bool HasLocalMem,
-          typename... Args>
-struct KernelFunctor {
-  KernelFunctor(KProps kernel_props, Args... args)
-      : _kernel_properties{kernel_props},
-        _argument_tuple(std::make_tuple(args...)) {}
-
-  KernelFunctor(KProps kernel_props, sycl::local_accessor<char, 1> local_acc,
-                Args... args)
-      : _kernel_properties{kernel_props}, _local_acc{local_acc},
-        _argument_tuple(std::make_tuple(args...)) {}
-
-  auto get(sycl_exp::properties_tag) const { return _kernel_properties; }
-
-  __syclcompat_inline__ void
-  operator()(syclcompat::detail::range_to_item_t<Range>) const {
-    if constexpr (HasLocalMem) {
-      char *local_mem_ptr = static_cast<char *>(
-          _local_acc.template get_multi_ptr<sycl::access::decorated::no>()
-              .get());
-      apply_helper(
-          [lmem_ptr = local_mem_ptr](auto &&...args) {
-            [[clang::always_inline]] F(args..., lmem_ptr);
-          },
-          _argument_tuple);
-    } else {
-      apply_helper([](auto &&...args) { [[clang::always_inline]] F(args...); },
-                   _argument_tuple);
-    }
-  }
-
-  KProps _kernel_properties;
-  std::tuple<Args...> _argument_tuple;
-  std::conditional_t<HasLocalMem, sycl::local_accessor<char, 1>, std::monostate>
-      _local_acc; // monostate for empty type
-};
-
-//====================================================================
-// This helper function avoids 2 nested `if constexpr` in detail::launch
-template <auto F, typename LaunchPolicy, typename... Args>
-auto build_kernel_functor(sycl::handler &cgh, LaunchPolicy launch_policy,
-                          Args... args)
-    -> KernelFunctor<F, typename LaunchPolicy::RangeT,
-                     typename LaunchPolicy::KPropsT, LaunchPolicy::HasLocalMem,
-                     Args...> {
-  if constexpr (LaunchPolicy::HasLocalMem) {
-    sycl::local_accessor<char, 1> local_memory(
-        launch_policy.get_local_mem_size(), cgh);
-    return KernelFunctor<F, typename LaunchPolicy::RangeT,
-                         typename LaunchPolicy::KPropsT,
-                         LaunchPolicy::HasLocalMem, Args...>(
-        launch_policy.get_kernel_properties(), local_memory, args...);
-  } else {
-    return KernelFunctor<F, typename LaunchPolicy::RangeT,
-                         typename LaunchPolicy::KPropsT,
-                         LaunchPolicy::HasLocalMem, Args...>(
-        launch_policy.get_kernel_properties(), args...);
-  }
-}
-
-} // namespace detail
-} // namespace experimental
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
deleted file mode 100644
index f70a2b0dcb085..0000000000000
--- a/sycl/include/syclcompat/math.hpp
+++ /dev/null
@@ -1,2385 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  math.hpp
- *
- *  Description:
- *    math utilities for the SYCL compatibility extension.
- **************************************************************************/
-
-// The original source was under the license below:
-//==---- math.hpp ---------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <limits>
-#include <sycl/feature_test.hpp>
-#include <type_traits>
-
-// TODO(syclcompat-lib-reviewers): this should not be required
-#ifndef SYCL_EXT_ONEAPI_COMPLEX
-#define SYCL_EXT_ONEAPI_COMPLEX
-#endif
-
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-#include <sycl/ext/oneapi/experimental/bfloat16_math.hpp>
-#endif
-#include <sycl/ext/oneapi/experimental/complex/complex.hpp>
-#include <syclcompat/traits.hpp>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-namespace detail {
-
-namespace complex_namespace = sycl::ext::oneapi::experimental;
-
-template <typename ValueT>
-using complex_type = detail::complex_namespace::complex<ValueT>;
-
-template <typename T>
-constexpr bool is_int32_type = std::is_same_v<std::decay_t<T>, int32_t> ||
-  std::is_same_v<std::decay_t<T>, uint32_t>;
-
-// Helper constexpr bool to avoid ugly macros where possible
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-constexpr bool support_bfloat16_math = true;
-#else
-constexpr bool support_bfloat16_math = false;
-#endif
-
-template <typename ValueT>
-inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) {
-  return sycl::clamp(val, min_val, max_val);
-}
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-// TODO(syclcompat-lib-reviewers): Follow the process to add this (& other math
-// fns) to the bfloat16 math function extension. If added, remove this
-// functionality from the header.
-template <>
-inline sycl::ext::oneapi::bfloat16 clamp(sycl::ext::oneapi::bfloat16 val,
-                                         sycl::ext::oneapi::bfloat16 min_val,
-                                         sycl::ext::oneapi::bfloat16 max_val) {
-  if (val < min_val)
-    return min_val;
-  if (val > max_val)
-    return max_val;
-  return val;
-}
-
-template <typename T, int Size>
-inline std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                        sycl::vec<T, Size>>
-clamp(sycl::vec<T, Size> val, sycl::vec<T, Size> min_val,
-      sycl::vec<T, Size> max_val) {
-  return [&val, &min_val, &max_val]<int... I>(std::integer_sequence<int, I...>) {
-    return sycl::vec<T, Size>{
-        clamp<sycl::ext::oneapi::bfloat16>(val[I], min_val[I], max_val[I])...};
-  }(std::make_integer_sequence<int, Size>{});
-}
-
-template <typename T, std::size_t Size>
-inline std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                        sycl::marray<T, Size>>
-clamp(sycl::marray<T, Size> val, sycl::marray<T, Size> min_val,
-      sycl::marray<T, Size> max_val) {
-  return [&val, &min_val, &max_val]<std::size_t... I>(std::index_sequence<I...>) {
-    return sycl::marray<T, Size>{
-        clamp<sycl::ext::oneapi::bfloat16>(val[I], min_val[I], max_val[I])...};
-  }(std::make_index_sequence<Size>{});
-}
-#endif
-
-template <typename VecT, class BinaryOperation, class = void>
-class vectorized_binary {
-public:
-  inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) {
-    VecT v4;
-    for (size_t i = 0; i < v4.size(); ++i) {
-      v4[i] = binary_op(a[i], b[i]);
-    }
-    return v4;
-  }
-};
-
-template <typename VecT, class BinaryOperation>
-class vectorized_binary<
-    VecT, BinaryOperation,
-    std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>> {
-public:
-  inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) {
-    return binary_op(a, b).template as<VecT>();
-  }
-};
-
-/// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed
-/// extend for signed int. Returns a signed integer type.
-template <typename ValueT>
-inline auto zero_or_signed_extend(ValueT val, unsigned bit) {
-  static_assert(std::is_integral_v<ValueT>);
-  if constexpr (sizeof(ValueT) == 4) {
-    assert(bit < 64 &&
-           "When extending int32 value, bit must be smaller than 64.");
-    if constexpr (std::is_signed_v<ValueT>)
-      return int64_t(val) << (64 - bit) >> (64 - bit);
-    else
-      return int64_t(val);
-  } else if constexpr (sizeof(ValueT) == 2) {
-    assert(bit < 32 &&
-           "When extending int16 value, bit must be smaller than 32.");
-    if constexpr (std::is_signed_v<ValueT>)
-      return int32_t(val) << (32 - bit) >> (32 - bit);
-    else
-      return int32_t(val);
-  } else if constexpr (sizeof(ValueT) == 1) {
-    assert(bit < 16 &&
-           "When extending int8 value, bit must be smaller than 16.");
-    if constexpr (std::is_signed_v<ValueT>)
-      return int16_t(val) << (16 - bit) >> (16 - bit);
-    else
-      return int16_t(val);
-  } else {
-    static_assert(sizeof(ValueT) == 8);
-    assert(bit < 64 && "Cannot extend int64 value.");
-    return static_cast<int64_t>(val);
-  }
-}
-
-template <typename RetT, bool needSat, typename AT, typename BT,
-          typename BinaryOperation>
-inline constexpr RetT extend_binary(AT a, BT b, BinaryOperation binary_op) {
-  const int64_t extend_a = zero_or_signed_extend(a, 33);
-  const int64_t extend_b = zero_or_signed_extend(b, 33);
-  const int64_t ret = binary_op(extend_a, extend_b);
-  if constexpr (needSat)
-    return detail::clamp<int64_t>(ret, std::numeric_limits<RetT>::min(),
-                                  std::numeric_limits<RetT>::max());
-  return ret;
-}
-
-template <typename RetT, bool needSat, typename AT, typename BT, typename CT,
-          typename BinaryOperation1, typename BinaryOperation2>
-inline constexpr RetT extend_binary(AT a, BT b, CT c,
-                                    BinaryOperation1 binary_op,
-                                    BinaryOperation2 second_op) {
-  const int64_t extend_a = zero_or_signed_extend(a, 33);
-  const int64_t extend_b = zero_or_signed_extend(b, 33);
-  int64_t extend_temp =
-      zero_or_signed_extend(binary_op(extend_a, extend_b), 34);
-  if constexpr (needSat)
-    extend_temp =
-        detail::clamp<int64_t>(extend_temp, std::numeric_limits<RetT>::min(),
-                               std::numeric_limits<RetT>::max());
-  const int64_t extend_c = zero_or_signed_extend(c, 33);
-  return second_op(extend_temp, extend_c);
-}
-
-template <typename T> sycl::vec<int32_t, 2> extract_and_extend2(T a) {
-  sycl::vec<int32_t, 2> ret;
-  sycl::vec<T, 1> va{a};
-  using IntT = std::conditional_t<std::is_signed_v<T>, int16_t, uint16_t>;
-  auto v = va.template as<sycl::vec<IntT, 2>>();
-  ret[0] = zero_or_signed_extend(v[0], 17);
-  ret[1] = zero_or_signed_extend(v[1], 17);
-  return ret;
-}
-
-template <typename T> sycl::vec<int16_t, 4> extract_and_extend4(T a) {
-  sycl::vec<int16_t, 4> ret;
-  sycl::vec<T, 1> va{a};
-  using IntT = std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>;
-  auto v = va.template as<sycl::vec<IntT, 4>>();
-  ret[0] = zero_or_signed_extend(v[0], 9);
-  ret[1] = zero_or_signed_extend(v[1], 9);
-  ret[2] = zero_or_signed_extend(v[2], 9);
-  ret[3] = zero_or_signed_extend(v[3], 9);
-  return ret;
-}
-
-template <typename RetT, bool NeedSat, bool NeedAdd, typename AT, typename BT,
-          typename BinaryOperation>
-inline constexpr RetT extend_vbinary2(AT a, BT b, RetT c,
-                                      BinaryOperation binary_op) {
-  static_assert(is_int32_type<AT> && is_int32_type<BT> && is_int32_type<RetT>);
-  sycl::vec<int32_t, 2> extend_a = extract_and_extend2(a);
-  sycl::vec<int32_t, 2> extend_b = extract_and_extend2(b);
-  sycl::vec<int32_t, 2> temp{binary_op(extend_a[0], extend_b[0]),
-                             binary_op(extend_a[1], extend_b[1])};
-  using IntT = std::conditional_t<std::is_signed_v<RetT>, int16_t, uint16_t>;
-
-  if constexpr (NeedSat) {
-    int32_t min_val = 0, max_val = 0;
-    min_val = std::numeric_limits<IntT>::min();
-    max_val = std::numeric_limits<IntT>::max();
-    temp = detail::clamp(temp, sycl::vec<int32_t, 2>(min_val),
-                         sycl::vec<int32_t, 2>(max_val));
-  }
-  if constexpr (NeedAdd) {
-    return temp[0] + temp[1] + c;
-  }
-  return sycl::vec<IntT, 2>{temp[0], temp[1]}.template as<sycl::vec<RetT, 1>>();
-}
-
-template <typename RetT, bool NeedSat, bool NeedAdd, typename AT, typename BT,
-          typename BinaryOperation>
-inline constexpr RetT extend_vbinary4(AT a, BT b, RetT c,
-                                      BinaryOperation binary_op) {
-  static_assert(is_int32_type<AT> && is_int32_type<BT> && is_int32_type<RetT>);
-  sycl::vec<int16_t, 4> extend_a = extract_and_extend4(a);
-  sycl::vec<int16_t, 4> extend_b = extract_and_extend4(b);
-  sycl::vec<int16_t, 4> temp{
-      binary_op(extend_a[0], extend_b[0]), binary_op(extend_a[1], extend_b[1]),
-      binary_op(extend_a[2], extend_b[2]), binary_op(extend_a[3], extend_b[3])};
-  using IntT = std::conditional_t<std::is_signed_v<RetT>, int8_t, uint8_t>;
-
-  if constexpr (NeedSat) {
-    int16_t min_val = 0, max_val = 0;
-    min_val = std::numeric_limits<IntT>::min();
-    max_val = std::numeric_limits<IntT>::max();
-    temp = detail::clamp(temp, sycl::vec<int16_t, 4>(min_val),
-                         sycl::vec<int16_t, 4>(max_val));
-  }
-  if constexpr (NeedAdd) {
-    return temp[0] + temp[1] + temp[2] + temp[3] + c;
-  }
-
-  return sycl::vec<IntT, 4>{temp[0], temp[1], temp[2], temp[3]}
-      .template as<sycl::vec<RetT, 1>>();
-}
-
-template <typename ValueT> inline bool isnan(const ValueT a) {
-  if constexpr (std::is_same_v<ValueT, sycl::ext::oneapi::bfloat16>) {
-    static_assert(detail::support_bfloat16_math);
-    return sycl::ext::oneapi::experimental::isnan(a);
-  } else {
-    return sycl::isnan(a);
-  }
-}
-
-// FIXME(syclcompat-lib-reviewers): move bfe outside detail once perf is
-// improved & semantics understood
-/// Bitfield-extract.
-///
-/// \tparam T The type of \param source value, must be an integer.
-/// \param source The source value to extracting.
-/// \param bit_start The position to start extracting.
-/// \param num_bits The number of bits to extracting.
-template <typename T>
-inline T bfe(const T source, const uint32_t bit_start,
-             const uint32_t num_bits) {
-  static_assert(std::is_unsigned_v<T>);
-  // FIXME(syclcompat-lib-reviewers): This ternary was added to catch a case
-  // which may be undefined anyway. Consider that we are losing perf here.
-  const T mask =
-      num_bits >= std::numeric_limits<unsigned char>::digits * sizeof(T)
-          ? static_cast<T>(-1)
-          : ((static_cast<T>(1) << num_bits) - 1);
-  return (source >> bit_start) & mask;
-}
-
-} // namespace detail
-
-/// Bitfield-extract with boundary checking.
-///
-/// Extract bit field from \param source and return the zero or sign-extended
-/// result. Source \param bit_start gives the bit field starting bit position,
-/// and source \param num_bits gives the bit field length in bits.
-///
-/// The result is padded with the sign bit of the extracted field. If `num_bits`
-/// is zero, the result is zero. If the start position is beyond the msb of the
-/// input, the result is filled with the replicated sign bit of the extracted
-/// field.
-///
-/// \tparam T The type of \param source value, must be an integer.
-/// \param source The source value to extracting.
-/// \param bit_start The position to start extracting.
-/// \param num_bits The number of bits to extracting.
-template <typename T>
-inline T bfe_safe(const T source, const uint32_t bit_start,
-                  const uint32_t num_bits) {
-  static_assert(std::is_integral_v<T>);
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  if constexpr (std::is_same_v<T, int8_t> || std::is_same_v<T, int16_t> ||
-                std::is_same_v<T, int32_t>) {
-    int32_t res{};
-    asm volatile("bfe.s32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"((int32_t)source), "r"(bit_start), "r"(num_bits));
-    return res;
-  } else if constexpr (std::is_same_v<T, uint8_t> ||
-                       std::is_same_v<T, uint16_t> ||
-                       std::is_same_v<T, uint32_t>) {
-    uint32_t res{};
-    asm volatile("bfe.u32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"((uint32_t)source), "r"(bit_start), "r"(num_bits));
-    return res;
-  } else if constexpr (std::is_same_v<T, int64_t>) {
-    T res{};
-    asm volatile("bfe.s64 %0, %1, %2, %3;"
-                 : "=l"(res)
-                 : "l"(source), "r"(bit_start), "r"(num_bits));
-    return res;
-  } else if constexpr (std::is_same_v<T, uint64_t>) {
-    T res{};
-    asm volatile("bfe.u64 %0, %1, %2, %3;"
-                 : "=l"(res)
-                 : "l"(source), "r"(bit_start), "r"(num_bits));
-    return res;
-  }
-#endif
-  const uint32_t bit_width =
-      std::numeric_limits<unsigned char>::digits * sizeof(T);
-  const uint32_t pos = std::min(bit_start, bit_width);
-  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
-  if constexpr (std::is_signed_v<T>) {
-    // FIXME(syclcompat-lib-reviewers): As above, catching a case whose result
-    // is undefined and likely losing perf.
-    const T mask = len >= bit_width ? T{-1} : static_cast<T>((T{1} << len) - 1);
-
-    // Find the sign-bit, the result is padded with the sign bit of the
-    // extracted field.
-    // Note if requested num_bits==0, we return zero via sign_bit=0
-    const uint32_t sign_bit_pos = std::min(pos + len - 1, bit_width - 1);
-    const T sign_bit = num_bits != 0 && ((source >> sign_bit_pos) & 1);
-    const T sign_bit_padding = (-sign_bit & ~mask);
-    return ((source >> pos) & mask) | sign_bit_padding;
-  } else {
-    return syclcompat::detail::bfe(source, pos, len);
-  }
-}
-
-namespace detail {
-// FIXME(syclcompat-lib-reviewers): move bfi outside detail once perf is
-// improved & semantics understood
-/// Bitfield-insert.
-///
-/// \tparam T The type of \param x and \param y , must be an unsigned integer.
-/// \param x The source of the bitfield.
-/// \param y The source where bitfield is inserted.
-/// \param bit_start The position to start insertion.
-/// \param num_bits The number of bits to insertion.
-template <typename T>
-inline T bfi(const T x, const T y, const uint32_t bit_start,
-             const uint32_t num_bits) {
-  static_assert(std::is_unsigned_v<T>);
-  constexpr unsigned bit_width =
-      std::numeric_limits<unsigned char>::digits * sizeof(T);
-
-  // if bit_start > bit_width || len == 0, should return y.
-  const T ignore_bfi = static_cast<T>(bit_start > bit_width || num_bits == 0);
-  T extract_bitfield_mask = (static_cast<T>(~T{0}) >> (bit_width - num_bits))
-                            << bit_start;
-  T clean_bitfield_mask = ~extract_bitfield_mask;
-  return (y & (-ignore_bfi | clean_bitfield_mask)) |
-         (~-ignore_bfi & ((x << bit_start) & extract_bitfield_mask));
-}
-} // namespace detail
-
-/// Bitfield-insert with boundary checking.
-///
-/// Align and insert a bit field from \param x into \param y . Source \param
-/// bit_start gives the starting bit position for the insertion, and source
-/// \param num_bits gives the bit field length in bits.
-///
-/// \tparam T The type of \param x and \param y , must be an unsigned integer.
-/// \param x The source of the bitfield.
-/// \param y The source where bitfield is inserted.
-/// \param bit_start The position to start insertion.
-/// \param num_bits The number of bits to insertion.
-template <typename T>
-inline T bfi_safe(const T x, const T y, const uint32_t bit_start,
-                  const uint32_t num_bits) {
-  static_assert(std::is_unsigned_v<T>);
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  if constexpr (std::is_same_v<T, uint8_t> || std::is_same_v<T, uint16_t> ||
-                std::is_same_v<T, uint32_t>) {
-    uint32_t res{};
-    asm volatile("bfi.b32 %0, %1, %2, %3, %4;"
-                 : "=r"(res)
-                 : "r"((uint32_t)x), "r"((uint32_t)y), "r"(bit_start),
-                   "r"(num_bits));
-    return res;
-  } else if constexpr (std::is_same_v<T, uint64_t>) {
-    uint64_t res{};
-    asm volatile("bfi.b64 %0, %1, %2, %3, %4;"
-                 : "=l"(res)
-                 : "l"(x), "l"(y), "r"(bit_start), "r"(num_bits));
-    return res;
-  }
-#endif
-  constexpr unsigned bit_width =
-      std::numeric_limits<unsigned char>::digits * sizeof(T);
-  const uint32_t pos = std::min(bit_start, bit_width);
-  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
-  return syclcompat::detail::bfi(x, y, pos, len);
-}
-
-/// Emulated function for __funnelshift_l
-inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
-                                  unsigned int shift) {
-  return (sycl::upsample(high, low) << (shift & 31U)) >> 32;
-}
-
-/// Emulated function for __funnelshift_lc
-inline unsigned int funnelshift_lc(unsigned int low, unsigned int high,
-                                   unsigned int shift) {
-  return (sycl::upsample(high, low) << sycl::min(shift, 32U)) >> 32;
-}
-
-/// Emulated function for __funnelshift_r
-inline unsigned int funnelshift_r(unsigned int low, unsigned int high,
-                                  unsigned int shift) {
-  return (sycl::upsample(high, low) >> (shift & 31U)) & 0xFFFFFFFF;
-}
-
-/// Emulated function for __funnelshift_rc
-inline unsigned int funnelshift_rc(unsigned int low, unsigned int high,
-                                   unsigned int shift) {
-  return (sycl::upsample(high, low) >> sycl::min(shift, 32U)) & 0xFFFFFFFF;
-}
-
-/// Compute fast_length for variable-length array
-/// \param [in] a The array
-/// \param [in] len Length of the array
-/// \returns The computed fast_length
-inline float fast_length(const float *a, int len) {
-  switch (len) {
-  case 1:
-    return sycl::fast_length(a[0]);
-  case 2:
-    return sycl::fast_length(sycl::float2(a[0], a[1]));
-  case 3:
-    return sycl::fast_length(sycl::float3(a[0], a[1], a[2]));
-  case 4:
-    return sycl::fast_length(sycl::float4(a[0], a[1], a[2], a[3]));
-  case 0:
-    return 0;
-  default:
-    float f = 0;
-    for (int i = 0; i < len; ++i)
-      f += a[i] * a[i];
-    return sycl::sqrt(f);
-  }
-}
-
-/// Calculate the square root of the input array.
-/// \param [in] a The array pointer
-/// \param [in] len Length of the array
-/// \returns The square root
-template <typename ValueT>
-inline ValueT length(const ValueT *a, const int len) {
-  switch (len) {
-  case 1:
-    return a[0];
-  case 2:
-    return sycl::length(sycl::vec<ValueT, 2>(a[0], a[1]));
-  case 3:
-    return sycl::length(sycl::vec<ValueT, 3>(a[0], a[1], a[2]));
-  case 4:
-    return sycl::length(sycl::vec<ValueT, 4>(a[0], a[1], a[2], a[3]));
-  default:
-    ValueT ret = 0;
-    for (int i = 0; i < len; ++i)
-      ret += a[i] * a[i];
-    return sycl::sqrt(ret);
-  }
-}
-
-/// Performs comparison.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<
-    std::is_same_v<std::invoke_result_t<BinaryOperation, ValueT, ValueT>, bool>,
-    bool>
-compare(const ValueT a, const ValueT b, const BinaryOperation binary_op) {
-  return binary_op(a, b);
-}
-template <typename ValueT>
-inline std::enable_if_t<
-    std::is_same_v<std::invoke_result_t<std::not_equal_to<>, ValueT, ValueT>,
-                   bool>,
-    bool>
-compare(const ValueT a, const ValueT b, const std::not_equal_to<> binary_op) {
-  return !detail::isnan(a) && !detail::isnan(b) && binary_op(a, b);
-}
-
-/// Performs 2 element comparison.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, ValueT>
-compare(const ValueT a, const ValueT b, const BinaryOperation binary_op) {
-  return {compare(a[0], b[0], binary_op), compare(a[1], b[1], binary_op)};
-}
-
-/// Performs unordered comparison.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<
-    std::is_same_v<std::invoke_result_t<BinaryOperation, ValueT, ValueT>, bool>,
-    bool>
-unordered_compare(const ValueT a, const ValueT b,
-                  const BinaryOperation binary_op) {
-  return detail::isnan(a) || detail::isnan(b) || binary_op(a, b);
-}
-
-/// Performs 2 element unordered comparison.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, ValueT>
-unordered_compare(const ValueT a, const ValueT b,
-                  const BinaryOperation binary_op) {
-  return {unordered_compare(a[0], b[0], binary_op),
-          unordered_compare(a[1], b[1], binary_op)};
-}
-
-/// Performs 2 element comparison and return true if both results are true.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, bool>
-compare_both(const ValueT a, const ValueT b, const BinaryOperation binary_op) {
-  return compare(a[0], b[0], binary_op) && compare(a[1], b[1], binary_op);
-}
-
-/// Performs 2 element unordered comparison and return true if both results are
-/// true.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, bool>
-unordered_compare_both(const ValueT a, const ValueT b,
-                       const BinaryOperation binary_op) {
-  return unordered_compare(a[0], b[0], binary_op) &&
-         unordered_compare(a[1], b[1], binary_op);
-}
-
-/// Performs 2 elements comparison, compare result of each element is 0 (false)
-/// or 0xffff (true), returns an unsigned int by composing compare result of two
-/// elements.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, unsigned>
-compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op) {
-  // Since compare returns 0 or 1, -compare will be 0x00000000 or 0xFFFFFFFF
-  return ((-compare(a[0], b[0], binary_op)) & 0xFFFF) |
-         ((-compare(a[1], b[1], binary_op)) << 16u);
-}
-
-/// Performs 2 elements unordered comparison, compare result of each element is
-/// 0 (false) or 0xffff (true), returns an unsigned int by composing compare
-/// result of two elements.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename ValueT, class BinaryOperation>
-inline std::enable_if_t<ValueT::size() == 2, unsigned>
-unordered_compare_mask(const ValueT a, const ValueT b,
-                       const BinaryOperation binary_op) {
-  return ((-unordered_compare(a[0], b[0], binary_op)) & 0xFFFF) |
-         ((-unordered_compare(a[1], b[1], binary_op)) << 16);
-}
-
-/// Compute vectorized max for two values, with each value treated as a vector
-/// type \p S
-/// \param [in] S The type of the vector
-/// \param [in] T The type of the original values
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized max of the two values
-template <typename S, typename T> inline T vectorized_max(T a, T b) {
-  sycl::vec<T, 1> v0{a}, v1{b};
-  auto v2 = v0.template as<S>();
-  auto v3 = v1.template as<S>();
-  v2 = sycl::max(v2, v3);
-  v0 = v2.template as<sycl::vec<T, 1>>();
-  return v0;
-}
-
-/// Compute vectorized min for two values, with each value treated as a vector
-/// type \p S
-/// \param [in] S The type of the vector
-/// \param [in] T The type of the original values
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized min of the two values
-template <typename S, typename T> inline T vectorized_min(T a, T b) {
-  sycl::vec<T, 1> v0{a}, v1{b};
-  auto v2 = v0.template as<S>();
-  auto v3 = v1.template as<S>();
-  v2 = sycl::min(v2, v3);
-  v0 = v2.template as<sycl::vec<T, 1>>();
-  return v0;
-}
-
-/// Compute vectorized unary operation for a value, with the value treated as a
-/// vector type \p VecT.
-/// \tparam [in] VecT The type of the vector
-/// \tparam [in] UnaryOperation The unary operation class
-/// \param [in] a The input value
-/// \returns The vectorized unary operation value of the input value
-template <typename VecT, class UnaryOperation>
-inline unsigned vectorized_unary(unsigned a, const UnaryOperation unary_op) {
-  sycl::vec<unsigned, 1> v0{a};
-  auto v1 = v0.as<VecT>();
-  auto v2 = unary_op(v1);
-  v0 = v2.template as<sycl::vec<unsigned, 1>>();
-  return v0;
-}
-
-/// Compute vectorized absolute difference for two values without modulo
-/// overflow, with each value treated as a vector type \p VecT.
-/// \tparam [in] VecT The type of the vector
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized absolute difference of the two values
-template <typename VecT>
-inline unsigned vectorized_sum_abs_diff(unsigned a, unsigned b) {
-  sycl::vec<unsigned, 1> v0{a}, v1{b};
-  // Need convert element type to wider signed type to avoid overflow.
-  auto v2 = v0.as<VecT>().template convert<int>();
-  auto v3 = v1.as<VecT>().template convert<int>();
-  auto v4 = sycl::abs_diff(v2, v3);
-  unsigned sum = 0;
-  for (size_t i = 0; i < v4.size(); ++i) {
-    sum += v4[i];
-  }
-  return sum;
-}
-
-/// Compute vectorized isgreater for two values, with each value treated as a
-/// vector type \p S
-/// \param [in] S The type of the vector
-/// \param [in] T The type of the original values
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized greater than of the two values
-template <typename S, typename T> inline T vectorized_isgreater(T a, T b) {
-  sycl::vec<T, 1> v0{a}, v1{b};
-  auto v2 = v0.template as<S>();
-  auto v3 = v1.template as<S>();
-  auto v4 = sycl::isgreater(v2, v3);
-  v0 = v4.template as<sycl::vec<T, 1>>();
-  return v0;
-}
-
-/// Compute vectorized isgreater for two unsigned int values, with each value
-/// treated as a vector of two unsigned short
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized greater than of the two values
-template <>
-inline unsigned vectorized_isgreater<sycl::ushort2, unsigned>(unsigned a,
-                                                              unsigned b) {
-  sycl::vec<unsigned, 1> v0{a}, v1{b};
-  auto v2 = v0.template as<sycl::ushort2>();
-  auto v3 = v1.template as<sycl::ushort2>();
-  sycl::ushort2 v4;
-  v4[0] = v2[0] > v3[0];
-  v4[1] = v2[1] > v3[1];
-  v0 = v4.template as<sycl::vec<unsigned, 1>>();
-  return v0;
-}
-
-/// Returns min(max(val, min_val), max_val)
-/// \param [in] val The input value
-/// \param [in] min_val The minimum value
-/// \param [in] max_val The maximum value
-/// \returns the value between min_val and max_val
-template <typename ValueT>
-inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) {
-  return detail::clamp(val, min_val, max_val);
-}
-
-/// Determine whether 2 element value is NaN.
-/// \param [in] a The input value
-/// \returns the comparison result
-template <typename ValueT>
-inline std::enable_if_t<ValueT::size() == 2, ValueT> isnan(const ValueT a) {
-  return {detail::isnan(a[0]), detail::isnan(a[1])};
-}
-
-/// cbrt function wrapper.
-template <typename ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<ValueT, sycl::half>,
-                        ValueT>
-cbrt(ValueT val) {
-  return sycl::cbrt(static_cast<ValueT>(val));
-}
-
-// min/max function overloads.
-// For floating-point types, `float` or `double` arguments are acceptable.
-// For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
-// `std::int64_t` type arguments are acceptable.
-// sycl::half supported as well, and sycl::ext::oneapi::bfloat16 if available.
-template <typename ValueT, typename ValueU>
-inline std::enable_if_t<std::is_integral_v<ValueT> &&
-                            std::is_integral_v<ValueU>,
-                        std::common_type_t<ValueT, ValueU>>
-min(ValueT a, ValueU b) {
-  return sycl::min(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                   static_cast<std::common_type_t<ValueT, ValueU>>(b));
-}
-
-template <typename ValueT, typename ValueU>
-inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT> &&
-                            syclcompat::is_floating_point_v<ValueU>,
-                        std::common_type_t<ValueT, ValueU>>
-min(ValueT a, ValueU b) {
-  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
-                               sycl::ext::oneapi::bfloat16>) {
-    static_assert(detail::support_bfloat16_math);
-    return sycl::ext::oneapi::experimental::fmin(
-        static_cast<std::common_type_t<ValueT, ValueU>>(a),
-        static_cast<std::common_type_t<ValueT, ValueU>>(b));
-  } else {
-    return sycl::fmin(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
-  }
-}
-
-template <typename ValueT, typename ValueU>
-inline std::enable_if_t<std::is_integral_v<ValueT> &&
-                            std::is_integral_v<ValueU>,
-                        std::common_type_t<ValueT, ValueU>>
-max(ValueT a, ValueU b) {
-  return sycl::max(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                   static_cast<std::common_type_t<ValueT, ValueU>>(b));
-}
-template <typename ValueT, typename ValueU>
-inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT> &&
-                            syclcompat::is_floating_point_v<ValueU>,
-                        std::common_type_t<ValueT, ValueU>>
-max(ValueT a, ValueU b) {
-  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
-                               sycl::ext::oneapi::bfloat16>) {
-    static_assert(detail::support_bfloat16_math);
-    return sycl::ext::oneapi::experimental::fmax(
-        static_cast<std::common_type_t<ValueT, ValueU>>(a),
-        static_cast<std::common_type_t<ValueT, ValueU>>(b));
-  } else {
-    return sycl::fmax(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
-  }
-}
-
-/// Performs 2 elements comparison and returns the bigger one. If either of
-/// inputs is NaN, then return NaN.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns the bigger value
-template <typename ValueT, typename ValueU>
-inline std::common_type_t<ValueT, ValueU> fmax_nan(const ValueT a,
-                                                   const ValueU b) {
-  if (detail::isnan(a) || detail::isnan(b))
-    return NAN;
-  return syclcompat::max(a, b);
-}
-
-template <typename ValueT, typename ValueU>
-inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
-fmax_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b) {
-  return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
-}
-
-template <typename ValueT, typename ValueU>
-inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
-fmax_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b) {
-  return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
-}
-
-/// Performs 2 elements comparison and returns the smaller one. If either of
-/// inputs is NaN, then return NaN.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns the smaller value
-template <typename ValueT, typename ValueU>
-inline std::common_type_t<ValueT, ValueU> fmin_nan(const ValueT a,
-                                                   const ValueU b) {
-  if (detail::isnan(a) || detail::isnan(b))
-    return NAN;
-  return syclcompat::min(a,b);
-}
-
-template <typename ValueT, typename ValueU>
-inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
-fmin_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b) {
-  return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
-}
-
-template <typename ValueT, typename ValueU>
-inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
-fmin_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b) {
-  return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
-}
-
-// pow functions overload.
-inline float pow(const float a, const int b) { return sycl::pown(a, b); }
-inline double pow(const double a, const int b) { return sycl::pown(a, b); }
-
-template <typename ValueT, typename ValueU>
-inline typename std::enable_if_t<std::is_floating_point_v<ValueT>, ValueT>
-pow(const ValueT a, const ValueU b) {
-  return sycl::pow(a, static_cast<ValueT>(b));
-}
-// TODO(syclcompat-lib-reviewers)  calling pow with non-floating point values
-// is currently defaulting to double, which fails on devices without
-// aspect::fp64. This has to be properly documented, and maybe changed to
-// support all devices.
-template <typename ValueT, typename ValueU>
-inline typename std::enable_if_t<!std::is_floating_point_v<ValueT>, double>
-pow(const ValueT a, const ValueU b) {
-  return sycl::pow(static_cast<double>(a), static_cast<double>(b));
-}
-
-/// Performs relu saturation.
-/// \param [in] a The input value
-/// \returns the relu saturation result
-template <typename ValueT> inline ValueT relu(const ValueT a) {
-  if constexpr (syclcompat::is_floating_point_v<ValueT>)
-    if (detail::isnan(a))
-      return a;
-  if (a < ValueT(0))
-    return ValueT(0);
-  return a;
-}
-template <class ValueT, int NumElements>
-inline sycl::vec<ValueT, NumElements>
-relu(const sycl::vec<ValueT, NumElements> a) {
-  sycl::vec<ValueT, NumElements> ret;
-  for (int i = 0; i < NumElements; ++i)
-    ret[i] = relu(a[i]);
-  return ret;
-}
-template <class ValueT>
-inline sycl::marray<ValueT, 2> relu(const sycl::marray<ValueT, 2> a) {
-  return {relu(a[0]), relu(a[1])};
-}
-
-/// Computes the multiplication of two complex numbers.
-/// \tparam T Complex element type
-/// \param [in] x The first input complex number
-/// \param [in] y The second input complex number
-/// \returns The result
-template <typename T>
-sycl::vec<T, 2> cmul(sycl::vec<T, 2> x, sycl::vec<T, 2> y) {
-  sycl::ext::oneapi::experimental::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
-  t1 = t1 * t2;
-  return sycl::vec<T, 2>(t1.real(), t1.imag());
-}
-
-/// Computes the division of two complex numbers.
-/// \tparam T Complex element type
-/// \param [in] x The first input complex number
-/// \param [in] y The second input complex number
-/// \returns The result
-template <typename T>
-sycl::vec<T, 2> cdiv(sycl::vec<T, 2> x, sycl::vec<T, 2> y) {
-  sycl::ext::oneapi::experimental::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
-  t1 = t1 / t2;
-  return sycl::vec<T, 2>(t1.real(), t1.imag());
-}
-
-/// Computes the magnitude of a complex number.
-/// \tparam T Complex element type
-/// \param [in] x The input complex number
-/// \returns The result
-template <typename T> T cabs(sycl::vec<T, 2> x) {
-  sycl::ext::oneapi::experimental::complex<T> t(x[0], x[1]);
-  return sycl::ext::oneapi::experimental::abs(t);
-}
-
-/// Computes the complex conjugate of a complex number.
-/// \tparam T Complex element type
-/// \param [in] x The input complex number
-/// \returns The result
-template <typename T> sycl::vec<T, 2> conj(sycl::vec<T, 2> x) {
-  sycl::ext::oneapi::experimental::complex<T> t(x[0], x[1]);
-  t = conj(t);
-  return sycl::vec<T, 2>(t.real(), t.imag());
-}
-
-/// Performs complex number multiply addition.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns the operation result
-template <typename ValueT>
-inline sycl::vec<ValueT, 2> cmul_add(const sycl::vec<ValueT, 2> a,
-                                     const sycl::vec<ValueT, 2> b,
-                                     const sycl::vec<ValueT, 2> c) {
-  sycl::ext::oneapi::experimental::complex<ValueT> t(a[0], a[1]);
-  sycl::ext::oneapi::experimental::complex<ValueT> u(b[0], b[1]);
-  sycl::ext::oneapi::experimental::complex<ValueT> v(c[0], c[1]);
-  t = t * u + v;
-  return sycl::vec<ValueT, 2>{t.real(), t.imag()};
-}
-template <typename ValueT>
-inline sycl::marray<ValueT, 2> cmul_add(const sycl::marray<ValueT, 2> a,
-                                        const sycl::marray<ValueT, 2> b,
-                                        const sycl::marray<ValueT, 2> c) {
-  sycl::ext::oneapi::experimental::complex<ValueT> t(a[0], a[1]);
-  sycl::ext::oneapi::experimental::complex<ValueT> u(b[0], b[1]);
-  sycl::ext::oneapi::experimental::complex<ValueT> v(c[0], c[1]);
-  t = t * u + v;
-  return sycl::marray<ValueT, 2>{t.real(), t.imag()};
-}
-
-/// A sycl::abs wrapper functors.
-struct abs {
-  template <typename ValueT> auto operator()(const ValueT x) const {
-    return sycl::abs(x);
-  }
-};
-
-/// A sycl::abs_diff wrapper functors.
-struct abs_diff {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const {
-    return sycl::abs_diff(x, y);
-  }
-};
-
-/// A sycl::add_sat wrapper functors.
-struct add_sat {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const {
-    return sycl::add_sat(x, y);
-  }
-};
-
-/// A sycl::rhadd wrapper functors.
-struct rhadd {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const {
-    return sycl::rhadd(x, y);
-  }
-};
-
-/// A sycl::hadd wrapper functors.
-struct hadd {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const {
-    return sycl::hadd(x, y);
-  }
-};
-
-/// A sycl::max wrapper functors.
-struct maximum {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const {
-    return sycl::max(x, y);
-  }
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y, bool *pred) const {
-    return (x >= y) ? ((*pred = true), x) : ((*pred = false), y);
-  }
-};
-
-/// A sycl::min wrapper functors.
-struct minimum {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const {
-    return sycl::min(x, y);
-  }
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y, bool *pred) const {
-    return (x <= y) ? ((*pred = true), x) : ((*pred = false), y);
-  }
-};
-
-/// A sycl::sub_sat wrapper functors.
-struct sub_sat {
-  template <typename ValueT>
-  auto operator()(const ValueT x, const ValueT y) const {
-    return sycl::sub_sat(x, y);
-  }
-};
-
-namespace detail {
-struct shift_left {
-  template <typename T>
-  auto operator()(const T x, const uint32_t offset) const {
-    return x << offset;
-  }
-};
-
-struct shift_right {
-  template <typename T>
-  auto operator()(const T x, const uint32_t offset) const {
-    return x >> offset;
-  }
-};
-
-struct average {
-  template <typename T> auto operator()(const T x, const T y) const {
-    return (x + y + (x + y >= 0)) >> 1;
-  }
-};
-
-} // namespace detail
-
-/// Compute vectorized binary operation value for two/four values, with each
-/// treated as a vector type \p VecT.
-/// \tparam [in] VecT The type of the vector
-/// \tparam [in] BinaryOperation The binary operation class
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op The operation to do with the two values
-/// \param [in] need_relu Whether the result need relu saturation
-/// \returns The vectorized binary operation value of the two values
-template <typename VecT, class BinaryOperation>
-inline unsigned vectorized_binary(unsigned a, unsigned b,
-                                  const BinaryOperation binary_op,
-                                  [[maybe_unused]] bool need_relu = false) {
-  sycl::vec<unsigned, 1> v0{a}, v1{b};
-  auto v2 = v0.as<VecT>();
-  auto v3 = v1.as<VecT>();
-  auto v4 =
-      detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
-  if (need_relu)
-    v4 = relu(v4);
-  v0 = v4.template as<sycl::vec<unsigned, 1>>();
-  return v0;
-}
-
-/// Compute two vectorized binary operation value with pred for three values,
-/// with each value treated as a 2 \p T type elements vector type.
-///
-/// \tparam [in] VecT The type of the vector
-/// \tparam [in] BinaryOperation1 The first binary operation class
-/// \tparam [in] BinaryOperation2 The second binary operation class
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] binary_op1 The first operation to do with the first two values
-/// \param [in] binary_op2 The second operation to do with the third values
-/// \param [in] need_relu Whether the result need relu saturation
-/// \returns The two vectorized binary operation value of the three values
-template <typename VecT, typename BinaryOperation1, typename BinaryOperation2>
-inline unsigned vectorized_ternary(unsigned a, unsigned b, unsigned c,
-                                   const BinaryOperation1 binary_op1,
-                                   const BinaryOperation2 binary_op2,
-                                   bool need_relu = false) {
-  const auto v1 = sycl::vec<unsigned, 1>(a).as<VecT>();
-  const auto v2 = sycl::vec<unsigned, 1>(b).as<VecT>();
-  const auto v3 = sycl::vec<unsigned, 1>(c).as<VecT>();
-  auto v4 =
-      detail::vectorized_binary<VecT, BinaryOperation1>()(v1, v2, binary_op1);
-  v4 = detail::vectorized_binary<VecT, BinaryOperation2>()(v4, v3, binary_op2);
-  if (need_relu)
-    v4 = relu(v4);
-  return v4.template as<sycl::vec<unsigned, 1>>();
-}
-
-/// Compute vectorized binary operation value with pred for two values, with
-/// each value treated as a 2 \p T type elements vector type.
-///
-/// \tparam [in] VecT The type of the vector
-/// \tparam [in] BinaryOperation The binary operation class
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op The operation with pred to do with the two values
-/// \param [out] pred_hi The pred pointer that pass into high halfword operation
-/// \param [out] pred_lo The pred pointer that pass into low halfword operation
-/// \returns The vectorized binary operation value of the two values
-template <typename VecT, typename BinaryOperation>
-inline unsigned vectorized_binary_with_pred(unsigned a, unsigned b,
-                                            const BinaryOperation binary_op,
-                                            bool *pred_hi, bool *pred_lo) {
-  auto v1 = sycl::vec<unsigned, 1>(a).as<VecT>();
-  auto v2 = sycl::vec<unsigned, 1>(b).as<VecT>();
-  VecT ret;
-  ret[0] = binary_op(v1[0], v2[0], pred_lo);
-  ret[1] = binary_op(v1[1], v2[1], pred_hi);
-  return ret.template as<sycl::vec<unsigned, 1>>();
-}
-
-template <typename T1, typename T2>
-using dot_product_acc_t =
-    std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-                       uint32_t, int32_t>;
-
-namespace detail {
-
-template <typename T> sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) {
-  return sycl::vec<T, 1>(val)
-      .template as<sycl::vec<
-          std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
-      .template convert<T>();
-}
-
-template <typename T> sycl::vec<T, 2> extract_and_sign_or_zero_extend2(T val) {
-  return sycl::vec<T, 1>(val)
-      .template as<sycl::vec<
-          std::conditional_t<std::is_signed_v<T>, int16_t, uint16_t>, 2>>()
-      .template convert<T>();
-}
-
-} // namespace detail
-
-/// Two-way dot product-accumulate. Calculate and return integer_vector2(
-/// \param a) dot product integer_vector2(low16_bit( \param b)) + \param c
-///
-/// \tparam [in] T1 The type of first value.
-/// \tparam [in] T2 The type of second value.
-/// \param [in] a The first value.
-/// \param [in] b The second value.
-/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
-/// uint32_t else has type int32_t.
-/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
-/// result.
-template <typename T1, typename T2>
-inline dot_product_acc_t<T1, T2> dp2a_lo(T1 a, T2 b,
-                                         dot_product_acc_t<T1, T2> c) {
-  static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
-                "[SYCLcompat] dp2a_lo expects 32-bit integers as operands.");
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
-    defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610
-  dot_product_acc_t<T1, T2> res;
-  if constexpr (std::is_signed_v<T1> && std::is_signed_v<T2>) {
-    asm volatile("dp2a.lo.s32.s32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  } else if constexpr (std::is_signed_v<T1> && std::is_unsigned_v<T2>) {
-    asm volatile("dp2a.lo.s32.u32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  } else if constexpr (std::is_unsigned_v<T1> && std::is_signed_v<T2>) {
-    asm volatile("dp2a.lo.u32.s32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  } else {
-    asm volatile("dp2a.lo.u32.u32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  }
-  return res;
-#else
-  dot_product_acc_t<T1, T2> res = c;
-  auto va = detail::extract_and_sign_or_zero_extend2(a);
-  auto vb = detail::extract_and_sign_or_zero_extend4(b);
-  res += va[0] * vb[0];
-  res += va[1] * vb[1];
-  return res;
-#endif
-}
-
-/// Two-way dot product-accumulate. Calculate and return integer_vector2(
-/// \param a) dot product integer_vector2(high_16bit( \param b)) + \param c
-///
-/// \tparam [in] T1 The type of first value.
-/// \tparam [in] T2 The type of second value.
-/// \param [in] a The first value.
-/// \param [in] b The second value.
-/// \param [in] c The third value. uint32_t if both T1 and T1 are
-/// uint32_t else has type int32_t.
-/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
-/// result.
-template <typename T1, typename T2>
-inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
-                                         dot_product_acc_t<T1, T2> c) {
-  static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
-                "[SYCLcompat] dp2a_hi expects 32-bit integers as operands.");
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
-    defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610
-  dot_product_acc_t<T1, T2> res;
-  if constexpr (std::is_signed_v<T1> && std::is_signed_v<T2>) {
-    asm volatile("dp2a.hi.s32.s32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  } else if constexpr (std::is_signed_v<T1> && std::is_unsigned_v<T2>) {
-    asm volatile("dp2a.hi.s32.u32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  } else if constexpr (std::is_unsigned_v<T1> && std::is_signed_v<T2>) {
-    asm volatile("dp2a.hi.u32.s32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  } else {
-    asm volatile("dp2a.hi.u32.u32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  }
-  return res;
-#else
-  dot_product_acc_t<T1, T2> res = c;
-  auto va = detail::extract_and_sign_or_zero_extend2(a);
-  auto vb = detail::extract_and_sign_or_zero_extend4(b);
-  res += va[0] * vb[2];
-  res += va[1] * vb[3];
-  return res;
-#endif
-}
-
-/// Four-way byte dot product-accumulate. Calculate and return integer_vector4(
-/// \param a) dot product integer_vector4( \param b)  + \param c
-///
-/// \tparam [in] T1 The type of first value.
-/// \tparam [in] T2 The type of second value.
-/// \param [in] a The first value.
-/// \param [in] b The second value.
-/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
-/// uint32_t else has type int32_t.
-/// \return Four-way byte dot product which is accumulated in 32-bit result.
-template <typename T1, typename T2>
-inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b, dot_product_acc_t<T1, T2> c) {
-  static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
-                "[SYCLcompat] dp4a expects 32-bit integers as operands.");
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
-    defined(__SYCL_CUDA_ARCH__) && __SYCL_CUDA_ARCH__ >= 610
-  dot_product_acc_t<T1, T2> res;
-  if constexpr (std::is_signed_v<T1> && std::is_signed_v<T2>) {
-    asm volatile("dp4a.s32.s32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  } else if constexpr (std::is_signed_v<T1> && std::is_unsigned_v<T2>) {
-    asm volatile("dp4a.s32.u32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  } else if constexpr (std::is_unsigned_v<T1> && std::is_signed_v<T2>) {
-    asm volatile("dp4a.u32.s32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  } else {
-    asm volatile("dp4a.u32.u32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"(a), "r"(b), "r"(c));
-  }
-  return res;
-#else
-  dot_product_acc_t<T1, T2> res = c;
-  auto va = detail::extract_and_sign_or_zero_extend4(a);
-  auto vb = detail::extract_and_sign_or_zero_extend4(b);
-  res += va[0] * vb[0];
-  res += va[1] * vb[1];
-  res += va[2] * vb[2];
-  res += va[3] * vb[3];
-  return res;
-#endif
-}
-
-/// Extend \p a and \p b to 33 bit and add them.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend addition of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_add(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, std::plus());
-}
-
-/// Extend Inputs to 33 bit, add \p a, \p b, then do \p second_op with \p c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend addition of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_add(AT a, BT b, CT c, BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, std::plus(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and add them with saturation.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend addition of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_add_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, std::plus());
-}
-
-/// Extend Inputs to 33 bit, add \p a, \p b with saturation, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend addition of \p a, \p b with saturation and \p second_op
-/// with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_add_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, std::plus(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and minus them.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend subtraction of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_sub(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, std::minus());
-}
-
-/// Extend Inputs to 33 bit, minus \p a, \p b, then do \p second_op with \p c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend subtraction of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_sub(AT a, BT b, CT c, BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, std::minus(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and minus them with saturation.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend subtraction of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_sub_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, std::minus());
-}
-
-/// Extend Inputs to 33 bit, minus \p a, \p b with saturation, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend subtraction of \p a, \p b with saturation and \p
-/// second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_sub_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, std::minus(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and do abs_diff.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend abs_diff of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_absdiff(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, abs_diff());
-}
-
-/// Extend Inputs to 33 bit, abs_diff \p a, \p b, then do \p second_op with \p
-/// c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend abs_diff of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_absdiff(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, abs_diff(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and do abs_diff with saturation.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend abs_diff of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_absdiff_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, abs_diff());
-}
-
-/// Extend Inputs to 33 bit, abs_diff \p a, \p b with saturation, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend abs_diff of \p a, \p b with saturation and \p
-/// second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_absdiff_sat(AT a, BT b, CT c,
-                                         BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, abs_diff(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return smaller one.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The smaller one of the two extended values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_min(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, minimum());
-}
-
-/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The smaller one of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_min(AT a, BT b, CT c, BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, minimum(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return smaller one with saturation.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The smaller one of the two extended values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_min_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, minimum());
-}
-
-/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b with saturation,
-/// then do \p second_op with \p c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The smaller one of \p a, \p b with saturation and \p
-/// second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_min_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, minimum(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return bigger one.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The bigger one of the two extended values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_max(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, maximum());
-}
-
-/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The bigger one of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_max(AT a, BT b, CT c, BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, maximum(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return bigger one with saturation.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The bigger one of the two extended values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_max_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, maximum());
-}
-
-/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b with saturation,
-/// then do \p second_op with \p c.
-/// \tparam [in] RetT The type of the return value
-/// \tparam [in] AT The type of the first value
-/// \tparam [in] BT The type of the second value
-/// \tparam [in] CT The type of the third value
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The bigger one of \p a, \p b with saturation and \p
-/// second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_max_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, maximum(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return a << clamp(b, 0, 32).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \returns a << clamp(b, 0, 32)
-template <typename RetT, typename T>
-inline constexpr RetT extend_shl_clamp(T a, uint32_t b) {
-  return detail::extend_binary<RetT, false>(a, sycl::clamp(b, 0u, 32u),
-                                            detail::shift_left());
-}
-
-/// Extend Inputs to 33 bit, and return second_op(a << clamp(b, 0, 32), c).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \param [in] c The value to merge
-/// \param [in] second_op The operation to do with the third value
-/// \returns second_op(a << clamp(b, 0, 32), c)
-template <typename RetT, typename T, typename BinaryOperation>
-inline constexpr RetT extend_shl_clamp(T a, uint32_t b, uint32_t c,
-                                       BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, sycl::clamp(b, 0u, 32u), c,
-                                            detail::shift_left(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return sat(a << clamp(b, 0, 32)).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \returns sat(a << clamp(b, 0, 32))
-template <typename RetT, typename T>
-inline constexpr RetT extend_shl_sat_clamp(T a, uint32_t b) {
-  return detail::extend_binary<RetT, true>(a, sycl::clamp(b, 0u, 32u),
-                                           detail::shift_left());
-}
-
-/// Extend Inputs to 33 bit, and return second_op(sat(a << clamp(b, 0, 32)), c).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \param [in] c The value to merge
-/// \param [in] second_op The operation to do with the third value
-/// \returns second_op(sat(a << clamp(b, 0, 32)), c)
-template <typename RetT, typename T, typename BinaryOperation>
-inline constexpr RetT extend_shl_sat_clamp(T a, uint32_t b, uint32_t c,
-                                           BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, sycl::clamp(b, 0u, 32u), c,
-                                           detail::shift_left(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return a << (b & 0x1F).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \returns a << (b & 0x1F)
-template <typename RetT, typename T>
-inline constexpr RetT extend_shl_wrap(T a, uint32_t b) {
-  return detail::extend_binary<RetT, false>(a, b & 0x1F, detail::shift_left());
-}
-
-/// Extend Inputs to 33 bit, and return second_op(a << (b & 0x1F), c).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \param [in] c The value to merge
-/// \param [in] second_op The operation to do with the third value
-/// \returns second_op(a << (b & 0x1F), c)
-template <typename RetT, typename T, typename BinaryOperation>
-inline constexpr RetT extend_shl_wrap(T a, uint32_t b, uint32_t c,
-                                      BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b & 0x1F, c,
-                                            detail::shift_left(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return sat(a << (b & 0x1F)).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \returns sat(a << (b & 0x1F))
-template <typename RetT, typename T>
-inline constexpr RetT extend_shl_sat_wrap(T a, uint32_t b) {
-  return detail::extend_binary<RetT, true>(a, b & 0x1F, detail::shift_left());
-}
-
-/// Extend Inputs to 33 bit, and return second_op(sat(a << (b & 0x1F)), c).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \param [in] c The value to merge
-/// \param [in] second_op The operation to do with the third value
-/// \returns second_op(sat(a << (b & 0x1F)), c)
-template <typename RetT, typename T, typename BinaryOperation>
-inline constexpr RetT extend_shl_sat_wrap(T a, uint32_t b, uint32_t c,
-                                          BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b & 0x1F, c, detail::shift_left(),
-                                           second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return a >> clamp(b, 0, 32).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \returns a >> clamp(b, 0, 32)
-template <typename RetT, typename T>
-inline constexpr RetT extend_shr_clamp(T a, uint32_t b) {
-  return detail::extend_binary<RetT, false>(a, sycl::clamp(b, 0u, 32u),
-                                            detail::shift_right());
-}
-
-/// Extend Inputs to 33 bit, and return second_op(a >> clamp(b, 0, 32), c).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \param [in] c The value to merge
-/// \param [in] second_op The operation to do with the third value
-/// \returns second_op(a >> clamp(b, 0, 32), c)
-template <typename RetT, typename T, typename BinaryOperation>
-inline constexpr RetT extend_shr_clamp(T a, uint32_t b, uint32_t c,
-                                       BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, sycl::clamp(b, 0u, 32u), c,
-                                            detail::shift_right(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return sat(a >> clamp(b, 0, 32)).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \returns sat(a >> clamp(b, 0, 32))
-template <typename RetT, typename T>
-inline constexpr RetT extend_shr_sat_clamp(T a, uint32_t b) {
-  return detail::extend_binary<RetT, true>(a, sycl::clamp(b, 0u, 32u),
-                                           detail::shift_right());
-}
-
-/// Extend Inputs to 33 bit, and return second_op(sat(a >> clamp(b, 0, 32)), c).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \param [in] c The value to merge
-/// \param [in] second_op The operation to do with the third value
-/// \returns second_op(sat(a >> clamp(b, 0, 32)), c)
-template <typename RetT, typename T, typename BinaryOperation>
-inline constexpr RetT extend_shr_sat_clamp(T a, uint32_t b, uint32_t c,
-                                           BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, sycl::clamp(b, 0u, 32u), c,
-                                           detail::shift_right(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return a >> (b & 0x1F).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \returns a >> (b & 0x1F)
-template <typename RetT, typename T>
-inline constexpr RetT extend_shr_wrap(T a, uint32_t b) {
-  return detail::extend_binary<RetT, false>(a, b & 0x1F, detail::shift_right());
-}
-
-/// Extend Inputs to 33 bit, and return second_op(a >> (b & 0x1F), c).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \param [in] c The value to merge
-/// \param [in] second_op The operation to do with the third value
-/// \returns second_op(a >> (b & 0x1F), c)
-template <typename RetT, typename T, typename BinaryOperation>
-inline constexpr RetT extend_shr_wrap(T a, uint32_t b, uint32_t c,
-                                      BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b & 0x1F, c,
-                                            detail::shift_right(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return sat(a >> (b & 0x1F)).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \returns sat(a >> (b & 0x1F))
-template <typename RetT, typename T>
-inline constexpr RetT extend_shr_sat_wrap(T a, uint32_t b) {
-  return detail::extend_binary<RetT, true>(a, b & 0x1F, detail::shift_right());
-}
-
-/// Extend Inputs to 33 bit, and return second_op(sat(a >> (b & 0x1F)), c).
-/// \param [in] a The source value
-/// \param [in] b The offset to shift
-/// \param [in] c The value to merge
-/// \param [in] second_op The operation to do with the third value
-/// \returns second_op(sat(a >> (b & 0x1F)), c)
-template <typename RetT, typename T, typename BinaryOperation>
-inline constexpr RetT extend_shr_sat_wrap(T a, uint32_t b, uint32_t c,
-                                          BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b & 0x1F, c,
-                                           detail::shift_right(), second_op);
-}
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a
-/// 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, std::plus());
-}
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized addition of the two
-/// values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, std::plus());
-}
-
-/// Compute vectorized addition of \p a and \p b with saturation, with each
-/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, std::plus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, std::minus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 2 elements vector type and extend each element to 17 bit. Then add each
-/// half of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized subtraction of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, std::minus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b with saturation, with each
-/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, std::minus());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized abs_diff of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
-/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, minimum());
-}
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized minimum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, minimum());
-}
-
-/// Compute vectorized minimum of \p a and \p b with saturation, with each value
-/// treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, minimum());
-}
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, maximum());
-}
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized maximum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, maximum());
-}
-
-/// Compute vectorized maximum of \p a and \p b with saturation, with each value
-/// treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, maximum());
-}
-
-/// Compute vectorized average of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized average of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c,
-                                                     detail::average());
-}
-
-/// Compute vectorized average of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend average maximum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, detail::average());
-}
-
-/// Compute vectorized average of \p a and \p b with saturation, with each value
-/// treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized average of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, detail::average());
-}
-
-/// Extend \p a and \p b to 33 bit and vectorized compare input values using
-/// specified comparison \p cmp .
-///
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the compare operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] cmp The comparsion operator
-/// \returns The comparison result of the two extended values.
-template <typename AT, typename BT, typename BinaryOperation>
-inline constexpr unsigned extend_vcompare2(AT a, BT b, BinaryOperation cmp) {
-  return detail::extend_vbinary2<unsigned, false, false>(a, b, 0, cmp);
-}
-
-/// Extend Inputs to 33 bit, and vectorized compare input values using specified
-/// comparison \p cmp , then add the result with \p c .
-///
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the compare operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] cmp The comparsion operator
-/// \returns The comparison result of the two extended values, and add the
-/// result with \p c .
-template <typename AT, typename BT, typename BinaryOperation>
-inline constexpr unsigned extend_vcompare2_add(AT a, BT b, unsigned c,
-                                               BinaryOperation cmp) {
-  return detail::extend_vbinary2<unsigned, false, true>(a, b, c, cmp);
-}
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a
-/// 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, std::plus());
-}
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized addition of the two
-/// values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, std::plus());
-}
-
-/// Compute vectorized addition of \p a and \p b with saturation, with each
-/// value treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, std::plus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, std::minus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 4 elements vector type and extend each element to 9 bit. Then add each
-/// half of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized subtraction of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, std::minus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b with saturation, with each
-/// value treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, std::minus());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized abs_diff of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
-/// value treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, minimum());
-}
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized minimum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, minimum());
-}
-
-/// Compute vectorized minimum of \p a and \p b with saturation, with each value
-/// treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, minimum());
-}
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, maximum());
-}
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized maximum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, maximum());
-}
-
-/// Compute vectorized maximum of \p a and \p b with saturation, with each value
-/// treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, maximum());
-}
-
-/// Compute vectorized average of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized average of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c,
-                                                     detail::average());
-}
-
-/// Compute vectorized average of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized average of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, detail::average());
-}
-
-/// Compute vectorized average of \p a and \p b with saturation, with each value
-/// treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized average of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vavrg4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, detail::average());
-}
-
-/// Extend \p a and \p b to 33 bit and vectorized compare input values using
-/// specified comparison \p cmp .
-///
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the compare operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] cmp The comparsion operator
-/// \returns The comparison result of the two extended values.
-template <typename AT, typename BT, typename BinaryOperation>
-inline constexpr unsigned extend_vcompare4(AT a, BT b, BinaryOperation cmp) {
-  return detail::extend_vbinary4<unsigned, false, false>(a, b, 0, cmp);
-}
-
-/// Extend Inputs to 33 bit, and vectorized compare input values using specified
-/// comparison \p cmp , then add the result with \p c .
-///
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the compare operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] cmp The comparsion operator
-/// \returns The comparison result of the two extended values, and add the
-/// result with \p c .
-template <typename AT, typename BT, typename BinaryOperation>
-inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c,
-                                               BinaryOperation cmp) {
-  return detail::extend_vbinary4<unsigned, false, true>(a, b, c, cmp);
-}
-
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp
deleted file mode 100644
index 7fc21fec8d2d4..0000000000000
--- a/sycl/include/syclcompat/memory.hpp
+++ /dev/null
@@ -1,1883 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  memory.hpp
- *
- *  Description:
- *    memory functionality for the SYCL compatibility extension
- **************************************************************************/
-
-// The original source was under the license below:
-//==---- memory.hpp -------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <map>
-#include <mutex>
-#include <thread>
-#include <type_traits>
-#include <unordered_map>
-#include <utility>
-
-#include <sycl/builtins.hpp>
-#include <sycl/ext/oneapi/free_function_queries.hpp>
-#include <sycl/ext/oneapi/group_local_memory.hpp>
-#include <sycl/group.hpp>
-#include <sycl/usm.hpp>
-
-#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
-#include <sycl/ext/intel/experimental/usm_properties.hpp>
-#endif
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/traits.hpp>
-#include <syclcompat/defs.hpp>
-
-#if defined(__linux__)
-#include <sys/mman.h>
-#elif defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#else
-#error "Only support Windows and Linux."
-#endif
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-
-template <typename AllocT>
-#ifdef __SYCL_DEVICE_ONLY__
-[[__sycl_detail__::add_ir_attributes_function("sycl-forceinline", true)]]
-#endif
-__SYCL_ALWAYS_INLINE auto *local_mem() {
-  sycl::multi_ptr<AllocT, sycl::access::address_space::local_space>
-      As_multi_ptr =
-          sycl::ext::oneapi::group_local_memory_for_overwrite<AllocT>(
-              sycl::ext::oneapi::this_work_item::get_work_group<3>());
-  auto *As = *As_multi_ptr;
-  return As;
-}
-
-namespace detail {
-enum memcpy_direction {
-  host_to_host,
-  host_to_device,
-  device_to_host,
-  device_to_device,
-  automatic
-};
-} // namespace detail
-
-template <typename T>
-__syclcompat_inline__
-    std::enable_if_t<std::is_same_v<T, uint32_t> || std::is_same_v<T, size_t>,
-                     T>
-    ptr_to_int(void *ptr) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  if constexpr (std::is_same_v<T, uint32_t>) {
-    return (intptr_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
-  } else {
-    return (size_t)(sycl::decorated_local_ptr<const void>::pointer)ptr;
-  }
-#else
-  throw sycl::exception(make_error_code(sycl::errc::runtime),
-                        "ptr_to_int is only supported on Nvidia devices.");
-#endif
-}
-
-enum class memory_region {
-  global = 0, // device global memory
-  constant,   // device read-only memory
-  local,      // device local memory
-  usm_shared, // memory which can be accessed by host and device
-};
-
-using byte_t = uint8_t;
-
-/// Buffer type to be used in Memory Management runtime.
-typedef sycl::buffer<byte_t> buffer_t;
-
-/// Pitched 2D/3D memory data.
-class pitched_data {
-public:
-  pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
-  pitched_data(void *data, size_t pitch, size_t x, size_t y)
-      : _data(data), _pitch(pitch), _x(x), _y(y) {}
-
-  void *get_data_ptr() { return _data; }
-  void set_data_ptr(void *data) { _data = data; }
-
-  size_t get_pitch() { return _pitch; }
-  void set_pitch(size_t pitch) { _pitch = pitch; }
-
-  size_t get_x() { return _x; }
-  void set_x(size_t x) { _x = x; };
-
-  size_t get_y() { return _y; }
-  void set_y(size_t y) { _y = y; }
-
-private:
-  void *_data;
-  size_t _pitch, _x, _y;
-};
-
-namespace experimental {
-#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
-class image_mem_wrapper;
-namespace detail {
-static sycl::event memcpy(const image_mem_wrapper *src,
-                          const sycl::id<3> &src_id, pitched_data &dest,
-                          const sycl::id<3> &dest_id,
-                          const sycl::range<3> &copy_extend, sycl::queue q);
-static sycl::event memcpy(const pitched_data src, const sycl::id<3> &src_id,
-                          image_mem_wrapper *dest, const sycl::id<3> &dest_id,
-                          const sycl::range<3> &copy_extend, sycl::queue q);
-} // namespace detail
-#endif
-class image_matrix;
-namespace detail {
-static pitched_data to_pitched_data(image_matrix *image);
-}
-
-/// Memory copy parameters for 2D/3D memory data.
-struct memcpy_parameter {
-  struct data_wrapper {
-    pitched_data pitched{};
-    sycl::id<3> pos{};
-#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
-    experimental::image_mem_wrapper *image_bindless{nullptr};
-#endif
-    image_matrix *image{nullptr};
-  };
-  data_wrapper from{};
-  data_wrapper to{};
-  sycl::range<3> size{};
-};
-} // namespace experimental
-
-namespace detail {
-class mem_mgr {
-  mem_mgr() {
-    // Reserved address space, no real memory allocation happens here.
-#if defined(__linux__)
-    mapped_address_space =
-        (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-#elif defined(_WIN64)
-    mapped_address_space = (byte_t *)VirtualAlloc(
-        NULL,               // NULL specified as the base address parameter
-        mapped_region_size, // Size of allocation
-        MEM_RESERVE,        // Allocate reserved pages
-        PAGE_NOACCESS);     // Protection = no access
-#else
-#error "Only support Windows and Linux."
-#endif
-    next_free = mapped_address_space;
-  };
-
-public:
-  using buffer_id_t = int;
-
-  struct allocation {
-    buffer_t buffer;
-    byte_t *alloc_ptr;
-    size_t size;
-  };
-
-  ~mem_mgr() {
-#if defined(__linux__)
-    munmap(mapped_address_space, mapped_region_size);
-#elif defined(_WIN64)
-    VirtualFree(mapped_address_space, 0, MEM_RELEASE);
-#else
-#error "Only support Windows and Linux."
-#endif
-  };
-
-  mem_mgr(const mem_mgr &) = delete;
-  mem_mgr &operator=(const mem_mgr &) = delete;
-  mem_mgr(mem_mgr &&) = delete;
-  mem_mgr &operator=(mem_mgr &&) = delete;
-
-  /// Allocate
-  void *mem_alloc(size_t size) {
-    if (!size)
-      return nullptr;
-    std::lock_guard<std::mutex> lock(m_mutex);
-    if (next_free + size > mapped_address_space + mapped_region_size) {
-      throw std::runtime_error(
-          "[SYCLcompat] malloc: out of memory for virtual memory pool");
-    }
-    // Allocation
-    sycl::range<1> buffer_range(size);
-    buffer_t buf(buffer_range);
-    allocation alloc{buf, next_free, size};
-    // Map allocation to device pointer
-    void *result = next_free;
-    m_map.emplace(next_free + size, alloc);
-    // Update pointer to the next free space.
-    next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
-
-    return result;
-  }
-
-  /// Deallocate
-  void mem_free(const void *ptr) {
-    if (!ptr)
-      return;
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto it = get_map_iterator(ptr);
-    m_map.erase(it);
-  }
-
-  /// map: device pointer -> allocation(buffer, alloc_ptr, size)
-  allocation translate_ptr(const void *ptr) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto it = get_map_iterator(ptr);
-    return it->second;
-  }
-
-  /// Check if the pointer represents device pointer or not.
-  bool is_device_ptr(const void *ptr) const {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    return (mapped_address_space <= ptr) &&
-           (ptr < mapped_address_space + mapped_region_size);
-  }
-
-  /// Returns the instance of memory manager singleton.
-  static mem_mgr &instance() {
-    static mem_mgr m;
-    return m;
-  }
-
-private:
-  std::map<byte_t *, allocation> m_map;
-  mutable std::mutex m_mutex;
-  byte_t *mapped_address_space;
-  byte_t *next_free;
-  const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
-  const size_t alignment = 256;
-  /// This padding may be defined to some positive value to debug
-  /// out of bound accesses.
-  const size_t extra_padding = 0;
-
-  std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr) {
-    auto it = m_map.upper_bound((byte_t *)ptr);
-    if (it == m_map.end()) {
-      // Not a virtual pointer.
-      throw std::runtime_error("[SYCLcompat] can not get buffer from non-virtual pointer");
-    }
-    const allocation &alloc = it->second;
-    if (ptr < alloc.alloc_ptr) {
-      // Out of bound.
-      // This may happen if there's a gap between allocations due to alignment
-      // or extra padding and pointer points to this gap.
-      throw std::runtime_error("[SYCLcompat] invalid virtual pointer");
-    }
-    return it;
-  }
-};
-
-template <class T, memory_region Memory, size_t Dimension> class accessor;
-template <memory_region Memory, class T = byte_t> class memory_traits {
-public:
-  static constexpr sycl::access::address_space asp =
-      (Memory == memory_region::local)
-          ? sycl::access::address_space::local_space
-          : sycl::access::address_space::global_space;
-  static constexpr sycl::target target = (Memory == memory_region::local)
-                                             ? sycl::target::local
-                                             : sycl::target::device;
-  static constexpr sycl::access_mode mode = (Memory == memory_region::constant)
-                                                ? sycl::access_mode::read
-                                                : sycl::access_mode::read_write;
-  static constexpr size_t type_size = sizeof(T);
-  using element_t =
-      typename std::conditional_t<Memory == memory_region::constant, const T,
-                                  T>;
-  using value_t = typename std::remove_cv_t<T>;
-  template <size_t Dimension = 1>
-  using accessor_t =
-      typename std::conditional_t<target == sycl::target::local,
-                                  sycl::local_accessor<T, Dimension>,
-                                  sycl::accessor<T, Dimension, mode>>;
-  using pointer_t =
-      typename std::conditional_t<Memory == memory_region::constant, const T *,
-                                  T *>;
-};
-
-static inline void *malloc(size_t size, sycl::queue q) {
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-  return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
-#else
-  return sycl::malloc_device(size, q.get_device(), q.get_context());
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-}
-
-/// Calculate pitch (padded length of major dimension \p x) by rounding up to
-/// multiple of 32.
-/// \param x The dimension to be padded (in bytes)
-/// \returns size_t representing pitched length of dimension x (in bytes).
-static inline constexpr size_t get_pitch(size_t x) {
-  return ((x) + 31) & ~(0x1F);
-}
-
-/// \brief Malloc pitched 3D data
-/// \param [out] pitch returns the calculated pitch (in bytes)
-/// \param [in] x width of the allocation (in bytes)
-/// \param [in] y height of the allocation
-/// \param [in] z depth of the allocation
-/// \param [in] q The queue in which the operation is done.
-/// \returns A pointer to the allocated memory
-static inline void *malloc(size_t &pitch, size_t x, size_t y, size_t z,
-                           sycl::queue q) {
-  pitch = get_pitch(x);
-  return malloc(pitch * y * z, q);
-}
-
-/// \brief Set \p pattern to the first \p count elements of type \p T
-/// starting from \p dev_ptr.
-///
-/// \tparam T Datatype of the pattern to be set.
-/// \param q The queue in which the operation is done.
-/// \param dev_ptr Pointer to the device memory address.
-/// \param pattern Pattern of type T to be set.
-/// \param count Number of elements to be set to the patten.
-/// \returns An event representing the fill operation.
-template <class T>
-static inline sycl::event fill(sycl::queue q, void *dev_ptr, const T &pattern,
-                               size_t count) {
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-  auto &mm = mem_mgr::instance();
-  assert(mm.is_device_ptr(dev_ptr));
-  auto alloc = mm.translate_ptr(dev_ptr);
-  size_t offset = (T *)dev_ptr - (T *)alloc.alloc_ptr;
-
-  return q.submit([&](sycl::handler &cgh) {
-    auto r = sycl::range<1>(count);
-    auto o = sycl::id<1>(offset);
-    auto new_buffer =
-        alloc.buffer.reinterpret<T>(sycl::range<1>(alloc.size / sizeof(T)));
-    sycl::accessor<T, 1, sycl::access_mode::write, sycl::access::target::device>
-        acc(new_buffer, cgh, r, o);
-    cgh.fill(acc, pattern);
-  });
-#else
-  return q.fill(dev_ptr, pattern, count);
-#endif
-}
-
-/// Set \p value to the first \p size bytes starting from \p dev_ptr in \p q.
-///
-/// \param q The queue in which the operation is done.
-/// \param dev_ptr Pointer to the device memory address.
-/// \param value Value to be set.
-/// \param size Number of bytes to be set to the value.
-/// \returns An event representing the memset operation.
-static inline sycl::event memset(sycl::queue q, void *dev_ptr, int value,
-                                 size_t size) {
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-  auto &mm = mem_mgr::instance();
-  assert(mm.is_device_ptr(dev_ptr));
-  auto alloc = mm.translate_ptr(dev_ptr);
-  size_t offset = (byte_t *)dev_ptr - (byte_t *)alloc.alloc_ptr;
-
-  return q.submit([&](sycl::handler &cgh) {
-    auto r = sycl::range<1>(size);
-    auto o = sycl::id<1>(offset);
-    auto new_buffer = alloc.buffer.reinterpret<byte_t>(
-        sycl::range<1>(alloc.size / sizeof(byte_t)));
-    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                   sycl::access::target::device>
-        acc(new_buffer, cgh, r, o);
-    cgh.fill(acc, static_cast<unsigned char>(value));
-  });
-#else
-  return q.memset(dev_ptr, value, size);
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-}
-
-/// \brief Sets \p value to the 3D memory region pointed by \p data in \p q.
-/// \tparam T The type of the element to be set.
-/// \param [in] q The queue in which the operation is done.
-/// \param [in] data Pointer to the pitched device memory region.
-/// \param [in] value The value to be set.
-/// \param [in] size 3D memory region by number of elements.
-/// \return An event list representing the memset operations.
-template <typename T>
-static inline std::vector<sycl::event>
-memset(sycl::queue q, pitched_data data, const T &value, sycl::range<3> size) {
-  std::vector<sycl::event> event_list;
-  size_t slice = data.get_pitch() * data.get_y();
-  unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
-  for (size_t z = 0; z < size.get(2); ++z) {
-    unsigned char *data_ptr = data_surface;
-    for (size_t y = 0; y < size.get(1); ++y) {
-      event_list.push_back(detail::fill<T>(q, data_ptr, value, size.get(0)));
-      data_ptr += data.get_pitch();
-    }
-    data_surface += slice;
-  }
-  return event_list;
-}
-
-/// \brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p
-/// q.
-/// \tparam T The type of the element to be set.
-/// \param [in] q The queue in which the operation is done.
-/// \param [in] ptr Pointer to the virtual device memory.
-/// \param [in] pitch The pitch size by number of elements, including padding.
-/// \param [in] value The value to be set.
-/// \param [in] x The width of memory region by number of elements.
-/// \param [in] y The height of memory region by number of elements.
-/// \return An event list representing the memset operations.
-template <typename T>
-static inline std::vector<sycl::event> memset(sycl::queue q, void *ptr,
-                                              size_t pitch, const T &value,
-                                              size_t x, size_t y) {
-  return memset(q, pitched_data(ptr, pitch, x, 1), value,
-                sycl::range<3>(x, y, 1));
-}
-
-enum class pointer_access_attribute {
-  host_only = 0,
-  device_only,
-  host_device,
-  end
-};
-
-static pointer_access_attribute get_pointer_attribute(sycl::queue q,
-                                                      const void *ptr) {
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-  return mem_mgr::instance().is_device_ptr(ptr)
-             ? pointer_access_attribute::device_only
-             : pointer_access_attribute::host_only;
-#else
-  switch (sycl::get_pointer_type(ptr, q.get_context())) {
-  case sycl::usm::alloc::unknown:
-    return pointer_access_attribute::host_only;
-  case sycl::usm::alloc::device:
-    return pointer_access_attribute::device_only;
-  case sycl::usm::alloc::shared:
-  case sycl::usm::alloc::host:
-    return pointer_access_attribute::host_device;
-  }
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-}
-
-static memcpy_direction
-deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr) {
-  // table[to_attribute][from_attribute]
-  static const memcpy_direction
-      direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
-                     [static_cast<unsigned>(pointer_access_attribute::end)] = {
-                         {host_to_host, device_to_host, host_to_host},
-                         {host_to_device, device_to_device, device_to_device},
-                         {host_to_host, device_to_device, device_to_device}};
-  return direction_table[static_cast<unsigned>(get_pointer_attribute(
-      q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
-}
-
-static sycl::event memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
-                          size_t size,
-                          const std::vector<sycl::event> &dep_events = {}) {
-  if (!size)
-    return sycl::event{};
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-  auto &mm = mem_mgr::instance();
-  auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr);
-
-  switch (real_direction) {
-  case host_to_host:
-    return q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); });
-    });
-  case host_to_device: {
-    auto alloc = mm.translate_ptr(to_ptr);
-    size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-    return q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      auto r = sycl::range<1>(size);
-      auto o = sycl::id<1>(offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                     sycl::access::target::device>
-          acc(alloc.buffer, cgh, r, o);
-      cgh.copy(from_ptr, acc);
-    });
-  }
-  case device_to_host: {
-    auto alloc = mm.translate_ptr(from_ptr);
-    size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-    return q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      auto r = sycl::range<1>(size);
-      auto o = sycl::id<1>(offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                     sycl::access::target::device>
-          acc(alloc.buffer, cgh, r, o);
-      cgh.copy(acc, to_ptr);
-    });
-  }
-  case device_to_device: {
-    auto to_alloc = mm.translate_ptr(to_ptr);
-    auto from_alloc = mm.translate_ptr(from_ptr);
-    size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-    size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-    return q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      auto r = sycl::range<1>(size);
-      auto to_o = sycl::id<1>(to_offset);
-      auto from_o = sycl::id<1>(from_offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                     sycl::access::target::device>
-          to_acc(to_alloc.buffer, cgh, r, to_o);
-      sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                     sycl::access::target::device>
-          from_acc(from_alloc.buffer, cgh, r, from_o);
-      cgh.copy(from_acc, to_acc);
-    });
-  }
-  default:
-    throw std::runtime_error("[SYCLcompat] memcpy: invalid direction value");
-  }
-#else
-  return q.memcpy(to_ptr, from_ptr, size, dep_events);
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-}
-
-// Get actual copy range and make sure it will not exceed range.
-static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                    size_t pitch) {
-  return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
-}
-
-static inline size_t get_offset(sycl::id<3> id, size_t slice, size_t pitch) {
-  return slice * id.get(2) + pitch * id.get(1) + id.get(0);
-}
-
-// RAII for host pointer
-class host_buffer {
-  void *_buf;
-  size_t _size;
-  sycl::queue _q;
-  const std::vector<sycl::event> &_deps; // free operation depends
-
-public:
-  host_buffer(size_t size, sycl::queue q, const std::vector<sycl::event> &deps)
-      : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-  void *get_ptr() const { return _buf; }
-  size_t get_size() const { return _size; }
-  ~host_buffer() {
-    if (_buf) {
-      _q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(_deps);
-        cgh.host_task([buf = _buf] { std::free(buf); });
-      });
-    }
-  }
-};
-
-/// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
-/// and \p from_range to another specified by \p to_ptr and \p to_range.
-template <typename T = void>
-static inline std::vector<sycl::event>
-memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
-       sycl::range<3> to_range, sycl::range<3> from_range, sycl::id<3> to_id,
-       sycl::id<3> from_id, sycl::range<3> size,
-       const std::vector<sycl::event> &dep_events = {}) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::detail::memcpy overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  std::vector<sycl::event> event_list;
-
-  size_t to_slice = to_range.get(1) * to_range.get(0);
-  size_t from_slice = from_range.get(1) * from_range.get(0);
-  unsigned char *to_surface =
-      (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-  const unsigned char *from_surface =
-      (const unsigned char *)from_ptr +
-      get_offset(from_id, from_slice, from_range.get(0));
-
-  if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) {
-    return {memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                   dep_events)};
-  }
-  using namespace experimental; // for memcpy_direction
-  memcpy_direction direction = deduce_memcpy_direction(q, to_ptr, from_ptr);
-  size_t size_slice = size.get(1) * size.get(0);
-  switch (direction) {
-  case host_to_host:
-    for (size_t z = 0; z < size.get(2); ++z) {
-      unsigned char *to_ptr = to_surface;
-      const unsigned char *from_ptr = from_surface;
-      if (to_range.get(0) == from_range.get(0) &&
-          to_range.get(0) == size.get(0)) {
-        event_list.push_back(
-            memcpy(q, to_ptr, from_ptr, size_slice, dep_events));
-      } else {
-        for (size_t y = 0; y < size.get(1); ++y) {
-          event_list.push_back(
-              memcpy(q, to_ptr, from_ptr, size.get(0), dep_events));
-          to_ptr += to_range.get(0);
-          from_ptr += from_range.get(0);
-        }
-      }
-      to_surface += to_slice;
-      from_surface += from_slice;
-    }
-    break;
-  case host_to_device: {
-    host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                    event_list);
-    std::vector<sycl::event> host_events;
-    if (to_slice == size_slice) {
-      // Copy host data to a temp host buffer with the shape of target.
-      host_events =
-          memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                 sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, dep_events);
-    } else {
-      // Copy host data to a temp host buffer with the shape of target.
-      host_events =
-          memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                 sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                 // If has padding data, not sure whether it is useless. So fill
-                 // temp buffer with it.
-                 std::vector<sycl::event>{memcpy(q, buf.get_ptr(), to_surface,
-                                                 buf.get_size(), dep_events)});
-    }
-    // Copy from temp host buffer to device with only one submit.
-    event_list.push_back(
-        memcpy(q, to_surface, buf.get_ptr(), buf.get_size(), host_events));
-    break;
-  }
-  case device_to_host: {
-    host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                    event_list);
-    // Copy from host temp buffer to host target with reshaping.
-    event_list =
-        memcpy(q, to_surface, buf.get_ptr(), to_range, from_range,
-               sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-               // Copy from device to temp host buffer with only one submit.
-               std::vector<sycl::event>{memcpy(q, buf.get_ptr(), from_surface,
-                                               buf.get_size(), dep_events)});
-    break;
-  }
-  case device_to_device:
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-  {
-    auto &mm = mem_mgr::instance();
-    auto to_alloc = mm.translate_ptr(to_surface);
-    auto from_alloc = mm.translate_ptr(from_surface);
-    size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-    size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-    event_list.push_back(q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      auto to_o = sycl::id<1>(to_offset);
-      auto from_o = sycl::id<1>(from_offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                     sycl::access::target::device>
-          to_acc(to_alloc.buffer, cgh,
-                 get_copy_range(size, to_slice, to_range.get(0)), to_o);
-      sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                     sycl::access::target::device>
-          from_acc(from_alloc.buffer, cgh,
-                   get_copy_range(size, from_slice, from_range.get(0)), from_o);
-      cgh.parallel_for<class compat_memcpy_3d_detail_usmnone>(
-          size, [=](sycl::id<3> id) {
-            to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                from_acc[get_offset(id, from_slice, from_range.get(0))];
-          });
-    }));
-  }
-#else
-    event_list.push_back(q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      cgh.parallel_for<class memcpy_3d_detail>(size, [=](sycl::id<3> id) {
-        to_surface[get_offset(id, to_slice, to_range.get(0))] =
-            from_surface[get_offset(id, from_slice, from_range.get(0))];
-      });
-    }));
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-    break;
-  default:
-    throw std::runtime_error("[SYCLcompat] memcpy: invalid direction value");
-  }
-  return event_list;
-}
-
-/// memcpy 2D/3D matrix specified by pitched_data.
-template <typename T = void>
-static inline std::vector<sycl::event>
-memcpy(sycl::queue q, pitched_data to, sycl::id<3> to_id, pitched_data from,
-       sycl::id<3> from_id, sycl::range<3> size) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::detail::memcpy overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  return memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id,
-                from_id, size);
-}
-
-/// memcpy 2D matrix with pitch.
-template <typename T = void>
-static inline std::vector<sycl::event>
-memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, size_t to_pitch,
-       size_t from_pitch, size_t x, size_t y) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::detail::memcpy overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  return memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                sycl::range<3>(from_pitch, y, 1), sycl::id<3>(0, 0, 0),
-                sycl::id<3>(0, 0, 0), sycl::range<3>(x, y, 1));
-}
-
-// Takes a std::vector<sycl::event> & returns a single event
-// which simply depends on all of them
-static sycl::event combine_events(std::vector<sycl::event> &events,
-                                  sycl::queue q) {
-  return q.submit([&events](sycl::handler &cgh) {
-    cgh.depends_on(events);
-    cgh.host_task([]() {});
-  });
-}
-} // namespace detail
-
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-/// Check if the pointer \p ptr represents device pointer or not.
-///
-/// \param ptr The pointer to be checked.
-/// \returns true if \p ptr is a device pointer.
-template <class T> static inline bool is_device_ptr(T ptr) {
-  if constexpr (std::is_pointer<T>::value) {
-    return detail::mem_mgr::instance().is_device_ptr(ptr);
-  }
-  return false;
-}
-#endif
-
-/// Get the buffer and the offset of a piece of memory pointed to by \p ptr.
-///
-/// \param ptr Pointer to a piece of memory.
-/// If NULL is passed as an argument, an exception will be thrown.
-/// \returns a pair containing both the buffer and the offset.
-static std::pair<buffer_t, size_t> get_buffer_and_offset(const void *ptr) {
-  if (ptr) {
-    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
-    size_t offset = (byte_t *)ptr - alloc.alloc_ptr;
-    return std::make_pair(alloc.buffer, offset);
-  } else {
-    throw std::runtime_error(
-        "[SYCLcompat] NULL pointer argument in get_buffer_and_offset function is invalid");
-  }
-}
-
-/// Get the data pointed from \p ptr as a 1D buffer reinterpreted as type T.
-template <typename T> static sycl::buffer<T> get_buffer(const void *ptr) {
-  if (!ptr)
-    return sycl::buffer<T>(sycl::range<1>(0));
-  auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
-  return alloc.buffer.reinterpret<T>(sycl::range<1>(alloc.size / sizeof(T)));
-}
-
-/// Get the buffer of a piece of memory pointed to by \p ptr.
-///
-/// \param ptr Pointer to a piece of memory.
-/// \returns the buffer.
-static buffer_t get_buffer(const void *ptr) {
-  return detail::mem_mgr::instance().translate_ptr(ptr).buffer;
-}
-
-/// Get the host pointer from a buffer that is mapped to virtual pointer ptr.
-/// \param ptr Virtual Pointer mapped to device buffer
-/// \returns A host pointer
-template <typename T> static inline T *get_host_ptr(const void *ptr) {
-  auto BufferOffset = get_buffer_and_offset(ptr);
-  auto host_ptr = BufferOffset.first.get_host_access()
-                      .get_multi_ptr<sycl::access::decorated::no>();
-  return (T *)(host_ptr + BufferOffset.second);
-}
-
-/// A wrapper class contains an accessor and an offset.
-template <typename dataT,
-          sycl::access_mode accessMode = sycl::access_mode::read_write>
-class access_wrapper {
-  sycl::accessor<byte_t, 1, accessMode> accessor;
-  size_t offset;
-
-public:
-  /// Construct the accessor wrapper for memory pointed by \p ptr.
-  ///
-  /// \param ptr Pointer to memory.
-  /// \param cgh The command group handler.
-  access_wrapper(const void *ptr, sycl::handler &cgh)
-      : accessor(get_buffer(ptr).get_access<accessMode>(cgh)), offset(0) {
-    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
-    offset = (byte_t *)ptr - alloc.alloc_ptr;
-  }
-
-  /// Get the device pointer.
-  ///
-  /// \returns a device pointer with offset.
-  dataT get_raw_pointer() const { return (dataT)(&accessor[0] + offset); }
-};
-
-/// Get the accessor for memory pointed by \p ptr.
-///
-/// \param ptr Pointer to memory.
-/// If NULL is passed as an argument, an exception will be thrown.
-/// \param cgh The command group handler.
-/// \returns an accessor.
-template <sycl::access_mode accessMode = sycl::access_mode::read_write>
-static sycl::accessor<byte_t, 1, accessMode> get_access(const void *ptr,
-                                                        sycl::handler &cgh) {
-  if (ptr) {
-    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
-    return alloc.buffer.get_access<accessMode>(cgh);
-  } else {
-    throw std::runtime_error(
-        "[SYCLcompat] NULL pointer argument in get_access function is invalid");
-  }
-}
-
-namespace experimental {
-namespace detail {
-template <typename T = void>
-static inline std::vector<sycl::event>
-memcpy(sycl::queue q, const experimental::memcpy_parameter &param) {
-  static_assert(std::is_same_v<T, void>,
-                "This syclcompat::experimental::detail::memcpy overload only "
-                "accepts a dummy template parameter, T = void, which prevents "
-                "SYCL kernel generation by default.");
-  auto to = param.to.pitched;
-  auto from = param.from.pitched;
-#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
-  if (param.to.image_bindless != nullptr &&
-      param.from.image_bindless != nullptr) {
-    throw std::runtime_error(
-        "[SYCLcompat] memcpy: Unsupported bindless_image API.");
-    // TODO: Need change logic when sycl support image_mem to image_mem copy.
-    std::vector<sycl::event> event_list;
-    syclcompat::detail::host_buffer buf(param.size.size(), q, event_list);
-    to.set_data_ptr(buf.get_ptr());
-    experimental::detail::memcpy(param.from.image_bindless, param.from.pos, to,
-                                 sycl::id<3>(0, 0, 0), param.size, q);
-    from.set_data_ptr(buf.get_ptr());
-    event_list.push_back(experimental::detail::memcpy(
-        from, sycl::id<3>(0, 0, 0), param.to.image_bindless, param.to.pos,
-        param.size, q));
-    return event_list;
-  } else if (param.to.image_bindless != nullptr) {
-    throw std::runtime_error(
-        "[SYCLcompat] memcpy: Unsupported bindless_image API.");
-    return {experimental::detail::memcpy(from, param.from.pos,
-                                         param.to.image_bindless, param.to.pos,
-                                         param.size, q)};
-  } else if (param.from.image_bindless != nullptr) {
-    throw std::runtime_error(
-        "[SYCLcompat] memcpy: Unsupported bindless_image API.");
-    return {experimental::detail::memcpy(param.from.image_bindless,
-                                         param.from.pos, to, param.to.pos,
-                                         param.size, q)};
-  }
-#endif
-  if (param.to.image != nullptr) {
-    throw std::runtime_error("[SYCLcompat] memcpy: Unsupported image API.");
-    to = experimental::detail::to_pitched_data(param.to.image);
-  }
-  if (param.from.image != nullptr) {
-    throw std::runtime_error("[SYCLcompat] memcpy: Unsupported image API.");
-    from = experimental::detail::to_pitched_data(param.from.image);
-  }
-  return syclcompat::detail::memcpy(q, to, param.to.pos, from, param.from.pos,
-                                    param.size);
-}
-} // namespace detail
-} // namespace experimental
-
-/// Allocate memory block on the device.
-/// \param num_bytes Number of bytes to allocate.
-/// \param q Queue to execute the allocate task.
-/// \returns A pointer to the newly allocated memory.
-static inline void *malloc(size_t num_bytes,
-                           sycl::queue q = get_default_queue()) {
-  return detail::malloc(num_bytes, q);
-}
-
-/// Allocate memory block on the device.
-/// \param T Datatype to allocate
-/// \param count Number of elements to allocate.
-/// \param q Queue to execute the allocate task.
-/// \returns A pointer to the newly allocated memory.
-template <typename T>
-static inline T *malloc(size_t count, sycl::queue q = get_default_queue()) {
-  return static_cast<T *>(detail::malloc(count * sizeof(T), q));
-}
-
-/// Allocate memory block on the host.
-/// \param num_bytes Number of bytes to allocate.
-/// \param q Queue to execute the allocate task.
-/// \returns A pointer to the newly allocated memory.
-static inline void *malloc_host(size_t num_bytes,
-                                sycl::queue q = get_default_queue()) {
-  return sycl::malloc_host(num_bytes, q);
-}
-
-/// Allocate memory block on the host.
-/// \param T Datatype to allocate
-/// \param num_bytes Number of bytes to allocate.
-/// \param q Queue to execute the allocate task.
-/// \returns A pointer to the newly allocated memory.
-template <typename T>
-static inline T *malloc_host(size_t count,
-                             sycl::queue q = get_default_queue()) {
-  return static_cast<T *>(sycl::malloc_host(count * sizeof(T), q));
-}
-
-/// Allocate memory block of usm_shared memory.
-/// \param num_bytes Number of bytes to allocate.
-/// \param q Queue to execute the allocate task.
-/// \returns A pointer to the newly allocated memory.
-static inline void *malloc_shared(size_t num_bytes,
-                                  sycl::queue q = get_default_queue()) {
-  return sycl::malloc_shared(num_bytes, q);
-}
-
-/// Allocate memory block of usm_shared memory.
-/// \param num_bytes Number of bytes to allocate.
-/// \param q Queue to execute the allocate task.
-/// \returns A pointer to the newly allocated memory.
-template <typename T>
-static inline T *malloc_shared(size_t count,
-                               sycl::queue q = get_default_queue()) {
-  return static_cast<T *>(sycl::malloc_shared(count * sizeof(T), q));
-}
-
-/// Allocate memory block for 3D array on the device.
-/// \param size Size of the memory block, in bytes.
-/// \param q Queue to execute the allocate task.
-/// \returns A pitched_data object which stores the memory info.
-static inline pitched_data malloc(sycl::range<3> size,
-                                  sycl::queue q = get_default_queue()) {
-  pitched_data pitch(nullptr, 0, size.get(0), size.get(1));
-  size_t pitch_size;
-  pitch.set_data_ptr(
-      detail::malloc(pitch_size, size.get(0), size.get(1), size.get(2), q));
-  pitch.set_pitch(pitch_size);
-  return pitch;
-}
-
-/// Allocate memory block for 2D array on the device.
-/// \param [out] pitch Aligned size of x in bytes.
-/// \param x Range in dim x.
-/// \param y Range in dim y.
-/// \param q Queue to execute the allocate task.
-/// \returns A pointer to the newly allocated memory.
-static inline void *malloc(size_t &pitch, size_t x, size_t y,
-                           sycl::queue q = get_default_queue()) {
-  return detail::malloc(pitch, x, y, 1, q);
-}
-
-namespace detail {
-
-inline void free(void *ptr, const sycl::queue &q) {
-  if (ptr) {
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-    detail::mem_mgr::instance().mem_free(ptr);
-#else
-    sycl::free(ptr, q.get_context());
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-  }
-}
-} // namespace detail
-
-/// Wait on the queue \p q and free the memory \p ptr.
-/// \param ptr Point to free.
-/// \param q Queue to execute the free task.
-/// \returns no return value.
-static inline void wait_and_free(void *ptr,
-                                 sycl::queue q = get_default_queue()) {
-  get_current_device().queues_wait_and_throw();
-  q.wait();
-  if (ptr) {
-    detail::free(ptr, q);
-  }
-}
-
-// Anonymous namespace to disable ADL for functions which might clash (memcpy,
-// memset, free)
-namespace {
-/// Free the memory \p ptr on the default queue without synchronizing
-/// \param ptr Point to free.
-/// \returns no return value.
-static inline void free(void *ptr, sycl::queue q = get_default_queue()) {
-  detail::free(ptr, q);
-}
-} // namespace
-
-/// Enqueues the release of all pointers in /p pointers on the /p q.
-/// The command waits on all passed /p events and returns an event that
-/// track the commands execution on the queue.
-///
-/// \param pointers The pointers point to the device memory requested to be
-/// freed.
-/// \param events The events to be waited on.
-/// \param q The sycl::queue the memory relates to.
-// Can't be static due to the friend declaration in the memory header.
-inline sycl::event enqueue_free(const std::vector<void *> &pointers,
-                                const std::vector<sycl::event> &events,
-                                sycl::queue q = get_default_queue()) {
-  auto event = q.submit(
-      [&pointers, &events, &q](sycl::handler &cgh) {
-        cgh.depends_on(events);
-        cgh.host_task([=]() {
-          for (auto p : pointers)
-            detail::free(p, q);
-        });
-      });
-  get_current_device().add_event(event);
-  return event;
-}
-
-namespace {
-/// Synchronously copies \p size bytes from the address specified by \p from_ptr
-/// to the address specified by \p to_ptr. The function will
-/// return after the copy is completed.
-///
-/// \param to_ptr Pointer to destination memory address.
-/// \param from_ptr Pointer to source memory address.
-/// \param size Number of bytes to be copied.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static void memcpy(void *to_ptr, const void *from_ptr, size_t size,
-                   sycl::queue q = get_default_queue()) {
-  detail::memcpy(q, to_ptr, from_ptr, size).wait();
-}
-
-} // namespace
-
-/// Asynchronously copies \p size bytes from the address specified by \p
-/// from_ptr to the address specified by \p to_ptr. The return of the function
-/// does NOT guarantee the copy is completed.
-///
-/// \param to_ptr Pointer to destination memory address.
-/// \param from_ptr Pointer to source memory address.
-/// \param size Number of bytes to be copied.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size,
-                                sycl::queue q = get_default_queue()) {
-  return detail::memcpy(q, to_ptr, from_ptr, size);
-}
-
-/// Asynchronously copies \p count T's from the address specified by \p
-/// from_ptr to the address specified by \p to_ptr. The return of the function
-/// does NOT guarantee the copy is completed.
-///
-/// \tparam T Datatype to be copied.
-/// \param to_ptr Pointer to destination memory address.
-/// \param from_ptr Pointer to source memory address.
-/// \param count Number of T to be copied.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-template <typename T>
-static sycl::event
-memcpy_async(type_identity_t<T> *to_ptr, const type_identity_t<T> *from_ptr,
-             size_t count, sycl::queue q = get_default_queue()) {
-  return detail::memcpy(q, static_cast<void *>(to_ptr),
-                        static_cast<const void *>(from_ptr), count * sizeof(T));
-}
-
-namespace {
-/// Synchronously copies \p count T's from the address specified by \p from_ptr
-/// to the address specified by \p to_ptr. The function will
-/// return after the copy is completed.
-///
-/// \tparam T Datatype to be copied.
-/// \param to_ptr Pointer to destination memory address.
-/// \param from_ptr Pointer to source memory address.
-/// \param count Number of T to be copied.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-template <typename T>
-static void memcpy(type_identity_t<T> *to_ptr,
-                   const type_identity_t<T> *from_ptr, size_t count,
-                   sycl::queue q = get_default_queue()) {
-  detail::memcpy(q, static_cast<void *>(to_ptr),
-                 static_cast<const void *>(from_ptr), count * sizeof(T))
-      .wait();
-}
-
-/// Synchronously copies 2D matrix specified by \p x and \p y from the address
-/// specified by \p from_ptr to the address specified by \p to_ptr, while \p
-/// from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
-/// specified by \p from_ptr and \p to_ptr. The function will return after the
-/// copy is completed.
-///
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param to_ptr Pointer to destination memory address.
-/// \param to_pitch Range of dim x in bytes of destination matrix.
-/// \param from_ptr Pointer to source memory address.
-/// \param from_pitch Range of dim x in bytes of source matrix.
-/// \param x Range of dim x of matrix to be copied.
-/// \param y Range of dim y of matrix to be copied.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-template <typename T = void>
-static inline void memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
-                          size_t from_pitch, size_t x, size_t y,
-                          sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::memcpy overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  sycl::event::wait(
-      detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y));
-}
-
-} // namespace
-
-/// Asynchronously copies 2D matrix specified by \p x and \p y from the address
-/// specified by \p from_ptr to the address specified by \p to_ptr, while \p
-/// \p from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
-/// specified by \p from_ptr and \p to_ptr. The return of the function does NOT
-/// guarantee the copy is completed.
-///
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param to_ptr Pointer to destination memory address.
-/// \param to_pitch Range of dim x in bytes of destination matrix.
-/// \param from_ptr Pointer to source memory address.
-/// \param from_pitch Range of dim x in bytes of source matrix.
-/// \param x Range of dim x of matrix to be copied.
-/// \param y Range of dim y of matrix to be copied.
-/// \param q Queue to execute the copy task.
-/// \returns An event representing the memcpy operation.
-template <typename T = void>
-static inline sycl::event memcpy_async(void *to_ptr, size_t to_pitch,
-                                       const void *from_ptr, size_t from_pitch,
-                                       size_t x, size_t y,
-                                       sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::memcpy overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  auto events = detail::memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y);
-  return detail::combine_events(events, q);
-}
-
-namespace {
-/// Synchronously copies a subset of a 3D matrix specified by \p to to another
-/// 3D matrix specified by \p from. The from and to position info are specified
-/// by \p from_pos and \p to_pos The copied matrix size is specified by \p size.
-// The function will return after the copy is completed.
-///
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param to Destination matrix info.
-/// \param to_pos Position of destination.
-/// \param from Source matrix info.
-/// \param from_pos Position of destination.
-/// \param size Range of the submatrix to be copied.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-template <typename T = void>
-static inline void memcpy(pitched_data to, sycl::id<3> to_pos,
-                          pitched_data from, sycl::id<3> from_pos,
-                          sycl::range<3> size,
-                          sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::memcpy overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  sycl::event::wait(detail::memcpy(q, to, to_pos, from, from_pos, size));
-}
-} // namespace
-
-/// Asynchronously copies a subset of a 3D matrix specified by \p to to another
-/// 3D matrix specified by \p from. The from and to position info are specified
-/// by \p from_pos and \p to_pos The copied matrix size is specified by \p size.
-/// The return of the function does NOT guarantee the copy is completed.
-///
-/// \param to Destination matrix info.
-/// \param to_pos Position of destination.
-/// \param from Source matrix info.
-/// \param from_pos Position of destination.
-/// \param size Range of the submatrix to be copied.
-/// \param q Queue to execute the copy task.
-/// \returns An event representing the memcpy operation.
-template <typename T = void>
-static inline sycl::event memcpy_async(pitched_data to, sycl::id<3> to_pos,
-                                       pitched_data from, sycl::id<3> from_pos,
-                                       sycl::range<3> size,
-                                       sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::memcpy overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  auto events = detail::memcpy(q, to, to_pos, from, from_pos, size);
-  return detail::combine_events(events, q);
-}
-
-namespace {
-/// Synchronously sets \p pattern to the first \p count elements starting from
-/// \p dev_ptr. The function will return after the fill operation is completed.
-///
-/// \tparam T Datatype of the value to be set.
-/// \param dev_ptr Pointer to the device memory address.
-/// \param pattern Pattern of type \p T to be set.
-/// \param count Number of elements to be set to the patten.
-/// \param q The queue in which the operation is done.
-/// \returns no return value.
-template <class T>
-static void inline fill(void *dev_ptr, const T &pattern, size_t count,
-                        sycl::queue q = get_default_queue()) {
-  detail::fill(q, dev_ptr, pattern, count).wait();
-}
-} // namespace
-
-/// Asynchronously sets \p pattern to the first \p count elements starting from
-/// \p dev_ptr.
-/// The return of the function does NOT guarantee the fill operation is
-/// completed.
-///
-/// \tparam T Datatype of the pattern to be set.
-/// \param dev_ptr Pointer to the device memory address.
-/// \param pattern Pattern of type \p T to be set.
-/// \param count Number of elements to be set to the patten.
-/// \param q The queue in which the operation is done.
-/// \returns An event representing the fill operation.
-template <class T>
-static sycl::event inline fill_async(void *dev_ptr, const T &pattern,
-                                     size_t count,
-                                     sycl::queue q = get_default_queue()) {
-  return detail::fill(q, dev_ptr, pattern, count);
-}
-
-namespace experimental {
-
-/// [UNSUPPORTED] Synchronously copies 2D/3D memory data specified by \p param .
-/// The function will return after the copy is completed.
-///
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param param Memory copy parameters.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-template <typename T = void>
-static inline void memcpy(const memcpy_parameter &param,
-                          sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::memcpy overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  sycl::event::wait(syclcompat::experimental::detail::memcpy(q, param));
-}
-
-/// [UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by \p param
-/// . The return of the function does NOT guarantee the copy is completed.
-///
-/// \param param Memory copy parameters.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-template <typename T = void>
-static inline void memcpy_async(const memcpy_parameter &param,
-                                sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::memcpy overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  syclcompat::experimental::detail::memcpy(q, param);
-}
-} // namespace experimental
-
-namespace {
-/// Synchronously sets \p value to the first \p size bytes starting from \p
-/// dev_ptr. The function will return after the memset operation is completed.
-///
-/// \param dev_ptr Pointer to the device memory address.
-/// \param value Value to be set.
-/// \param size Number of bytes to be set to the value.
-/// \param q The queue in which the operation is done.
-/// \returns no return value.
-static void memset(void *dev_ptr, int value, size_t size,
-                   sycl::queue q = get_default_queue()) {
-  detail::memset(q, dev_ptr, value, size).wait();
-}
-} // namespace
-
-/// \brief Sets 2 bytes data \p value to the first \p size elements starting
-/// from \p dev_ptr in \p q synchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] dev_ptr Pointer to the virtual device memory address.
-/// \param [in] value The value to be set.
-/// \param [in] size Number of elements to be set to the value.
-/// \param [in] q The queue in which the operation is done.
-template <typename T = void>
-static inline void memset_d16(void *dev_ptr, unsigned short value, size_t size,
-                              sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_d16 only accepts a dummy template parameter, T = "
-      "void, which prevents SYCL kernel generation by default.");
-  detail::fill<unsigned short>(q, dev_ptr, value, size).wait();
-}
-
-/// \brief Sets 4 bytes data \p value to the first \p size elements starting
-/// from \p dev_ptr in \p q synchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] dev_ptr Pointer to the virtual device memory address.
-/// \param [in] value The value to be set.
-/// \param [in] size Number of elements to be set to the value.
-/// \param [in] q The queue in which the operation is done.
-template <typename T = void>
-static inline void memset_d32(void *dev_ptr, unsigned int value, size_t size,
-                              sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_d32 only accepts a dummy template parameter, T = "
-      "void, which prevents SYCL kernel generation by default.");
-  detail::fill<unsigned int>(q, dev_ptr, value, size).wait();
-}
-
-/// \brief Sets 1 byte data \p value to the first \p size elements starting
-/// from \p dev_ptr in \p q asynchronously.
-/// \param dev_ptr Pointer to the device memory address.
-/// \param value Value to be set.
-/// \param size Number of bytes to be set to the value.
-/// \returns An event representing the memset operation.
-static inline sycl::event memset_async(void *dev_ptr, int value, size_t size,
-                                       sycl::queue q = get_default_queue()) {
-  return detail::memset(q, dev_ptr, value, size);
-}
-
-/// \brief Sets 2 bytes data \p value to the first \p size elements starting
-/// from \p dev_ptr in \p q asynchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] dev_ptr Pointer to the virtual device memory address.
-/// \param [in] value The value to be set.
-/// \param [in] size Number of elements to be set to the value.
-/// \param [in] q The queue in which the operation is done.
-/// \returns An event representing the memset operation.
-template <typename T = void>
-static inline sycl::event
-memset_d16_async(void *dev_ptr, unsigned short value, size_t size,
-                 sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_d16_async only accepts a dummy template parameter, T "
-      "= void, which prevents SYCL kernel generation by default.");
-  return detail::fill<unsigned short>(q, dev_ptr, value, size);
-}
-
-/// \brief Sets 4 bytes data \p value to the first \p size elements starting
-/// from \p dev_ptr in \p q asynchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] dev_ptr Pointer to the virtual device memory address.
-/// \param [in] value The value to be set.
-/// \param [in] size Number of elements to be set to the value.
-/// \param [in] q The queue in which the operation is done.
-/// \returns An event representing the memset operation.
-template <typename T = void>
-static inline sycl::event
-memset_d32_async(void *dev_ptr, unsigned int value, size_t size,
-                 sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_d32_async only accepts a dummy template parameter, T "
-      "= void, which prevents SYCL kernel generation by default.");
-  return detail::fill<unsigned int>(q, dev_ptr, value, size);
-}
-
-namespace {
-/// \brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p
-/// ptr in \p q synchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] ptr Pointer to the virtual device memory.
-/// \param [in] pitch The pitch size by number of elements, including padding.
-/// \param [in] val The value to be set.
-/// \param [in] x The width of memory region by number of elements.
-/// \param [in] y The height of memory region by number of elements.
-/// \param [in] q The queue in which the operation is done.
-template <typename T = void>
-static inline void memset(void *ptr, size_t pitch, int val, size_t x, size_t y,
-                          sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "This syclcompat::memset overload only accepts a dummy template "
-      "parameter, T = void, which prevents SYCL kernel generation by default.");
-  sycl::event::wait(detail::memset<unsigned char>(q, ptr, pitch, val, x, y));
-}
-} // namespace
-
-/// \brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by
-/// ptr in \p q synchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] ptr Pointer to the virtual device memory.
-/// \param [in] pitch The pitch size by number of elements, including padding.
-/// \param [in] val The value to be set.
-/// \param [in] x The width of memory region by number of elements.
-/// \param [in] y The height of memory region by number of elements.
-/// \param [in] q The queue in which the operation is done.
-template <typename T = void>
-static inline void memset_d16(void *ptr, size_t pitch, unsigned short val,
-                              size_t x, size_t y,
-                              sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_d16 only accepts a dummy template parameter, T = "
-      "void, which prevents SYCL kernel generation by default.");
-  sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y));
-}
-
-/// \brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by
-/// ptr in \p q synchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] ptr Pointer to the virtual device memory.
-/// \param [in] pitch The pitch size by number of elements, including padding.
-/// \param [in] val The value to be set.
-/// \param [in] x The width of memory region by number of elements.
-/// \param [in] y The height of memory region by number of elements.
-/// \param [in] q The queue in which the operation is done.
-template <typename T = void>
-static inline void memset_d32(void *ptr, size_t pitch, unsigned int val,
-                              size_t x, size_t y,
-                              sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_d32 only accepts a dummy template parameter, T = "
-      "void, which prevents SYCL kernel generation by default.");
-  sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y));
-}
-
-/// \brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p
-/// ptr in \p q asynchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] ptr Pointer to the virtual device memory.
-/// \param [in] pitch The pitch size by number of elements, including padding.
-/// \param [in] val The value to be set.
-/// \param [in] x The width of memory region by number of elements.
-/// \param [in] y The height of memory region by number of elements.
-/// \param [in] q The queue in which the operation is done.
-/// \returns An event representing the memset operation.
-template <typename T = void>
-static inline sycl::event memset_async(void *ptr, size_t pitch, int val,
-                                       size_t x, size_t y,
-                                       sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_async only accepts a dummy template parameter, T = "
-      "void, which prevents SYCL kernel generation by default.");
-  auto events = detail::memset<unsigned char>(q, ptr, pitch, val, x, y);
-  return detail::combine_events(events, q);
-}
-
-/// \brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by
-/// \p ptr in \p q asynchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] ptr Pointer to the virtual device memory.
-/// \param [in] pitch The pitch size by number of elements, including padding.
-/// \param [in] val The value to be set.
-/// \param [in] x The width of memory region by number of elements.
-/// \param [in] y The height of memory region by number of elements.
-/// \param [in] q The queue in which the operation is done.
-/// \returns An event representing the memset operation.
-template <typename T = void>
-static inline sycl::event
-memset_d16_async(void *ptr, size_t pitch, unsigned short val, size_t x,
-                 size_t y, sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_d16_async only accepts a dummy template parameter, T "
-      "= void, which prevents SYCL kernel generation by default.");
-  auto events = detail::memset(q, ptr, pitch, val, x, y);
-  return detail::combine_events(events, q);
-}
-
-/// \brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by
-/// \p ptr in \p q asynchronously.
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param [in] ptr Pointer to the virtual device memory.
-/// \param [in] pitch The pitch size by number of elements, including padding.
-/// \param [in] val The value to be set.
-/// \param [in] x The width of memory region by number of elements.
-/// \param [in] y The height of memory region by number of elements.
-/// \param [in] q The queue in which the operation is done.
-/// \returns An event representing the memset operation.
-template <typename T = void>
-static inline sycl::event
-memset_d32_async(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y,
-                 sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_d32_async only accepts a dummy template parameter, T "
-      "= void, which prevents SYCL kernel generation by default.");
-  auto events = detail::memset(q, ptr, pitch, val, x, y);
-  return detail::combine_events(events, q);
-}
-
-namespace {
-/// Sets \p value to the 3D memory region specified by \p pitch in \p q. \p size
-/// specify the setted 3D memory size. The function will return after the
-/// memset operation is completed.
-///
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param pitch Specify the 3D memory region.
-/// \param value Value to be set.
-/// \param size The setted 3D memory size.
-/// \param q The queue in which the operation is done.
-/// \returns no return value.
-template <typename T = void>
-static inline void memset(pitched_data pitch, int val, sycl::range<3> size,
-                          sycl::queue q = get_default_queue()) {
-  static_assert(std::is_same_v<T, void>,
-                "syclcompat::memset only accepts a dummy template parameter, T "
-                "= void, which prevents SYCL kernel generation by default.");
-  sycl::event::wait(detail::memset<unsigned char>(q, pitch, val, size));
-}
-} // namespace
-
-/// Sets \p value to the 3D memory region specified by \p pitch in \p q. \p size
-/// specify the setted 3D memory size. The return of the function does NOT
-/// guarantee the memset operation is completed.
-///
-/// \tparam T Dummy template parameter to delay SYCL kernel instantiation
-/// \param pitch Specify the 3D memory region.
-/// \param value Value to be set.
-/// \param size The setted 3D memory size.
-/// \param q The queue in which the operation is done.
-/// \returns An event representing the memset operation.
-template <typename T = void>
-static inline sycl::event memset_async(pitched_data pitch, int val,
-                                       sycl::range<3> size,
-                                       sycl::queue q = get_default_queue()) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::memset_async only accepts a dummy template parameter, T = "
-      "void, which prevents SYCL kernel generation by default.");
-  auto events = detail::memset<unsigned char>(q, pitch, val, size);
-  return detail::combine_events(events, q);
-}
-
-/// accessor used as device function parameter.
-template <class T, memory_region Memory, size_t Dimension> class accessor;
-template <class T, memory_region Memory> class accessor<T, Memory, 3> {
-public:
-  using memory_t = detail::memory_traits<Memory, T>;
-  using element_t = typename memory_t::element_t;
-  using pointer_t = typename memory_t::pointer_t;
-  using accessor_t = typename memory_t::template accessor_t<3>;
-  accessor(pointer_t data, const sycl::range<3> &in_range)
-      : _data(data), _range(in_range) {}
-  template <memory_region M = Memory>
-  accessor(typename std::enable_if<M != memory_region::local,
-                                   const accessor_t>::type &acc)
-      : accessor(acc, acc.get_range()) {}
-  accessor(const accessor_t &acc, const sycl::range<3> &in_range)
-      : accessor(
-            acc.template get_multi_ptr<sycl::access::decorated::no>().get(),
-            in_range) {}
-  accessor<T, Memory, 2> operator[](size_t index) const {
-    sycl::range<2> sub(_range.get(1), _range.get(2));
-    return accessor<T, Memory, 2>(_data + index * sub.size(), sub);
-  }
-
-  pointer_t get_ptr() const { return _data; }
-
-private:
-  pointer_t _data;
-  sycl::range<3> _range;
-};
-template <class T, memory_region Memory> class accessor<T, Memory, 2> {
-public:
-  using memory_t = detail::memory_traits<Memory, T>;
-  using element_t = typename memory_t::element_t;
-  using pointer_t = typename memory_t::pointer_t;
-  using accessor_t = typename memory_t::template accessor_t<2>;
-  accessor(pointer_t data, const sycl::range<2> &in_range)
-      : _data(data), _range(in_range) {}
-  template <memory_region Mem = Memory>
-  accessor(typename std::enable_if<Mem != memory_region::local,
-                                   const accessor_t>::type &acc)
-      : accessor(acc, acc.get_range()) {}
-  accessor(const accessor_t &acc, const sycl::range<2> &in_range)
-      : accessor(
-            acc.template get_multi_ptr<sycl::access::decorated::no>().get(),
-            in_range) {}
-
-  pointer_t operator[](size_t index) const {
-    return _data + _range.get(1) * index;
-  }
-
-  pointer_t get_ptr() const { return _data; }
-
-private:
-  pointer_t _data;
-  sycl::range<2> _range;
-};
-
-/// Device variable with address space of shared or global.
-// TODO(syclcompat-lib-reviewers): This doesn't yet support multi-device (ptr
-// per device)
-template <class T, memory_region Memory, size_t Dimension> class device_memory {
-public:
-  using accessor_t =
-      typename detail::memory_traits<Memory, T>::template accessor_t<Dimension>;
-  using value_t = typename detail::memory_traits<Memory, T>::value_t;
-  using syclcompat_accessor_t = syclcompat::accessor<T, Memory, Dimension>;
-
-  device_memory(sycl::queue q = get_default_queue())
-      : device_memory(sycl::range<Dimension>(1), q) {}
-
-  /// Constructor of 1-D array with initializer list
-  device_memory(const sycl::range<Dimension> &in_range,
-                std::initializer_list<value_t> &&init_list,
-                sycl::queue q = get_default_queue())
-      : device_memory(in_range, q) {
-    assert(init_list.size() <= in_range.size());
-    _host_ptr = (value_t *)std::malloc(_size);
-    std::memset(_host_ptr, 0, _size);
-    std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
-  }
-
-  /// Constructor of 2-D array with initializer list
-  template <size_t Dim = Dimension>
-  device_memory(
-      const typename std::enable_if<Dim == 2, sycl::range<2>>::type &in_range,
-      std::initializer_list<std::initializer_list<value_t>> &&init_list,
-      sycl::queue q = get_default_queue())
-      : device_memory(in_range, q) {
-    assert(init_list.size() <= in_range[0]);
-    _host_ptr = (value_t *)std::malloc(_size);
-    std::memset(_host_ptr, 0, _size);
-    auto tmp_data = _host_ptr;
-    for (auto sub_list : init_list) {
-      assert(sub_list.size() <= in_range[1]);
-      std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T));
-      tmp_data += in_range[1];
-    }
-  }
-
-  /// Constructor with range
-  device_memory(const sycl::range<Dimension> &range_in,
-                sycl::queue q = get_default_queue())
-      : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
-        _host_ptr(nullptr), _device_ptr(nullptr), _q(q) {
-    static_assert((Memory == memory_region::global) ||
-                      (Memory == memory_region::constant) ||
-                      (Memory == memory_region::usm_shared),
-                  "device memory region should be global, constant or shared");
-    // Make sure that singleton class dev_mgr will destruct later than this.
-    detail::dev_mgr::instance();
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-    detail::mem_mgr::instance();
-#endif
-  }
-
-  /// Constructor with range
-  // enable_if_t SFINAE to avoid ambiguity with
-  // device_memory(Args... Arguments, sycl::queue q)
-  template <class... Args, size_t Dim = Dimension,
-            typename = std::enable_if_t<sizeof...(Args) == Dim>>
-  device_memory(Args... Arguments)
-      : device_memory(sycl::range<Dimension>(Arguments...),
-                      get_default_queue()) {}
-
-  /// Constructor with range and queue
-  template <class... Args>
-  device_memory(Args... Arguments, sycl::queue q)
-      : device_memory(sycl::range<Dimension>(Arguments...), q) {}
-
-  ~device_memory() {
-    if (_device_ptr && !_reference)
-      syclcompat::free(_device_ptr, _q);
-    if (_host_ptr)
-      std::free(_host_ptr);
-  }
-
-  /// Allocate memory with the queue specified in the constuctor, and init
-  /// memory if has initial value
-  void init() { init(_q); }
-  /// Allocate memory with specified queue, and init memory if has initial
-  /// value.
-  void init(sycl::queue q) {
-    if (_device_ptr)
-      return;
-    if (!_size)
-      return;
-    allocate_device(q);
-    if (_host_ptr)
-      detail::memcpy(q, _device_ptr, _host_ptr, _size);
-  }
-
-  /// The variable is assigned to a device pointer.
-  void assign(value_t *src, size_t size) {
-    this->~device_memory();
-    new (this) device_memory(src, size, _q);
-  }
-
-  // Get memory pointer of the memory object, a device USM pointer.
-  value_t *get_ptr() { return get_ptr(_q); }
-
-  // Get memory pointer of the memory object, a device USM pointer.
-  value_t *get_ptr(sycl::queue q) {
-    init(q);
-    return _device_ptr;
-  }
-
-  /// Get the device memory object size in bytes.
-  size_t get_size() { return _size; }
-
-  template <size_t Dim = Dimension>
-  typename std::enable_if<Dim == 1, T>::type &operator[](size_t index) {
-    init();
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-    return syclcompat::get_buffer<typename std::enable_if<Dim == 1, T>::type>(
-               _device_ptr)
-        .template get_access<sycl::access_mode::read_write>()[index];
-#else
-    return _device_ptr[index];
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-  }
-
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-  /// Get sycl::accessor for the device memory object when usm is not used.
-  accessor_t get_access(sycl::handler &cgh) {
-    return get_buffer(_device_ptr)
-        .template reinterpret<T, Dimension>(_range)
-        .template get_access<detail::memory_traits<Memory, T>::mode,
-                             detail::memory_traits<Memory, T>::target>(cgh);
-  }
-#else
-  /// Get compat_accessor with dimension info for the device memory object
-  /// when usm is used and dimension is greater than 1.
-  template <size_t Dim = Dimension>
-  typename std::enable_if<Dim != 1, syclcompat_accessor_t>::type
-  get_access(sycl::handler &cgh) {
-    return syclcompat_accessor_t((T *)_device_ptr, _range);
-  }
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-
-private:
-  device_memory(value_t *memory_ptr, size_t size,
-                sycl::queue q = get_default_queue())
-      : _size(size), _range(size / sizeof(T)), _reference(true),
-        _device_ptr(memory_ptr), _q(q) {}
-
-  void allocate_device(sycl::queue q) {
-#ifndef SYCLCOMPAT_USM_LEVEL_NONE
-    if (Memory == memory_region::usm_shared) {
-      _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
-                                                   q.get_context());
-      return;
-    }
-#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
-    if (Memory == memory_region::constant) {
-      _device_ptr = (value_t *)sycl::malloc_device(
-          _size, q.get_device(), q.get_context(),
-          sycl::ext::oneapi::property::usm::device_read_only());
-      return;
-    }
-#endif
-#endif
-    _device_ptr = (value_t *)detail::malloc(_size, q);
-  }
-
-  size_t _size;
-  sycl::range<Dimension> _range;
-  bool _reference;
-  value_t *_host_ptr;
-  value_t *_device_ptr;
-  sycl::queue _q;
-};
-template <class T, memory_region Memory>
-class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
-public:
-  using base = device_memory<T, Memory, 1>;
-  using value_t = typename base::value_t;
-  using accessor_t =
-      typename detail::memory_traits<Memory, T>::template accessor_t<0>;
-
-  /// Constructor with initial value.
-  device_memory(const value_t &val, sycl::queue q = get_default_queue())
-      : base(sycl::range<1>(1), {val}, q) {}
-
-  /// Default constructor
-  device_memory(sycl::queue q = get_default_queue()) : base(1, q) {}
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-  /// Get sycl::accessor for the device memory object when usm is not used.
-  accessor_t get_access(sycl::handler &cgh) {
-    auto buf = get_buffer(base::get_ptr())
-                   .template reinterpret<T, 1>(sycl::range<1>(1));
-    return accessor_t(buf, cgh);
-  }
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-};
-
-template <class T, size_t Dimension>
-using global_memory = device_memory<T, memory_region::global, Dimension>;
-template <class T, size_t Dimension>
-using constant_memory = device_memory<T, memory_region::constant, Dimension>;
-template <class T, size_t Dimension>
-using shared_memory = device_memory<T, memory_region::usm_shared, Dimension>;
-
-class pointer_attributes {
-public:
-  void init(const void *ptr, sycl::queue q = get_default_queue()) {
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-    throw std::runtime_error(
-        "[SYCLcompat] pointer_attributes: only works for USM pointer.");
-#else
-    memory_type = sycl::get_pointer_type(ptr, q.get_context());
-    device_pointer = (memory_type != sycl::usm::alloc::unknown) ? ptr : nullptr;
-    host_pointer = (memory_type != sycl::usm::alloc::unknown) &&
-                           (memory_type != sycl::usm::alloc::device)
-                       ? ptr
-                       : nullptr;
-    sycl::device device_obj = sycl::get_pointer_device(ptr, q.get_context());
-    device_id = detail::dev_mgr::instance().get_device_id(device_obj);
-#endif // SYCLCOMPAT_USM_LEVEL_NONE
-  }
-
-  sycl::usm::alloc get_memory_type() { return memory_type; }
-
-  const void *get_device_pointer() { return device_pointer; }
-
-  const void *get_host_pointer() { return host_pointer; }
-
-  bool is_memory_shared() { return memory_type == sycl::usm::alloc::shared; }
-
-  unsigned int get_device_id() { return device_id; }
-
-private:
-  sycl::usm::alloc memory_type = sycl::usm::alloc::unknown;
-  const void *device_pointer = nullptr;
-  const void *host_pointer = nullptr;
-  unsigned int device_id = 0;
-};
-
-} // namespace syclcompat
diff --git a/sycl/include/syclcompat/syclcompat.hpp b/sycl/include/syclcompat/syclcompat.hpp
deleted file mode 100644
index 93a3eb81d0f15..0000000000000
--- a/sycl/include/syclcompat/syclcompat.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  syclcompat.hpp
- *
- *  Description:
- *    Main include internal header for SYCLcompat
- **************************************************************************/
-
-#pragma once
-
-// MSVC ignores [[deprecated]] attribute on namespace unless compiled with
-// /W3 or above.
-#ifdef _MSC_VER
-#define __SYCLCOMPAT_STRINGIFY(x) #x
-#define __SYCLCOMPAT_TOSTRING(x) __SYCLCOMPAT_STRINGIFY(x)
-
-#define __SYCLCOMPAT_WARNING(msg)                                              \
-  __pragma(message(__FILE__                                                    \
-                   "(" __SYCLCOMPAT_TOSTRING(__LINE__) "): warning: " msg))
-
-__SYCLCOMPAT_WARNING("syclcompat is deprecated and the deprecation warnings "
-                     "are ignored unless compiled with /W3 or above.")
-
-#undef __SYCLCOMPAT_WARNING
-#undef __SYCLCOMPAT_TOSTRING
-#undef __SYCLCOMPAT_STRINGIFY
-#endif
-
-#include <syclcompat/atomic.hpp>
-#include <syclcompat/defs.hpp>
-#include <syclcompat/device.hpp>
-#include <syclcompat/dims.hpp>
-#include <syclcompat/group_utils.hpp>
-#include <syclcompat/id_query.hpp>
-#include <syclcompat/kernel.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/math.hpp>
-#include <syclcompat/memory.hpp>
-#include <syclcompat/util.hpp>
diff --git a/sycl/include/syclcompat/traits.hpp b/sycl/include/syclcompat/traits.hpp
deleted file mode 100644
index 502fd979d0066..0000000000000
--- a/sycl/include/syclcompat/traits.hpp
+++ /dev/null
@@ -1,295 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  traits.hpp
- *
- *  Description:
- *    Type traits for the SYCL compatibility extension
- **************************************************************************/
-
-#pragma once
-
-#include <sycl/feature_test.hpp>
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-#include <sycl/ext/oneapi/bfloat16.hpp>
-#endif
-#include <cstddef>
-#include <sycl/ext/oneapi/properties/properties.hpp>
-#include <sycl/ext/oneapi/properties/property_value.hpp>
-#include <sycl/nd_item.hpp>
-#include <sycl/nd_range.hpp>
-#include <sycl/range.hpp>
-#include <tuple>
-#include <type_traits>
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-
-// Equivalent to C++20's std::type_identity (used to create non-deduced
-// contexts)
-template <class T> struct type_identity {
-  using type = T;
-};
-template <class T> using type_identity_t = typename type_identity<T>::type;
-
-// Defines the operand type for arithemtic operations on T. This is identity
-// for all types except pointers, for which it is std::ptrdiff_t
-template <typename T> struct arith {
-  using type = std::conditional_t<std::is_pointer_v<T>, std::ptrdiff_t, T>;
-};
-template <typename T> using arith_t = typename arith<T>::type;
-
-// Traits to check device function signature matches args (with or without local
-// mem)
-template <auto F, typename... Args>
-struct device_fn_invocable : std::is_invocable<decltype(F), Args...> {};
-
-template <auto F, typename... Args>
-struct device_fn_lmem_invocable
-    : std::is_invocable<decltype(F), Args..., char *> {};
-
-template <typename LaunchPolicy, auto F, typename... Args>
-constexpr inline bool args_compatible =
-    std::conditional_t<LaunchPolicy::HasLocalMem,
-                       device_fn_lmem_invocable<F, Args...>,
-                       device_fn_invocable<F, Args...>>::value;
-
-namespace detail {
-
-// Trait for identifying sycl::range and sycl::nd_range.
-template <typename T> struct is_range : std::false_type {};
-template <int Dim> struct is_range<sycl::range<Dim>> : std::true_type {};
-
-template <typename T> constexpr bool is_range_v = is_range<T>::value;
-
-template <typename T> struct is_nd_range : std::false_type {};
-template <int Dim> struct is_nd_range<sycl::nd_range<Dim>> : std::true_type {};
-
-template <typename T> constexpr bool is_nd_range_v = is_nd_range<T>::value;
-
-template <typename T>
-constexpr bool is_range_or_nd_range_v =
-    std::disjunction_v<is_range<T>, is_nd_range<T>>;
-
-// Trait range_to_item_t to convert nd_range -> nd_item, range -> item
-template <typename T> struct range_to_item_map;
-template <int Dim> struct range_to_item_map<sycl::nd_range<Dim>> {
-  using ItemT = sycl::nd_item<Dim>;
-};
-template <int Dim> struct range_to_item_map<sycl::range<Dim>> {
-  using ItemT = sycl::item<Dim, false>;
-};
-
-template <typename T>
-using range_to_item_t = typename range_to_item_map<T>::ItemT;
-
-} // namespace detail
-
-// Forward decls
-namespace experimental {
-
-template <typename Properties> struct kernel_properties;
-template <typename Properties> struct launch_properties;
-struct local_mem_size;
-
-template <typename Range, typename KProps, typename LProps, bool LocalMem>
-class launch_policy;
-} // namespace experimental
-
-namespace experimental::detail {
-
-// Helper for tuple_template_index
-template <template <typename TT> typename PropertyContainer, typename Tuple>
-struct tuple_template_index_helper;
-
-template <template <typename TT> typename PropertyContainer>
-struct tuple_template_index_helper<PropertyContainer, std::tuple<>> {
-  static constexpr std::size_t value = 0;
-};
-
-template <template <typename TT> typename PropertyContainer, typename T,
-          typename... Rest>
-struct tuple_template_index_helper<PropertyContainer,
-                                   std::tuple<PropertyContainer<T>, Rest...>> {
-  static constexpr std::size_t value = 0;
-  using RestTuple = std::tuple<Rest...>;
-  static_assert(
-      tuple_template_index_helper<PropertyContainer, RestTuple>::value ==
-          std::tuple_size_v<RestTuple>,
-      "type appears more than once in tuple");
-};
-
-template <template <typename TT> typename PropertyContainer, typename First,
-          typename... Rest>
-struct tuple_template_index_helper<PropertyContainer,
-                                   std::tuple<First, Rest...>> {
-  using RestTuple = std::tuple<Rest...>;
-  static constexpr std::size_t value =
-      1 + tuple_template_index_helper<PropertyContainer, RestTuple>::value;
-};
-
-// tuple_template_index is a trait helper which finds the index of a class
-// template in a std::tuple<Ts...>. During template argument deduction for
-// launch, this enables us to search the tuple for e.g. `kernel_properties`
-// without knowing the concrete type (e.g. kernel_properties<KProps>) A compile
-// time error is raised if the class template is found more than once. If not
-// found, returns the tuple size (i.e. this is not an error).
-template <template <typename TT> typename PropertyContainer, typename Tuple>
-struct tuple_template_index {
-  static constexpr std::size_t value =
-      tuple_template_index_helper<PropertyContainer, Tuple>::value;
-};
-
-// tuple_contains_template piggy-backs on the functionality of
-// tuple_template_index to detect whether a class template exists in the tuple
-template <template <typename TT> typename PropertyContainer, typename Tuple>
-    struct tuple_contains_template
-    : std::conditional_t <
-      tuple_template_index<PropertyContainer, Tuple>::value<
-          std::tuple_size_v<Tuple>, std::true_type, std::false_type> {};
-
-template <bool TupleContains, typename PropertyContainerConcrete,
-          typename Tuple>
-struct property_getter_helper;
-
-template <typename PropertyContainerConcrete, typename Tuple>
-struct property_getter_helper<true, PropertyContainerConcrete, Tuple> {
-  PropertyContainerConcrete operator()(Tuple tuple) {
-    return std::get<PropertyContainerConcrete>(tuple);
-  }
-};
-
-template <typename PropertyContainerConcrete, typename Tuple>
-struct property_getter_helper<false, PropertyContainerConcrete, Tuple> {
-  PropertyContainerConcrete operator()(Tuple) {
-    return {};
-  }
-};
-
-// For local_mem_size
-template <typename T, typename Tuple> struct has_type;
-
-template <typename T, typename... Us>
-struct has_type<T, std::tuple<Us...>>
-    : std::disjunction<std::is_same<T, Us>...> {};
-
-template <template <typename TT> typename PropertyContainer,
-          typename PropertyContainerConcrete, typename Tuple>
-using property_getter = property_getter_helper<
-    detail::tuple_contains_template<PropertyContainer, Tuple>::value,
-    PropertyContainerConcrete, Tuple>;
-
-template <typename PropertyContainerConcrete, typename Tuple>
-using local_mem_getter =
-    property_getter_helper<has_type<PropertyContainerConcrete, Tuple>::value,
-                           PropertyContainerConcrete, Tuple>;
-
-// Helpers for properties_or_empty
-template <bool InTuple, template <typename TT> typename PropertyContainer,
-          typename... Ts>
-struct properties_or_empty_helper;
-
-template <template <typename TT> typename PropertyContainer, typename... Ts>
-struct properties_or_empty_helper<false, PropertyContainer, Ts...> {
-  using Props = sycl::ext::oneapi::experimental::empty_properties_t;
-};
-
-template <template <typename TT> typename PropertyContainer, typename... Ts>
-struct properties_or_empty_helper<true, PropertyContainer, Ts...> {
-  using Props = typename std::tuple_element_t<
-      tuple_template_index<PropertyContainer, std::tuple<Ts...>>::value,
-      std::tuple<Ts...>>::Props;
-};
-
-// Template type alias which searches variadic types for e.g.
-// syclcompat::experimental::kernel_properties, launch_properties and returns
-// the contained sycl_exp::properties. If not found, returns
-// sycl_exp::empty_properties_t
-template <template <typename TT> typename PropertyContainer, typename... Ts>
-using properties_or_empty = typename properties_or_empty_helper<
-    tuple_contains_template<PropertyContainer, std::tuple<Ts...>>::value,
-    PropertyContainer, Ts...>::Props;
-
-// Traits to detect objects related to compat_exp::launch
-// ========================================================
-
-// Trait to detect compat_exp::kernel_properties
-template <typename T> struct is_kernel_properties : std::false_type {};
-template <typename TT>
-struct is_kernel_properties<kernel_properties<TT>> : std::true_type {};
-
-// Trait to detect compat_exp::launch_properties
-template <typename T> struct is_launch_properties : std::false_type {};
-template <typename TT>
-struct is_launch_properties<launch_properties<TT>> : std::true_type {};
-
-// Trait to detect compat_exp::local_mem_size
-template <typename T> struct is_local_mem_size : std::false_type {};
-template <> struct is_local_mem_size<local_mem_size> : std::true_type {};
-
-// Traits to detect compat_exp::launch_policy
-template <typename T> struct is_launch_policy : std::false_type {};
-
-template <typename RangeT, typename KProps, typename LProps, bool LocalMem>
-struct is_launch_policy<launch_policy<RangeT, KProps, LProps, LocalMem>>
-    : std::true_type {};
-
-template <typename T>
-inline constexpr bool is_launch_policy_v = is_launch_policy<T>::value;
-
-// Trait to detect if all args are sycl_exp property types
-template <typename... Args>
-using are_all_props = std::conjunction<
-    sycl::ext::oneapi::experimental::is_property_value<Args>...>;
-
-} // namespace experimental::detail
-
-// Trait for extended floating point definition
-template <typename T>
-struct is_floating_point : std::is_floating_point<T>{};
-
-template <> struct is_floating_point<sycl::half> : std::true_type {};
-
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-template <> struct is_floating_point<sycl::ext::oneapi::bfloat16> : std::true_type {};
-#endif
-
-template <typename T>
-inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
-
-} // namespace syclcompat
-
-// Specialize std::common_type for bfloat16
-// Semantics here match bfloat16.hpp operator overloads (all mixed type math
-// ops return bfloat16)
-// TODO(syclcompat-lib-reviewers) Move this to bfloat extension
-namespace std {
-template <> struct common_type<sycl::ext::oneapi::bfloat16> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-
-template <>
-struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-
-template <typename T> struct common_type<sycl::ext::oneapi::bfloat16, T> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-
-template <typename T> struct common_type<T, sycl::ext::oneapi::bfloat16> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-} // namespace std
diff --git a/sycl/include/syclcompat/util.hpp b/sycl/include/syclcompat/util.hpp
deleted file mode 100644
index 2e80be3480214..0000000000000
--- a/sycl/include/syclcompat/util.hpp
+++ /dev/null
@@ -1,1192 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCL compatibility extension
- *
- *  util.hpp
- *
- *  Description:
- *    util functionality for the SYCL compatibility extension
- **************************************************************************/
-
-// The original source was under the license below:
-//==---- util.hpp ---------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cassert>
-#include <type_traits>
-
-#include <sycl/atomic_ref.hpp>
-#include <sycl/group_barrier.hpp>
-#include <sycl/kernel_bundle.hpp>
-
-#include <syclcompat/math.hpp>
-#include <syclcompat/memory.hpp>
-#include <syclcompat/dims.hpp>
-
-#if defined(__NVPTX__)
-#include <sycl/ext/oneapi/experimental/cuda/masked_shuffles.hpp>
-#endif
-
-// Declarations of the SPIR-V non-uniform shuffle built-ins called by the
-// masked sub-group helpers in syclcompat::experimental below. They are only
-// declared for device compilation with the Intel LLVM compiler.
-// TODO: Remove these function declarations once they exist in the DPC++
-// compiler.
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
-// Shuffle: takes a scope, a value and an unsigned index.
-template <typename T>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
-    __attribute__((noduplicate)) T
-    __spirv_GroupNonUniformShuffle(__spv::Scope::Flag, T, unsigned) noexcept;
-
-// Shuffle down: takes a scope, a value and an unsigned delta.
-template <typename T>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
-    __attribute__((noduplicate)) T
-    __spirv_GroupNonUniformShuffleDown(__spv::Scope::Flag, T,
-                                       unsigned) noexcept;
-
-// Shuffle up: takes a scope, a value and an unsigned delta.
-template <typename T>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT
-    __attribute__((noduplicate)) T
-    __spirv_GroupNonUniformShuffleUp(__spv::Scope::Flag, T, unsigned) noexcept;
-#endif
-
-namespace [[deprecated("syclcompat is deprecated")]] syclcompat {
-
-namespace detail {
-
-/// Thin wrapper around a value of type \p T. The \p tag parameter is not used
-/// by the class body; it only makes wrappers with different tags distinct
-/// types.
-template <typename tag, typename T> class generic_error_type {
-  T value;
-
-public:
-  /// Default construction leaves the wrapped value default-initialized.
-  generic_error_type() = default;
-
-  /// Wraps \p v (implicit conversion from T is allowed).
-  generic_error_type(T v) : value{v} {}
-
-  /// Implicitly converts back to the wrapped value.
-  operator T() const { return value; }
-};
-
-/// Maps an element type to the type whose size is used when copying matrix
-/// elements (see the typed matrix_mem_copy overload). By default the type
-/// maps to itself.
-template <typename T> struct DataType {
-  using T2 = T;
-};
-/// A two-element sycl::vec<T, 2> maps to detail::complex_type<T>.
-template <typename T> struct DataType<sycl::vec<T, 2>> {
-  using T2 = detail::complex_type<T>;
-};
-
-/// Untyped matrix copy: the element size is passed explicitly in bytes.
-/// \tparam T Dummy parameter that must stay \c void (enforced below).
-/// \param [out] to_ptr Pointer to the destination location.
-/// \param [in] from_ptr Pointer to the source location.
-/// \param [in] to_ld Leading dimension of the destination matrix.
-/// \param [in] from_ld Leading dimension of the source matrix.
-/// \param [in] rows Number of rows of the source matrix.
-/// \param [in] cols Number of columns of the source matrix.
-/// \param [in] elem_size Size of one element in bytes.
-/// \param [in] queue The queue on which the copy is submitted.
-/// \param [in] async If true, the function returns without waiting for the
-/// copy to complete.
-template <typename T = void>
-inline void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld,
-                            int from_ld, int rows, int cols, int elem_size,
-                            sycl::queue queue = syclcompat::get_default_queue(),
-                            bool async = false) {
-  static_assert(
-      std::is_same_v<T, void>,
-      "syclcompat::matrix_mem_copy only accepts a dummy template parameter, T "
-      "= void, which prevents SYCL kernel generation by default.");
-  const bool same_ld = (to_ld == from_ld);
-  if (same_ld && to_ptr == from_ptr) {
-    // Source and destination already coincide; nothing to copy.
-    return;
-  }
-
-  if (!same_ld) {
-    // Leading dimensions differ: use the strided detail::memcpy overload
-    // (presumably a pitched 2D copy; confirm against memory.hpp).
-    auto events = detail::memcpy(queue, to_ptr, from_ptr, elem_size * to_ld,
-                                 elem_size * from_ld, elem_size * rows, cols);
-    if (!async)
-      sycl::event::wait(events);
-    return;
-  }
-
-  // Equal leading dimensions: one contiguous copy that spans all columns but
-  // stops after the last row of the final column.
-  const size_t copy_size = elem_size * ((cols - 1) * (size_t)to_ld + rows);
-  auto event =
-      detail::memcpy(queue, (void *)to_ptr, (void *)from_ptr, copy_size);
-  if (!async)
-    event.wait();
-}
-
-/// Copy matrix data. The default leading dimension is column.
-/// \param [out] to_ptr Pointer to the destination location.
-/// \param [in] from_ptr Pointer to the source location.
-/// \param [in] to_ld Leading dimension of the destination matrix.
-/// \param [in] from_ld Leading dimension of the source matrix.
-/// \param [in] rows Number of rows of the source matrix.
-/// \param [in] cols Number of columns of the source matrix.
-/// \param [in] queue The queue on which the routine is executed.
-/// \param [in] async If true, returning from the function does NOT guarantee
-/// that the copy has completed.
-template <typename T>
-inline void matrix_mem_copy(T *to_ptr, const T *from_ptr, int to_ld,
-                            int from_ld, int rows, int cols,
-                            sycl::queue queue = get_default_queue(),
-                            bool async = false) {
-  // The element size comes from DataType<T>::T2, not from T directly, so a
-  // sycl::vec<U, 2> is sized as detail::complex_type<U>.
-  matrix_mem_copy((void *)to_ptr, (void *)from_ptr, to_ld, from_ld, rows, cols,
-                  sizeof(typename DataType<T>::T2), queue, async);
-}
-} // namespace detail
-
-/// Two distinct int-wrapping error types; the differing tag structs keep them
-/// from being interchangeable even though both wrap an int.
-using err0 = detail::generic_error_type<struct err0_tag, int>;
-using err1 = detail::generic_error_type<struct err1_tag, int>;
-
-/// Cast the high or low 32 bits of a double to an integer.
-/// \param [in] d The double value.
-/// \param [in] use_high32 Cast the high 32 bits of the double if true;
-/// otherwise cast the low 32 bits.
-/// \returns The selected 32-bit half of the bit pattern of \p d.
-inline int cast_double_to_int(double d, bool use_high32 = true) {
-  sycl::vec<double, 1> v0{d};
-  auto v1 = v0.as<sycl::int2>();
-  // as<>() reinterprets the bytes in memory order, so on little-endian targets
-  // element 0 is the low word and element 1 is the high word.
-  if (use_high32)
-    return v1[1];
-  return v1[0];
-}
-
-/// Combine two integers, the first as the high 32 bits and the second
-/// as the low 32 bits, into a double.
-/// \param [in] high32 The integer as the high 32 bits
-/// \param [in] low32 The integer as the low 32 bits
-/// \returns The double whose bit pattern is (high32 << 32) | low32.
-inline double cast_ints_to_double(int high32, int low32) {
-  // as<>() reinterprets the bytes in memory order, so on little-endian targets
-  // the low word must be element 0 and the high word element 1.
-  sycl::int2 v0{low32, high32};
-  auto v1 = v0.as<sycl::vec<double, 1>>();
-  return v1;
-}
-
-/// Reverse the bit order of an unsigned integer
-/// \tparam T Unsigned integral type (enforced by static_assert).
-/// \param [in] a Input unsigned integer value
-/// \returns Value of a with the bit order reversed
-template <typename T> inline T reverse_bits(T a) {
-  static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
-                "unsigned integer required");
-#if defined(__NVPTX__)
-  // 32-bit values use the brev.b32 instruction.
-  if constexpr (sizeof(T) == 4) {
-    unsigned result;
-    asm volatile("brev.b32 %0, %1;" : "=r"(result) : "r"(a));
-    return result;
-  }
-#endif // __NVPTX__
-  if (!a)
-    return 0;
-  // Swap halves, then quarters, and so on down to single bits. `mask` selects
-  // the low half of each chunk currently being swapped.
-  T mask = 0;
-  size_t count = 4 * sizeof(T);
-  // Cast back to T before shifting: for types narrower than int, ~mask is
-  // promoted to int and the shift would otherwise yield an all-ones mask.
-  mask = static_cast<T>(~mask) >> count;
-  while (count) {
-    a = ((a & mask) << count) | ((a & ~mask) >> count);
-    count = count >> 1;
-    mask = mask ^ (mask << count);
-  }
-  return a;
-}
-
-/// Select four bytes out of the eight bytes of \p a and \p b.
-/// \param [in] a The first value, supplying bytes 0-3
-/// \param [in] b The second value, supplying bytes 4-7
-/// \param [in] s The selector value; only the lower 16 bits are used. Each
-/// 4-bit field picks the source byte (low 3 bits) for one result byte.
-/// \returns the permutation result of 4 bytes selected in the way
-/// specified by \p s from \p a and \p b
-inline unsigned int byte_level_permute(unsigned int a, unsigned int b,
-                                       unsigned int s) {
-  // 64-bit pool of candidate bytes: b in the upper half, a in the lower half.
-  const std::uint64_t pool = (std::uint64_t)b << 32 | a;
-  unsigned int ret = 0;
-  for (unsigned int i = 0; i < 4; ++i) {
-    const unsigned int sel = (s >> (4 * i)) & 0x7;
-    const unsigned int byte = (unsigned int)((pool >> (sel * 8)) & 0xff);
-    ret |= byte << (8 * i);
-  }
-  return ret;
-}
-
-/// \brief The function performs bitwise logical operations on three input
-/// values of \p a, \p b and \p c based on the specified 8-bit truth table \p
-/// lut and return the result
-///
-/// Bit i of \p lut gives the output bit for the input combination
-/// i = (a_bit << 2) | (b_bit << 1) | c_bit.
-///
-/// \param [in] a Input value
-/// \param [in] b Input value
-/// \param [in] c Input value
-/// \param [in] lut truth table for looking up
-/// \returns The result
-inline uint32_t ternary_logic_op(uint32_t a, uint32_t b, uint32_t c,
-                                 uint8_t lut) {
-  uint32_t result = 0;
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  asm volatile("lop3.b32 %0, %1, %2, %3, %4;"
-               : "=r"(result)
-               : "r"(a), "r"(b), "r"(c), "n"(lut));
-#else
-  // Common tables get a direct expression; everything else goes through the
-  // generic minterm expansion in the default branch.
-  switch (lut) {
-  case 0x0:
-    result = 0;
-    break;
-  case 0x1:
-    result = ~a & ~b & ~c;
-    break;
-  case 0x2:
-    result = ~a & ~b & c;
-    break; // Must not fall through into case 0x4.
-  case 0x4:
-    result = ~a & b & ~c;
-    break;
-  case 0x8:
-    result = ~a & b & c;
-    break;
-  case 0x10:
-    result = a & ~b & ~c;
-    break;
-  case 0x20:
-    result = a & ~b & c;
-    break;
-  case 0x40:
-    result = a & b & ~c;
-    break;
-  case 0x80:
-    result = a & b & c;
-    break;
-  case 0x1a:
-    result = ((a & b) | c) ^ a;
-    break;
-  case 0x1e:
-    result = a ^ (b | c);
-    break;
-  case 0x2d:
-    result = ~a ^ (~b & c);
-    break;
-  case 0x78:
-    result = a ^ (b & c);
-    break;
-  case 0x96:
-    result = a ^ b ^ c;
-    break;
-  case 0xb4:
-    result = a ^ (b & ~c);
-    break;
-  case 0xb8:
-    result = a ^ (b & (c ^ a));
-    break;
-  case 0xd2:
-    result = a ^ (~b & c);
-    break;
-  case 0xe8:
-    result = (a & (b | c)) | (b & c);
-    break;
-  case 0xea:
-    result = (a & b) | c;
-    break;
-  case 0xfe:
-    result = a | b | c;
-    break;
-  case 0xff:
-    result = 0xFFFFFFFFu;
-    break;
-  default: {
-    // OR together one minterm per set bit of the truth table.
-    if (lut & 0x01)
-      result |= ~a & ~b & ~c;
-    if (lut & 0x02)
-      result |= ~a & ~b & c;
-    if (lut & 0x04)
-      result |= ~a & b & ~c;
-    if (lut & 0x08)
-      result |= ~a & b & c;
-    if (lut & 0x10)
-      result |= a & ~b & ~c;
-    if (lut & 0x20)
-      result |= a & ~b & c;
-    if (lut & 0x40)
-      result |= a & b & ~c;
-    if (lut & 0x80)
-      result |= a & b & c;
-    break;
-  }
-  }
-#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  return result;
-}
-
-/// Find the 1-based position of the least significant set bit in an integer.
-/// ffs(0) returns 0.
-///
-/// \tparam T Integral type (enforced by static_assert).
-/// \param [in] a Input integer value
-/// \returns The position
-template <typename T> inline int ffs(T a) {
-  static_assert(std::is_integral<T>::value, "integer required");
-  // The modulus is one more than the bit width of T, so a result of
-  // "bit width + 1" wraps around to 0.
-  constexpr auto wrap = sizeof(T) * 8 + 1;
-  return (sycl::ctz(a) + 1) % wrap;
-}
-
-/// select_from_sub_group lets a work-item obtain a copy of a value held by any
-/// other work-item in the sub_group. The input sub_group is divided into
-/// logical sub_groups with id range [0, \p logical_sub_group_size - 1]. Each
-/// work-item in a logical sub_group gets the value from the work-item whose id
-/// is \p remote_local_id. If \p remote_local_id is outside the logical
-/// sub_group id range, it is taken modulo \p logical_sub_group_size. The
-/// \p logical_sub_group_size must be a power of 2 and must not exceed the
-/// input sub_group size.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] remote_local_id Input source work item id
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id,
-                        int logical_sub_group_size = 32) {
-  // First lane of the logical sub_group that contains the caller.
-  const unsigned int base =
-      g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
-  // Source lane offset, wrapped into the logical sub_group.
-  const int offset = remote_local_id % logical_sub_group_size;
-  return sycl::select_from_group(g, x, base + offset);
-}
-
-/// shift_sub_group_left moves values held by the work-items in a sub_group
-/// directly to another work-item in the sub_group, by shifting values a fixed
-/// number of work-items to the left. The input sub_group is divided into
-/// logical sub_groups with id range [0, \p logical_sub_group_size - 1]. Each
-/// work-item in a logical sub_group gets the value from the work-item whose id
-/// is the caller's id plus \p delta. If the calculated id is outside the
-/// logical sub_group id range, the work-item keeps its own value. The
-/// \p logical_sub_group_size must be a power of 2 and must not exceed the
-/// input sub_group size.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] delta Input delta
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta,
-                       int logical_sub_group_size = 32) {
-  const unsigned int lane = g.get_local_linear_id();
-  // One past the last lane of the caller's logical sub_group.
-  const unsigned int end_index =
-      (lane / logical_sub_group_size + 1) * logical_sub_group_size;
-  // Every work-item performs the shift; out-of-range lanes discard it below.
-  const T shifted = sycl::shift_group_left(g, x, delta);
-  if ((lane + delta) >= end_index) {
-    return x;
-  }
-  return shifted;
-}
-
-/// shift_sub_group_right moves values held by the work-items in a sub_group
-/// directly to another work-item in the sub_group, by shifting values a fixed
-/// number of work-items to the right. The input sub_group is divided into
-/// logical sub_groups with id range [0, \p logical_sub_group_size - 1]. Each
-/// work-item in a logical sub_group gets the value from the work-item whose id
-/// is the caller's id minus \p delta. If the calculated id is outside the
-/// logical sub_group id range, the work-item keeps its own value. The
-/// \p logical_sub_group_size must be a power of 2 and must not exceed the
-/// input sub_group size.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] delta Input delta
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta,
-                        int logical_sub_group_size = 32) {
-  const unsigned int lane = g.get_local_linear_id();
-  // First lane of the caller's logical sub_group.
-  const unsigned int start_index =
-      lane / logical_sub_group_size * logical_sub_group_size;
-  // Every work-item performs the shift; out-of-range lanes discard it below.
-  const T shifted = sycl::shift_group_right(g, x, delta);
-  if ((lane - start_index) < delta) {
-    return x;
-  }
-  return shifted;
-}
-
-/// permute_sub_group_by_xor permutes values by exchanging values held by pairs
-/// of work-items identified by computing the bitwise exclusive OR of the
-/// work-item id and some fixed mask. The input sub_group is divided into
-/// logical sub_groups with id range [0, \p logical_sub_group_size - 1]. Each
-/// work-item in a logical sub_group gets the value from the work-item whose id
-/// is the bitwise exclusive OR of the caller's id and \p mask. If the
-/// calculated id is outside the logical sub_group id range, the work-item
-/// keeps its own value. The \p logical_sub_group_size must be a power of 2 and
-/// must not exceed the input sub_group size.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] mask Input mask
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
-                           int logical_sub_group_size = 32) {
-  // With the default logical size the group-wide permute is called directly.
-  if (logical_sub_group_size == 32) {
-    return permute_group_by_xor(g, x, mask);
-  }
-  const unsigned int lane = g.get_local_linear_id();
-  const unsigned int start_index =
-      lane / logical_sub_group_size * logical_sub_group_size;
-  const unsigned int target_offset = (lane % logical_sub_group_size) ^ mask;
-  // Fall back to the caller's own lane when the partner is out of range.
-  unsigned int source = lane;
-  if (target_offset < logical_sub_group_size) {
-    source = start_index + target_offset;
-  }
-  return sycl::select_from_group(g, x, source);
-}
-
-namespace experimental {
-/// Masked version of select_from_sub_group, which executes a masked sub-group
-/// operation. The parameter \p member_mask indicates the work-items
-/// participating in the call: the n-th bit is set to 1 if the work-item with
-/// id n participates. All work-items named in \p member_mask must execute the
-/// call with the same \p member_mask, or the result is undefined.
-/// Throws sycl::exception (errc::runtime) on host, with a non-Intel compiler,
-/// or on a device target that is neither SPIR nor NVPTX.
-/// \tparam T Input value type
-/// \param [in] member_mask Input mask
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] remote_local_id Input source work item id
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T select_from_sub_group(unsigned int member_mask, sycl::sub_group g, T x,
-                        int remote_local_id, int logical_sub_group_size = 32) {
-  // First lane of the caller's logical sub_group.
-  unsigned int start_index =
-      g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
-  // Absolute source lane; only the SPIR branch below consumes it.
-  unsigned logical_remote_id =
-      start_index + remote_local_id % logical_sub_group_size;
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
-#if defined(__SPIR__)
-  // Note: member_mask is not passed to the SPIR-V built-in.
-  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x,
-                                        logical_remote_id);
-#elif defined(__NVPTX__)
-  // The logical sub_group size is packed into cVal; the unwrapped
-  // remote_local_id is passed through.
-  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
-  return cuda_shfl_sync_idx_i32(member_mask, x, remote_local_id, cVal);
-#else
-  throw sycl::exception(sycl::errc::runtime,
-                        "[SYCLcompat] Masked version of select_from_sub_group "
-                        "only supports SPIR-V or cuda backends.");
-#endif // __SPIR__
-#else
-  // Host / non-Intel compiler: silence unused-parameter warnings, then fail.
-  (void)g;
-  (void)x;
-  (void)remote_local_id;
-  (void)logical_sub_group_size;
-  (void)member_mask;
-  throw sycl::exception(
-      sycl::errc::runtime,
-      "[SYCLcompat] Masked version of select_from_sub_group not "
-      "supported on host device and non intel compiler.");
-#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
-}
-
-/// Masked version of shift_sub_group_left, which executes a masked sub-group
-/// operation. The parameter \p member_mask indicates the work-items
-/// participating in the call: the n-th bit is set to 1 if the work-item with
-/// id n participates. All work-items named in \p member_mask must execute the
-/// call with the same \p member_mask, or the result is undefined.
-/// Throws sycl::exception (errc::runtime) on host, with a non-Intel compiler,
-/// or on a device target that is neither SPIR nor NVPTX.
-/// \tparam T Input value type
-/// \param [in] member_mask Input mask
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] delta Input delta
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T shift_sub_group_left(unsigned int member_mask, sycl::sub_group g, T x,
-                       unsigned int delta, int logical_sub_group_size = 32) {
-  unsigned int id = g.get_local_linear_id();
-  // One past the last lane of the caller's logical sub_group; only the SPIR
-  // branch below consumes it.
-  unsigned int end_index =
-      (id / logical_sub_group_size + 1) * logical_sub_group_size;
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
-#if defined(__SPIR__)
-  // Note: member_mask is not passed to the SPIR-V built-in.
-  T result =
-      __spirv_GroupNonUniformShuffleDown(__spv::Scope::Subgroup, x, delta);
-  // A source lane past the end of the logical sub_group keeps its own value.
-  if ((id + delta) >= end_index) {
-    result = x;
-  }
-  return result;
-#elif defined(__NVPTX__)
-  // The logical sub_group size is packed into cVal.
-  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
-  return cuda_shfl_sync_down_i32(member_mask, x, delta, cVal);
-#else
-  throw sycl::exception(sycl::errc::runtime,
-                        "[SYCLcompat] Masked version of shift_sub_group_left "
-                        "only supports SPIR-V or cuda backends.");
-#endif // __SPIR__
-#else
-  // Host / non-Intel compiler: silence unused-parameter warnings, then fail.
-  (void)g;
-  (void)x;
-  (void)delta;
-  (void)logical_sub_group_size;
-  (void)member_mask;
-  throw sycl::exception(
-      sycl::errc::runtime,
-      "[SYCLcompat] Masked version of shift_sub_group_left not "
-      "supported on host device and non intel compiler.");
-#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
-}
-
-/// Masked version of shift_sub_group_right, which executes a masked sub-group
-/// operation. The parameter \p member_mask indicates the work-items
-/// participating in the call: the n-th bit is set to 1 if the work-item with
-/// id n participates. All work-items named in \p member_mask must execute the
-/// call with the same \p member_mask, or the result is undefined.
-/// Throws sycl::exception (errc::runtime) on host, with a non-Intel compiler,
-/// or on a device target that is neither SPIR nor NVPTX.
-/// \tparam T Input value type
-/// \param [in] member_mask Input mask
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] delta Input delta
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T shift_sub_group_right(unsigned int member_mask, sycl::sub_group g, T x,
-                        unsigned int delta, int logical_sub_group_size = 32) {
-  unsigned int id = g.get_local_linear_id();
-  // First lane of the caller's logical sub_group; only the SPIR branch below
-  // consumes it.
-  unsigned int start_index =
-      id / logical_sub_group_size * logical_sub_group_size;
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
-#if defined(__SPIR__)
-  // Note: member_mask is not passed to the SPIR-V built-in.
-  T result = __spirv_GroupNonUniformShuffleUp(__spv::Scope::Subgroup, x, delta);
-  // A source lane before the start of the logical sub_group keeps its own
-  // value.
-  if ((id - start_index) < delta) {
-    result = x;
-  }
-  return result;
-#elif defined(__NVPTX__)
-  // Unlike the other masked helpers, the low bits of cVal are left at 0 here.
-  int cVal = ((32 - logical_sub_group_size) << 8);
-  return cuda_shfl_sync_up_i32(member_mask, x, delta, cVal);
-#else
-  throw sycl::exception(sycl::errc::runtime,
-                        "Masked version of shift_sub_group_right "
-                        "only supports SPIR-V or cuda backends.");
-#endif // __SPIR__
-#else
-  // Host / non-Intel compiler: silence unused-parameter warnings, then fail.
-  (void)g;
-  (void)x;
-  (void)delta;
-  (void)logical_sub_group_size;
-  (void)member_mask;
-  throw sycl::exception(sycl::errc::runtime,
-                        "Masked version of shift_sub_group_right not "
-                        "supported on host device and non intel compiler.");
-#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
-}
-
-/// Masked version of permute_sub_group_by_xor, which executes a masked
-/// sub-group operation. The parameter \p member_mask indicates the work-items
-/// participating in the call: the n-th bit is set to 1 if the work-item with
-/// id n participates. All work-items named in \p member_mask must execute the
-/// call with the same \p member_mask, or the result is undefined.
-/// Throws sycl::exception (errc::runtime) on host, with a non-Intel compiler,
-/// or on a device target that is neither SPIR nor NVPTX.
-/// \tparam T Input value type
-/// \param [in] member_mask Input mask
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] mask Input mask
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T permute_sub_group_by_xor(unsigned int member_mask, sycl::sub_group g, T x,
-                           unsigned int mask, int logical_sub_group_size = 32) {
-  unsigned int id = g.get_local_linear_id();
-  // First lane of the caller's logical sub_group.
-  unsigned int start_index =
-      id / logical_sub_group_size * logical_sub_group_size;
-  // Partner offset within the logical sub_group.
-  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
-  // Absolute partner lane, or the caller's own lane when the partner is out
-  // of range; only the SPIR branch below consumes it.
-  unsigned logical_remote_id = (target_offset < logical_sub_group_size)
-                                   ? start_index + target_offset
-                                   : id;
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
-#if defined(__SPIR__)
-  // Note: member_mask is not passed to the SPIR-V built-in.
-  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x,
-                                        logical_remote_id);
-#elif defined(__NVPTX__)
-  // The logical sub_group size is packed into cVal.
-  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
-  return cuda_shfl_sync_bfly_i32(member_mask, x, mask, cVal);
-#else
-  throw sycl::exception(
-      sycl::errc::runtime,
-      "[SYCLcompat] Masked version of permute_sub_group_by_xor "
-      "only supports SPIR-V or cuda backends.");
-#endif // __SPIR__
-#else
-  // Host / non-Intel compiler: silence unused-parameter warnings, then fail.
-  (void)g;
-  (void)x;
-  (void)mask;
-  (void)logical_sub_group_size;
-  (void)member_mask;
-  throw sycl::exception(
-      sycl::errc::runtime,
-      "[SYCLcompat]Masked version of permute_sub_group_by_xor not "
-      "supported on host device and non intel compiler.");
-#endif // __SYCL_DEVICE_ONLY__ && __INTEL_LLVM_COMPILER
-}
-} // namespace experimental
-
-/// Inherited from the original SYCLomatic compatibility headers.
-/// @return the value of SYCL_LANGUAGE_VERSION when the compiler defines it,
-/// 202000 otherwise.
-inline int get_sycl_language_version() {
-#ifdef SYCL_LANGUAGE_VERSION
-  constexpr int version = SYCL_LANGUAGE_VERSION;
-#else
-  constexpr int version = 202000;
-#endif
-  return version;
-}
-
-/// The function match_any_over_sub_group compares values across the
-/// work-items of a sub-group. It returns a mask whose set bits identify the
-/// participating work-items that provided the same \p value as the caller.
-/// The n-th bit of the mask represents the work-item with id n. The parameter
-/// \p member_mask indicates the work-items participating in the call.
-/// Returns 0 immediately when \p member_mask is 0.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] member_mask Input mask
-/// \param [in] value Input value
-/// \returns The result
-template <typename T>
-unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask,
-                                      T value) {
-  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type.");
-  if (!member_mask) {
-    return 0;
-  }
-  unsigned int id = g.get_local_linear_id();
-  // flag accumulates the lanes that have already been assigned to a match set.
-  unsigned int flag = 0, result = 0, reduce_result = 0;
-  unsigned int bit_index = 0x1 << id;
-  bool is_participate = member_mask & bit_index;
-  T broadcast_value = 0;
-  bool matched = false;
-  // Each round handles one distinct value: broadcast the value of the lowest
-  // participating lane not yet in flag, then collect every participating lane
-  // holding the same value.
-  while (flag != member_mask) {
-    broadcast_value =
-        sycl::select_from_group(g, value, sycl::ctz((~flag & member_mask)));
-    // Each lane contributes at most its own distinct bit, so the sum acts as
-    // a bitwise OR of the matching lanes.
-    reduce_result = sycl::reduce_over_group(
-        g, is_participate ? (broadcast_value == value ? bit_index : 0) : 0,
-        sycl::plus<>());
-    flag |= reduce_result;
-    matched = reduce_result & bit_index;
-    // Branch-free select: lanes in this round's set take reduce_result, all
-    // others keep their previous result.
-    result = matched * reduce_result + (1 - matched) * result;
-  }
-  return result;
-}
-
-/// The function match_all_over_sub_group compares values across the
-/// work-items of a sub-group. It returns \p member_mask and sets \p pred to 1
-/// if the calling work-item participates and every work-item in
-/// \p member_mask provided the same \p value; otherwise it returns 0 and sets
-/// \p pred to 0. The n-th bit of \p member_mask represents the work-item with
-/// id n. When \p member_mask is 0 the function returns 0 without writing
-/// \p pred.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] member_mask Input mask
-/// \param [in] value Input value
-/// \param [out] pred Output predicate
-/// \returns The result
-template <typename T>
-unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask,
-                                      T value, int *pred) {
-  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type.");
-  if (!member_mask) {
-    return 0;
-  }
-  const unsigned int lane = g.get_local_linear_id();
-  const unsigned int lane_bit = 0x1 << lane;
-  const bool is_participate = member_mask & lane_bit;
-
-  // Reference value: the one held by the lowest lane named in member_mask.
-  const T reference =
-      sycl::select_from_group(g, value, sycl::ctz(member_mask));
-
-  // A participating lane that agrees with the reference contributes its bit.
-  unsigned int vote = 0;
-  if (is_participate && reference == value) {
-    vote = lane_bit;
-  }
-  const unsigned int agreeing =
-      sycl::reduce_over_group(g, vote, sycl::plus<>());
-
-  const bool unanimous = is_participate && (agreeing == member_mask);
-  *pred = unanimous;
-  return unanimous ? member_mask : 0;
-}
-
-namespace experimental {
-
-// Memory order of the atomic counter used by nd_range_barrier: acq_rel when
-// compiling for AMDGPU or NVPTX, seq_cst otherwise.
-// FIXME(@intel/syclcompat-lib-reviewers): unify once supported in the CUDA and
-// AMD backends.
-#if defined(__AMDGPU__) || defined(__NVPTX__)
-constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::acq_rel;
-#else
-constexpr sycl::memory_order barrier_memory_order = sycl::memory_order::seq_cst;
-#endif
-
-/// Synchronize work items from all work groups within a SYCL kernel.
-/// Only the 3-dimensional form of this primary template is accepted
-/// (static_assert); a separate specialization handles 1 dimension.
-/// \param [in] item The calling work-item.
-/// \param [in] counter An atomic object in device memory that can be accessed
-/// by work items in all work groups. The initial value of the counter should
-/// be zero.
-/// Note: Please make sure that all the work items of all work groups within
-/// a SYCL kernel can be scheduled actively at the same time on a device.
-template <int dimensions = 3>
-inline void nd_range_barrier(
-    const sycl::nd_item<dimensions> &item,
-    sycl::atomic_ref<unsigned int, barrier_memory_order,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &counter) {
-
-  static_assert(dimensions == 3, "dimensions must be 3.");
-  constexpr unsigned int MSB32_MASK = 0x80000000;
-
-  unsigned int num_groups = item.get_group_range(2) * item.get_group_range(1) *
-                            item.get_group_range(0);
-
-  // Make sure the whole work-group has arrived before its leader signals.
-  item.barrier();
-
-  // Only the first work-item of each work-group touches the counter.
-  if (item.get_local_linear_id() == 0) {
-    unsigned int inc = 1;
-    unsigned int old_arrive = 0;
-    bool is_group0 =
-        (item.get_group(2) + item.get_group(1) + item.get_group(0) == 0);
-    // Group 0 adds MSB32_MASK - (num_groups - 1) and every other group adds
-    // 1, so the increments of one full round sum to MSB32_MASK and flip the
-    // counter's top bit.
-    if (is_group0) {
-      inc = MSB32_MASK - (num_groups - 1);
-    }
-
-    old_arrive = counter.fetch_add(inc);
-    // Synchronize all the work groups: spin until the counter's top bit
-    // differs from the value observed on arrival.
-    while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
-      ;
-  }
-
-  // Hold the rest of the work-group until its leader leaves the spin loop.
-  item.barrier();
-}
-
-/// Synchronize work items from all work groups within a SYCL kernel
-/// (specialization for 1-dimensional nd_items).
-/// \param [in] item The calling work-item.
-/// \param [in] counter An atomic object in device memory that can be accessed
-/// by work items in all work groups. The initial value of the counter should
-/// be zero.
-/// Note: Please make sure that all the work items of all work groups within
-/// a SYCL kernel can be scheduled actively at the same time on a device.
-template <>
-inline void nd_range_barrier(
-    const sycl::nd_item<1> &item,
-    sycl::atomic_ref<unsigned int, barrier_memory_order,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &counter) {
-  unsigned int num_groups = item.get_group_range(0);
-  constexpr unsigned int MSB32_MASK = 0x80000000;
-
-  // Make sure the whole work-group has arrived before its leader signals.
-  item.barrier();
-
-  // Only the first work-item of each work-group touches the counter.
-  if (item.get_local_linear_id() == 0) {
-    unsigned int inc = 1;
-    unsigned int old_arrive = 0;
-    bool is_group0 = (item.get_group(0) == 0);
-    // Group 0 adds MSB32_MASK - (num_groups - 1) and every other group adds
-    // 1, so the increments of one full round sum to MSB32_MASK and flip the
-    // counter's top bit.
-    if (is_group0) {
-      inc = MSB32_MASK - (num_groups - 1);
-    }
-
-    old_arrive = counter.fetch_add(inc);
-    // Synchronize all the work groups: spin until the counter's top bit
-    // differs from the value observed on arrival.
-    while (((old_arrive ^ counter.load()) & MSB32_MASK) == 0)
-      ;
-  }
-
-  // Hold the rest of the work-group until its leader leaves the spin loop.
-  item.barrier();
-}
-
-/// The logical-group is a logical collection of some work-items within a
-/// work-group.
-/// Note: Please make sure that the logical-group size is a power of 2 in the
-/// range [1, current_sub_group_size].
-template <int dimensions = 3> class logical_group {
-  sycl::nd_item<dimensions> _item;
-  sycl::group<dimensions> _g;
-  uint32_t _logical_group_size;
-  uint32_t _group_linear_range_in_parent;
-
-public:
-  /// Dividing \p parent_group into several logical-groups.
-  /// \param [in] item Current work-item.
-  /// \param [in] parent_group The group to be divided.
-  /// \param [in] size The logical-group size.
-  logical_group(sycl::nd_item<dimensions> item,
-                sycl::group<dimensions> parent_group, uint32_t size)
-      : _item(item), _g(parent_group), _logical_group_size(size) {
-    // Ceiling division: a trailing partial logical-group still counts.
-    _group_linear_range_in_parent =
-        (_g.get_local_linear_range() - 1) / _logical_group_size + 1;
-  }
-  /// Builds a logical-group from \p item alone, using the work-group of
-  /// \p item as the parent. Previously this constructor left the size and
-  /// range members uninitialized, so every accessor read indeterminate
-  /// values. It now describes a single logical-group that spans the whole
-  /// work-group.
-  /// NOTE(review): spanning the whole work-group is the chosen default;
-  /// confirm it matches the callers' expectations.
-  /// \param [in] item Current work-item.
-  logical_group(sycl::nd_item<dimensions> item)
-      : _item(item), _g(item.get_group()),
-        _logical_group_size(_g.get_local_linear_range()),
-        _group_linear_range_in_parent(1) {}
-  /// Returns the index of the work-item within the logical-group.
-  uint32_t get_local_linear_id() const {
-    return _item.get_local_linear_id() % _logical_group_size;
-  }
-  /// Returns the index of the logical-group in the parent group.
-  uint32_t get_group_linear_id() const {
-    return _item.get_local_linear_id() / _logical_group_size;
-  }
-  /// Returns the number of work-items in the logical-group. Only the last
-  /// logical-group can be smaller than the configured size, and only when the
-  /// parent size is not a multiple of it.
-  uint32_t get_local_linear_range() const {
-    if (_g.get_local_linear_range() % _logical_group_size == 0) {
-      return _logical_group_size;
-    }
-    uint32_t last_item_group_id =
-        _g.get_local_linear_range() / _logical_group_size;
-    uint32_t first_of_last_group = last_item_group_id * _logical_group_size;
-    if (_item.get_local_linear_id() >= first_of_last_group) {
-      // The caller sits in the trailing partial logical-group.
-      return _g.get_local_linear_range() - first_of_last_group;
-    } else {
-      return _logical_group_size;
-    }
-  }
-  /// Returns the number of logical-group in the parent group.
-  uint32_t get_group_linear_range() const {
-    return _group_linear_range_in_parent;
-  }
-};
-
-// The original source of the functions calculate_max_active_wg_per_xecore and
-// calculate_max_potential_wg were under the license below:
-//
-// Copyright (C) Intel Corporation
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-//
-/// This function is used for occupancy calculation, it computes the max active
-/// work-group number per Xe-Core. Ref to
-/// https://github.com/oneapi-src/oneAPI-samples/tree/master/Tools/GPU-Occupancy-Calculator
-/// \param [out] num_wg Active work-group number.
-/// \param [in] wg_size Work-group size.
-/// \param [in] slm_size Share local memory size.
-/// \param [in] sg_size Sub-group size.
-/// \param [in] used_barrier Whether barrier is used.
-/// \param [in] used_large_grf Whether large General Register File is used.
-/// \return If no error, returns 0.
-/// If \p wg_size exceeds the max work-group size, the max work-group size will
-/// be used instead of \p wg_size and returns -1.
-inline int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size,
-                                              int slm_size = 0,
-                                              int sg_size = 32,
-                                              bool used_barrier = false,
-                                              bool used_large_grf = false) {
-  int ret = 0;
-  const int slm_size_per_xe_core = 64 * 1024;
-  const int max_barrier_registers = 32;
-  syclcompat::device_ext &dev = syclcompat::get_current_device();
-
-  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
-  if (wg_size > max_wg_size) {
-    wg_size = max_wg_size;
-    ret = -1;
-  }
-
-  int num_threads_ss = 56;
-  int max_num_wg = 56;
-  if (dev.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice) &&
-      dev.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
-    auto eu_count =
-        dev.get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>();
-    auto threads_count =
-        dev.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
-    num_threads_ss = eu_count * threads_count;
-    max_num_wg = eu_count * threads_count;
-  }
-
-  if (used_barrier) {
-    max_num_wg = max_barrier_registers;
-  }
-
-  // Calculate num_wg_slm
-  int num_wg_slm = 0;
-  if (slm_size == 0) {
-    num_wg_slm = max_num_wg;
-  } else {
-    num_wg_slm = std::floor((float)slm_size_per_xe_core / slm_size);
-  }
-
-  // Calculate num_wg_threads
-  if (used_large_grf)
-    num_threads_ss = num_threads_ss / 2;
-  int num_threads = std::ceil((float)wg_size / sg_size);
-  int num_wg_threads = std::floor((float)num_threads_ss / num_threads);
-
-  // Calculate num_wg
-  *num_wg = std::min(num_wg_slm, num_wg_threads);
-  *num_wg = std::min(*num_wg, max_num_wg);
-  return ret;
-}
-
-/// This function is used for occupancy calculation, it computes the work-group
-/// number and the work-group size which achieves the maximum occupancy of the
-/// device potentially. Ref to
-/// https://github.com/oneapi-src/oneAPI-samples/tree/master/Tools/GPU-Occupancy-Calculator
-/// \param [out] num_wg Work-group number.
-/// \param [out] wg_size Work-group size.
-/// \param [in] max_wg_size_for_device_code The maximum working work-group size
-/// for current device code logic. Zero means no limitation.
-/// \param [in] slm_size Share local memory size.
-/// \param [in] sg_size Sub-group size.
-/// \param [in] used_barrier Whether barrier is used.
-/// \param [in] used_large_grf Whether large General Register File is used.
-/// \return Returns 0.
-inline int calculate_max_potential_wg(int *num_wg, int *wg_size,
-                                      int max_wg_size_for_device_code,
-                                      int slm_size = 0, int sg_size = 32,
-                                      bool used_barrier = false,
-                                      bool used_large_grf = false) {
-  sycl::device &dev = syclcompat::get_current_device();
-  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
-  if (max_wg_size_for_device_code == 0 ||
-      max_wg_size_for_device_code >= max_wg_size)
-    *wg_size = (int)max_wg_size;
-  else
-    *wg_size = max_wg_size_for_device_code;
-  calculate_max_active_wg_per_xecore(num_wg, *wg_size, slm_size, sg_size,
-                                     used_barrier, used_large_grf);
-  std::uint32_t num_ss = 1;
-  if (dev.has(sycl::aspect::ext_intel_gpu_slices) &&
-      dev.has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
-    num_ss =
-        dev.get_info<sycl::ext::intel::info::device::gpu_slices>() *
-        dev.get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
-  }
-  num_wg[0] = num_ss * num_wg[0];
-  return 0;
-}
-
-/// Supported group types
-enum class group_type { work_group, sub_group, logical_group, root_group };
-
-/// The group_base will dispatch the function call to the specific interface
-/// based on the group type.
-template <int dimensions = 3> class group_base {
-public:
-  group_base(sycl::nd_item<dimensions> item)
-      : nd_item(item), _logical_group(item) {}
-  ~group_base() {}
-  /// Returns the number of work-items in the group.
-  size_t get_local_linear_range() {
-    switch (type) {
-    case group_type::work_group:
-      return nd_item.get_group().get_local_linear_range();
-    case group_type::sub_group:
-      return nd_item.get_sub_group().get_local_linear_range();
-    case group_type::logical_group:
-      return _logical_group.get_local_linear_range();
-    default:
-      return -1; // Unkonwn group type
-    }
-  }
-  /// Returns the index of the work-item within the group.
-  size_t get_local_linear_id() {
-    switch (type) {
-    case group_type::work_group:
-      return nd_item.get_group().get_local_linear_id();
-    case group_type::sub_group:
-      return nd_item.get_sub_group().get_local_linear_id();
-    case group_type::logical_group:
-      return _logical_group.get_local_linear_id();
-    default:
-      return -1; // Unkonwn group type
-    }
-  }
-  /// Wait for all the elements within the group to complete their execution
-  /// before proceeding.
-  void barrier() {
-    switch (type) {
-    case group_type::work_group:
-      sycl::group_barrier(nd_item.get_group());
-      break;
-    case group_type::sub_group:
-    case group_type::logical_group:
-      sycl::group_barrier(nd_item.get_sub_group());
-      break;
-    default:
-      break;
-    }
-  }
-
-protected:
-  logical_group<dimensions> _logical_group;
-  sycl::nd_item<dimensions> nd_item;
-  group_type type;
-};
-
-/// Container type that can store supported group_types.
-template <typename GroupT, int dimensions = 3>
-class group : public group_base<dimensions> {
-  using group_base<dimensions>::type;
-  using group_base<dimensions>::logical_group;
-
-public:
-  group(GroupT g, sycl::nd_item<dimensions> item)
-      : group_base<dimensions>(item) {
-    if constexpr (std::is_same_v<GroupT, sycl::sub_group>) {
-      type = group_type::sub_group;
-    } else if constexpr (std::is_same_v<GroupT, sycl::group<dimensions>>) {
-      type = group_type::work_group;
-    } else if constexpr (std::is_same_v<
-                             GroupT, experimental::logical_group<dimensions>>) {
-      logical_group = g;
-      type = group_type::logical_group;
-    }
-  }
-};
-} // namespace experimental
-
-// Calculate the number of work-groups per compute unit
-// \tparam [in] KernelName SYCL kernel name to calculate for
-// \param [in] q SYCL queue used to execute kernel
-// \param [in] wg_dim3 dim3 representing work-group shape
-// \param [in] local_mem_size Local memory usage per work-group in bytes
-// \return size_t representing maximum work-groups per compute unit
-template <class KernelName>
-size_t max_active_work_groups_per_cu(
-    syclcompat::dim3 wg_dim3, size_t local_mem_size,
-    sycl::queue queue = syclcompat::get_default_queue()) {
-  namespace syclex = sycl::ext::oneapi::experimental;
-  // max_num_work_groups only supports range<3>
-  auto ctx = queue.get_context();
-  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
-  auto kernel = bundle.template get_kernel<KernelName>();
-  sycl::range<3> wg_range_3d(wg_dim3);
-  size_t max_wgs = kernel.template ext_oneapi_get_info<
-      syclex::info::kernel_queue_specific::max_num_work_groups>(queue, wg_range_3d,
-                                                                local_mem_size);
-  size_t max_compute_units =
-      queue.get_device().get_info<sycl::info::device::max_compute_units>();
-  // Spec dictates max_compute_units > 0, so no need to catch div 0
-  return max_wgs / max_compute_units;
-}
-
-// Calculate the number of work-groups per compute unit
-// \tparam [in] KernelName SYCL kernel name to calculate for
-// \tparam [in] RangeDim the dimension of the sycl::range
-// \param [in] q SYCL queue used to execute kernel
-// \param [in] wg_range SYCL work-group range
-// \param [in] local_mem_size Local memory usage per work-group in bytes
-// \return size_t representing maximum work-groups per compute unit
-template <class KernelName, int RangeDim>
-size_t max_active_work_groups_per_cu(
-    sycl::range<RangeDim> wg_range, size_t local_mem_size,
-    sycl::queue queue = syclcompat::get_default_queue()) {
-  return max_active_work_groups_per_cu<KernelName>(syclcompat::dim3(wg_range),
-                                                   local_mem_size, queue);
-}
-
-/// If x <= 2, then return a pointer to the default queue;
-/// otherwise, return x reinterpreted as a queue_ptr.
-inline queue_ptr int_as_queue_ptr(uintptr_t x) {
-  return x <= 2 ? detail::dev_mgr::instance().current_device().default_queue()
-                : reinterpret_cast<queue_ptr>(x);
-}
-
-template <int n_nondefault_params, int n_default_params, typename T>
-class args_selector;
-
-/// args_selector is a helper class for extracting arguments from an
-/// array of pointers to arguments or buffer of arguments to pass to a
-/// kernel function.
-///
-/// \param R(Ts...) The type of the kernel
-/// \param n_nondefault_params The number of nondefault parameters of the kernel
-/// (excluding parameters that like sycl::nd_item, etc.)
-/// \param n_default_params The number of default parameters of the kernel
-///
-/// Example usage:
-/// With the following kernel:
-///   void foo(sycl::float2 *x, int n, sycl::nd_item<3> item_ct1, float f=.1) {}
-/// and with the declaration:
-///   args_selector<2, 1, decltype(foo)> selector(kernelParams, extra);
-///   void* kernelParams[2 + 1] = { (void*)float2_var, int_var, float_var }
-/// we have:
-///   selector.get<0>() returns a reference to sycl::float*,
-///   selector.get<1>() returns a reference to int,
-///   selector.get<2>() returns a reference to float
-template <int n_nondefault_params, int n_default_params, typename R,
-          typename... Ts>
-class args_selector<n_nondefault_params, n_default_params, R(Ts...)> {
-private:
-  void **kernel_params;
-  char *args_buffer;
-
-  template <int i> static constexpr int account_for_default_params() {
-    constexpr int n_total_params = sizeof...(Ts);
-    if constexpr (i >= n_nondefault_params) {
-      return n_total_params - n_default_params + (i - n_nondefault_params);
-    } else {
-      return i;
-    }
-  }
-
-public:
-  /// Get the type of the ith argument of R(Ts...)
-  /// \param [in] i Index of parameter to get
-  /// \returns Type of ith parameter
-  template <int i>
-  using arg_type =
-      std::tuple_element_t<account_for_default_params<i>(), std::tuple<Ts...>>;
-
-private:
-  template <int i> static constexpr int get_offset() {
-    if constexpr (i == 0) {
-      // we can assume args_buffer is properly aligned to the
-      // first argument
-      return 0;
-    } else {
-      constexpr int prev_off = get_offset<i - 1>();
-      constexpr int prev_past_end = prev_off + sizeof(arg_type<i - 1>);
-      using T = arg_type<i>;
-      // is the past-the-end of the i-1st element properly aligned
-      // with the ith element's alignment?
-      if constexpr (prev_past_end % alignof(T) == 0) {
-        return prev_past_end;
-      }
-      // otherwise bump prev_past_end to match alignment
-      else {
-        return prev_past_end + (alignof(T) - (prev_past_end % alignof(T)));
-      }
-    }
-  }
-
-  static char *get_args_buffer(void **extra) {
-    if (!extra)
-      return nullptr;
-    for (; (std::size_t)*extra != 0; ++extra) {
-      if ((std::size_t)*extra == 1) {
-        return static_cast<char *>(*(extra + 1));
-      }
-    }
-    return nullptr;
-  }
-
-public:
-  /// If kernel_params is nonnull, then args_selector will
-  /// extract arguments from kernel_params. Otherwise, it
-  /// will extract them from extra.
-  /// \param [in] kernel_params Array of pointers to arguments
-  /// a or null pointer.
-  /// \param [in] extra Array containing pointer to argument buffer.
-  args_selector(void **kernel_params, void **extra)
-      : kernel_params(kernel_params), args_buffer(get_args_buffer(extra)) {}
-
-  /// Get a reference to the ith argument extracted from kernel_params
-  /// or extra.
-  /// \param [in] i Index of argument to get
-  /// \returns Reference to the ith argument
-  template <int i> arg_type<i> &get() {
-    if (kernel_params) {
-      return *static_cast<arg_type<i> *>(kernel_params[i]);
-    } else {
-      return *reinterpret_cast<arg_type<i> *>(args_buffer + get_offset<i>());
-    }
-  }
-};
-
-} // namespace syclcompat
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp
deleted file mode 100644
index 72d7be247b6d5..0000000000000
--- a/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  atomic_arith.cpp
- *
- *  Description:
- *    atomic operations API tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ Atomic.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: hip
-
-// XFAIL: target-native_cpu
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20142
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <cstddef>
-#include <type_traits>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/atomic.hpp>
-#include <syclcompat/device.hpp>
-#include <syclcompat/dims.hpp>
-#include <syclcompat/id_query.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "atomic_fixt.hpp"
-
-// Simple atomic kernels for testing
-// In every case we test two API overloads, one taking an explicit runtime
-// memory_order argument. We use `relaxed` in every case because these tests
-// are *not* checking the memory_order semantics, just the API.
-template <typename T1, typename T2>
-inline void atomic_fetch_add_kernel(T1 *data, T2 operand) {
-  syclcompat::atomic_fetch_add(data, operand);
-}
-template <typename T1, typename T2>
-inline void atomic_fetch_sub_kernel(T1 *data, T2 operand) {
-  syclcompat::atomic_fetch_sub(data, operand);
-}
-
-template <typename T> void test_atomic_arith() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-  constexpr T sum = static_cast<T>(grid.x * threads.x);
-  constexpr T init = static_cast<T>(0);
-  constexpr T operand = static_cast<T>(1);
-
-  AtomicLauncher<atomic_fetch_add_kernel<T, T>, T>(grid, threads)
-      .launch_test(init, sum, operand);
-  AtomicLauncher<atomic_fetch_sub_kernel<T, T>, T>(grid, threads)
-      .launch_test(sum, init, operand);
-}
-
-template <typename T> void test_atomic_ptr_arith() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  using ValType = std::remove_pointer_t<T>;
-
-  T init = (T)syclcompat::malloc(sizeof(ValType));
-  T final = init + (grid.x * threads.x);
-  constexpr std::ptrdiff_t operand = static_cast<std::ptrdiff_t>(1);
-
-  AtomicLauncher<atomic_fetch_add_kernel<T, std::ptrdiff_t>, T>(grid, threads)
-      .launch_test(init, final, operand);
-
-  AtomicLauncher<atomic_fetch_sub_kernel<T, std::ptrdiff_t>, T>(grid, threads)
-      .launch_test(final, init, operand);
-
-  syclcompat::free(init);
-}
-
-void test_atomic_arith_t1_t2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using data_t = float;
-  using operand_t = int;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-  constexpr data_t sum = static_cast<data_t>(grid.x * threads.x);
-  constexpr data_t init = static_cast<data_t>(0);
-  constexpr operand_t operand = static_cast<operand_t>(1);
-
-  AtomicLauncher<atomic_fetch_add_kernel<data_t, operand_t>, data_t>(grid,
-                                                                     threads)
-      .launch_test(init, sum, operand);
-  AtomicLauncher<atomic_fetch_sub_kernel<data_t, operand_t>, data_t>(grid,
-                                                                     threads)
-      .launch_test(sum, init, operand);
-}
-
-int main() {
-  INSTANTIATE_ALL_TYPES(atomic_value_type_list, test_atomic_arith);
-  INSTANTIATE_ALL_TYPES(atomic_ptr_type_list, test_atomic_ptr_arith);
-  test_atomic_arith_t1_t2();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_bitwise.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_bitwise.cpp
deleted file mode 100644
index d8d1a604cd405..0000000000000
--- a/sycl/test-e2e/syclcompat/atomic/atomic_bitwise.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  atomic_logic.cpp
- *
- *  Description:
- *    atomic operations API tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ Atomic.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// FIXME: This isn't entirely true, it's not supported in hardware without
-// seq_acq but the assertion is done at compile-time for AMDGPU, which causes CI
-// to fail. The same applies to each test within this directory
-// UNSUPPORTED: hip
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <type_traits>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/atomic.hpp>
-#include <syclcompat/device.hpp>
-#include <syclcompat/dims.hpp>
-#include <syclcompat/id_query.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "atomic_fixt.hpp"
-
-// Simple atomic kernels for testing
-// In every case we test two API overloads, one taking an explicit runtime
-// memory_order argument. We use `relaxed` in every case because these tests
-// are *not* checking the memory_order semantics, just the API.
-template <typename T1, typename T2>
-void atomic_fetch_and_kernel(T1 *data, T2 operand, T2 operand0) {
-  syclcompat::atomic_fetch_and(
-      data, (syclcompat::global_id::x() == 0 ? operand0 : operand));
-}
-template <typename T1, typename T2>
-void atomic_fetch_or_kernel(T1 *data, T2 operand, T2 operand0) {
-  syclcompat::atomic_fetch_or(
-      data, (syclcompat::global_id::x() == 0 ? operand0 : operand));
-}
-template <typename T1, typename T2>
-void atomic_fetch_xor_kernel(T1 *data, T2 operand, T2 operand0) {
-  syclcompat::atomic_fetch_xor(
-      data, (syclcompat::global_id::x() == 0 ? operand0 : operand));
-}
-
-template <typename T> void test_atomic_and() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  // All 0 -> 0
-  AtomicLauncher<atomic_fetch_and_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(0), static_cast<T>(0), static_cast<T>(0),
-                   static_cast<T>(0));
-
-  // All 1 -> 1
-  AtomicLauncher<atomic_fetch_and_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(1), static_cast<T>(1), static_cast<T>(1),
-                   static_cast<T>(1));
-  // Most 1, one 0 -> 0
-  AtomicLauncher<atomic_fetch_and_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(1), static_cast<T>(0), static_cast<T>(1),
-                   static_cast<T>(0));
-}
-
-template <typename T> void test_atomic_or() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  // All 0 -> 0
-  AtomicLauncher<atomic_fetch_or_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(0), static_cast<T>(0), static_cast<T>(0),
-                   static_cast<T>(0));
-  // All 1 -> 1
-  AtomicLauncher<atomic_fetch_or_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(1), static_cast<T>(1), static_cast<T>(1),
-                   static_cast<T>(1));
-  // Most 1, one 0 -> 1
-  AtomicLauncher<atomic_fetch_or_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(1), static_cast<T>(1), static_cast<T>(1),
-                   static_cast<T>(0));
-  // Init 1, all 0 -> 1
-  AtomicLauncher<atomic_fetch_or_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(1), static_cast<T>(1), static_cast<T>(0),
-                   static_cast<T>(0));
-}
-
-template <typename T> void test_atomic_xor() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{2}; // 2 threads, 3 values inc. init
-
-  // 000 -> 0
-  AtomicLauncher<atomic_fetch_xor_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(0), static_cast<T>(0), static_cast<T>(0),
-                   static_cast<T>(0));
-  // 111 -> 1
-  AtomicLauncher<atomic_fetch_xor_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(1), static_cast<T>(1), static_cast<T>(1),
-                   static_cast<T>(1));
-  // 110 -> 0
-  AtomicLauncher<atomic_fetch_xor_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(1), static_cast<T>(0), static_cast<T>(1),
-                   static_cast<T>(0));
-  // 010 -> 1
-  AtomicLauncher<atomic_fetch_xor_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(0), static_cast<T>(1), static_cast<T>(1),
-                   static_cast<T>(0));
-}
-
-void test_atomic_and_t1_t2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  using data_t = long;
-  using operand_t = unsigned int;
-
-  // All 0 -> 0
-  AtomicLauncher<atomic_fetch_and_kernel<data_t, operand_t>, data_t>(grid,
-                                                                     threads)
-      .launch_test(static_cast<data_t>(0), static_cast<data_t>(0),
-                   static_cast<operand_t>(0), static_cast<operand_t>(0));
-
-  // All 1 -> 1
-  AtomicLauncher<atomic_fetch_and_kernel<data_t, operand_t>, data_t>(grid,
-                                                                     threads)
-      .launch_test(static_cast<data_t>(1), static_cast<data_t>(1),
-                   static_cast<operand_t>(1), static_cast<operand_t>(1));
-  // Most 1, one 0 -> 0
-  AtomicLauncher<atomic_fetch_and_kernel<data_t, operand_t>, data_t>(grid,
-                                                                     threads)
-      .launch_test(static_cast<data_t>(1), static_cast<data_t>(0),
-                   static_cast<operand_t>(1), static_cast<operand_t>(0));
-}
-
-void test_atomic_or_t1_t2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  using data_t = long;
-  using operand_t = unsigned int;
-
-  // All 0 -> 0
-  AtomicLauncher<atomic_fetch_or_kernel<data_t, operand_t>, data_t>(grid,
-                                                                    threads)
-      .launch_test(static_cast<data_t>(0), static_cast<data_t>(0),
-                   static_cast<operand_t>(0), static_cast<operand_t>(0));
-  // All 1 -> 1
-  AtomicLauncher<atomic_fetch_or_kernel<data_t, operand_t>, data_t>(grid,
-                                                                    threads)
-      .launch_test(static_cast<data_t>(1), static_cast<data_t>(1),
-                   static_cast<operand_t>(1), static_cast<operand_t>(1));
-  // Most 1, one 0 -> 1
-  AtomicLauncher<atomic_fetch_or_kernel<data_t, operand_t>, data_t>(grid,
-                                                                    threads)
-      .launch_test(static_cast<data_t>(1), static_cast<data_t>(1),
-                   static_cast<operand_t>(1), static_cast<operand_t>(0));
-  // Init 1, all 0 -> 1
-  AtomicLauncher<atomic_fetch_or_kernel<data_t, operand_t>, data_t>(grid,
-                                                                    threads)
-      .launch_test(static_cast<data_t>(1), static_cast<data_t>(1),
-                   static_cast<operand_t>(0), static_cast<operand_t>(0));
-}
-
-void test_atomic_xor_t1_t2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{2}; // 2 threads, 3 values inc. init
-
-  using data_t = long;
-  using operand_t = unsigned int;
-
-  // 000 -> 0
-  AtomicLauncher<atomic_fetch_xor_kernel<data_t, operand_t>, data_t>(grid,
-                                                                     threads)
-      .launch_test(static_cast<data_t>(0), static_cast<data_t>(0),
-                   static_cast<operand_t>(0), static_cast<operand_t>(0));
-  // 111 -> 1
-  AtomicLauncher<atomic_fetch_xor_kernel<data_t, operand_t>, data_t>(grid,
-                                                                     threads)
-      .launch_test(static_cast<data_t>(1), static_cast<data_t>(1),
-                   static_cast<operand_t>(1), static_cast<operand_t>(1));
-  // 110 -> 0
-  AtomicLauncher<atomic_fetch_xor_kernel<data_t, operand_t>, data_t>(grid,
-                                                                     threads)
-      .launch_test(static_cast<data_t>(1), static_cast<data_t>(0),
-                   static_cast<operand_t>(1), static_cast<operand_t>(0));
-  // 010 -> 1
-  AtomicLauncher<atomic_fetch_xor_kernel<data_t, operand_t>, data_t>(grid,
-                                                                     threads)
-      .launch_test(static_cast<data_t>(0), static_cast<data_t>(1),
-                   static_cast<operand_t>(1), static_cast<operand_t>(0));
-}
-
-int main() {
-  INSTANTIATE_ALL_TYPES(integral_type_list, test_atomic_and);
-  INSTANTIATE_ALL_TYPES(integral_type_list, test_atomic_or);
-  INSTANTIATE_ALL_TYPES(integral_type_list, test_atomic_xor);
-
-  // Avoid combinatorial explosion by only testing the interface
-  test_atomic_and_t1_t2();
-  test_atomic_or_t1_t2();
-  test_atomic_xor_t1_t2();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp
deleted file mode 100644
index cf9f863d188a6..0000000000000
--- a/sycl/test-e2e/syclcompat/atomic/atomic_class.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  atomic_class.cpp
- *
- *  Description:
- *    atomic operations API tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ libcu_atomic.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-// UNSUPPORTED: target-nvidia,cuda
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/20109
-
-// UNSUPPORTED: target-amd || (windows && level_zero)
-
-// RUN: %{build} %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat/atomic.hpp>
-
-#include "../common.hpp"
-#include "atomic_fixt.hpp"
-
-#include <atomic>
-
-constexpr size_t numBlocks = 1;
-constexpr size_t numThreads = 1;
-constexpr size_t numData = 6;
-
-template <typename T, typename AtomicType>
-void atomic_ref_ptr(T *atom_arr_out, T *atom_arr_in) {
-  AtomicType a{nullptr};
-
-  a.store(atom_arr_in[0]);
-
-  atom_arr_out[0] = a.load();
-  atom_arr_out[1] = a.exchange(atom_arr_in[1]);
-  atom_arr_out[2] = a.load();
-  a.compare_exchange_weak(atom_arr_out[2], atom_arr_in[2]);
-  atom_arr_out[3] = a.load();
-  a.compare_exchange_strong(atom_arr_out[3], atom_arr_in[3]);
-  atom_arr_out[4] = a.fetch_add(static_cast<std::ptrdiff_t>(1));
-  atom_arr_out[5] = a.fetch_sub(static_cast<std::ptrdiff_t>(-1));
-}
-
-template <typename T> void atomic_ref_ptr_kernel(T *atom_arr, T *atom_arr_in) {
-  atomic_ref_ptr<T, syclcompat::atomic<T>>(atom_arr, atom_arr_in);
-}
-
-template <typename T> void atomic_ref_ptr_host(T *atom_arr, T *atom_arr_in) {
-  atomic_ref_ptr<T, std::atomic<T>>(atom_arr, atom_arr_in);
-}
-
-template <typename T> void test_atomic_class_ptr() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  AtomicClassPtrTypeLauncher<T>(numBlocks, numThreads, numData)
-      .template launch_test<atomic_ref_ptr_kernel<T>, atomic_ref_ptr_host<T>>();
-}
-
-template <typename T, typename AtomicType> void atomic_ref_value(T *atom_arr) {
-  AtomicType a{static_cast<T>(0)};
-  T temp1 = static_cast<T>(3);
-  T temp2 = static_cast<T>(4);
-
-  a.store(static_cast<T>(1));
-
-  atom_arr[0] = a.load();
-  atom_arr[1] = a.exchange(static_cast<T>(3));
-  atom_arr[2] = a.load();
-  a.compare_exchange_weak(temp1, static_cast<T>(4));
-  atom_arr[3] = a.load();
-  a.compare_exchange_strong(temp2, static_cast<T>(8));
-  atom_arr[4] = a.fetch_add(static_cast<T>(1));
-  atom_arr[5] = a.fetch_sub(static_cast<T>(-1));
-}
-
-template <typename T> void atomic_ref_value_kernel(T *atom_arr) {
-  atomic_ref_value<T, syclcompat::atomic<T>>(atom_arr);
-}
-
-template <typename T> void atomic_ref_value_host(T *atom_arr) {
-  // atomic RMW operations for floating point in std is C++20 and may
-  // not be implemented
-  if constexpr (std::is_integral_v<T>)
-    atomic_ref_value<T, std::atomic<T>>(atom_arr);
-  else
-    atomic_ref_value<T, syclcompat::atomic<T>>(atom_arr);
-}
-
-template <typename T> void test_atomic_class_value() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  AtomicClassLauncher<T>(numBlocks, numThreads, numData)
-      .template launch_test<atomic_ref_value_kernel<T>,
-                            atomic_ref_value_host<T>>();
-}
-
-void test_default_constructor() { syclcompat::atomic<int> default_constructor; }
-
-int main() {
-  std::vector<sycl::memory_order> supported_memory_orders =
-      syclcompat::get_default_queue()
-          .get_device()
-          .get_info<sycl::info::device::atomic_memory_order_capabilities>();
-
-  if (is_supported(supported_memory_orders, sycl::memory_order::seq_cst)) {
-    test_default_constructor();
-
-    INSTANTIATE_ALL_TYPES(atomic_value_type_list, test_atomic_class_value);
-    INSTANTIATE_ALL_TYPES(atomic_ptr_type_list, test_atomic_class_ptr);
-  }
-}
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp
deleted file mode 100644
index b801995fe0cdd..0000000000000
--- a/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  atomic_comp_exchange.cpp
- *
- *  Description:
- *    atomic operations API tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ Atomic.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: hip
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <type_traits>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/atomic.hpp>
-#include <syclcompat/device.hpp>
-#include <syclcompat/dims.hpp>
-#include <syclcompat/id_query.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "atomic_fixt.hpp"
-
-// Simple atomic kernels for testing
-// In every case we test two API overloads, one taking an explicit runtime
-// memory_order argument. We use `relaxed` in every case because these tests
-// are *not* checking the memory_order semantics, just the API.
-template <typename T>
-inline void atomic_fetch_compare_inc_kernel(T *data, T operand) {
-  syclcompat::atomic_fetch_compare_inc(data, operand);
-}
-template <typename T>
-inline void atomic_fetch_compare_dec_kernel(T *data, T operand) {
-  syclcompat::atomic_fetch_compare_dec(data, operand);
-}
-template <typename T1, typename T2>
-inline void atomic_exchange_kernel(T1 *data, T2 operand) {
-  syclcompat::atomic_exchange(data, operand);
-}
-template <typename T1, typename T2, typename T3>
-inline void atomic_compare_exchange_strong_kernel(T1 *data, T2 expected,
-                                                  T3 desired) {
-  syclcompat::atomic_compare_exchange_strong(data, expected, desired);
-}
-
-void test_atomic_comp_inc() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{6};
-
-  AtomicLauncher<atomic_fetch_compare_inc_kernel<unsigned int>, unsigned int>(
-      grid, threads)
-      .launch_test(0, 6, 6);
-  AtomicLauncher<atomic_fetch_compare_inc_kernel<unsigned int>, unsigned int>(
-      grid, threads)
-      .launch_test(1, 0, 6);
-}
-
-void test_atomic_comp_dec() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{6};
-
-  AtomicLauncher<atomic_fetch_compare_dec_kernel<unsigned int>, unsigned int>(
-      grid, threads)
-      .launch_test(6, 0, 0);
-  AtomicLauncher<atomic_fetch_compare_dec_kernel<unsigned int>, unsigned int>(
-      grid, threads)
-      .launch_test(0, 6, 11);
-}
-
-template <typename T> void test_atomic_exch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  AtomicLauncher<atomic_exchange_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(0), static_cast<T>(1), static_cast<T>(1));
-  AtomicLauncher<atomic_exchange_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(0), static_cast<T>(0), static_cast<T>(0));
-}
-
-template <typename T> void test_atomic_ptr_exch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  using ValType = std::remove_pointer_t<T>;
-  T ptr1 = (T)syclcompat::malloc(sizeof(ValType));
-  T ptr2 = (T)syclcompat::malloc(sizeof(ValType));
-
-  AtomicLauncher<atomic_exchange_kernel<T, T>, T>(grid, threads)
-      .launch_test(ptr1, ptr2, ptr2);
-  AtomicLauncher<atomic_exchange_kernel<T, T>, T>(grid, threads)
-      .launch_test(ptr1, ptr1, ptr1);
-  syclcompat::free(ptr1);
-  syclcompat::free(ptr2);
-}
-
-template <typename T> void test_atomic_exch_strong() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  AtomicLauncher<atomic_compare_exchange_strong_kernel<T, T, T>, T>(grid,
-                                                                    threads)
-      .launch_test(static_cast<T>(0), static_cast<T>(1), static_cast<T>(0),
-                   static_cast<T>(1));
-  AtomicLauncher<atomic_compare_exchange_strong_kernel<T, T, T>, T>(grid,
-                                                                    threads)
-      .launch_test(static_cast<T>(0), static_cast<T>(0), static_cast<T>(1),
-                   static_cast<T>(2));
-}
-
-template <typename T> void test_atomic_ptr_exch_strong() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  using ValType = std::remove_pointer_t<T>;
-  T ptr1 = (T)syclcompat::malloc(sizeof(ValType));
-  T ptr2 = (T)syclcompat::malloc(sizeof(ValType));
-  T ptr3 = (T)syclcompat::malloc(sizeof(ValType));
-
-  AtomicLauncher<atomic_compare_exchange_strong_kernel<T, T, T>, T>(grid,
-                                                                    threads)
-      .launch_test(ptr1, ptr2, ptr1, ptr2);
-  AtomicLauncher<atomic_compare_exchange_strong_kernel<T, T, T>, T>(grid,
-                                                                    threads)
-      .launch_test(ptr1, ptr1, ptr2, ptr3);
-  syclcompat::free(ptr1);
-  syclcompat::free(ptr2);
-  syclcompat::free(ptr3);
-}
-
-void test_atomic_exch_t1_t2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  AtomicLauncher<atomic_exchange_kernel<float, int>, float>(grid, threads)
-      .launch_test(static_cast<float>(0), static_cast<float>(1),
-                   static_cast<int>(1));
-  AtomicLauncher<atomic_exchange_kernel<float, int>, float>(grid, threads)
-      .launch_test(static_cast<float>(0), static_cast<float>(0),
-                   static_cast<int>(0));
-}
-
-void test_atomic_exch_strong_t1_t2_t3() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  AtomicLauncher<atomic_compare_exchange_strong_kernel<float, int, unsigned>,
-                 float>(grid, threads)
-      .launch_test(static_cast<float>(0), static_cast<float>(1),
-                   static_cast<int>(0), static_cast<unsigned>(1));
-  AtomicLauncher<atomic_compare_exchange_strong_kernel<float, int, unsigned>,
-                 float>(grid, threads)
-      .launch_test(static_cast<float>(0), static_cast<float>(0),
-                   static_cast<int>(1), static_cast<unsigned>(2));
-}
-
-int main() {
-  INSTANTIATE_ALL_TYPES(atomic_value_type_list, test_atomic_exch);
-  INSTANTIATE_ALL_TYPES(atomic_value_type_list, test_atomic_exch_strong);
-
-  INSTANTIATE_ALL_TYPES(atomic_ptr_type_list, test_atomic_ptr_exch);
-  INSTANTIATE_ALL_TYPES(atomic_ptr_type_list, test_atomic_ptr_exch_strong);
-
-  test_atomic_comp_inc();
-  test_atomic_comp_dec();
-  test_atomic_exch_t1_t2();
-  test_atomic_exch_strong_t1_t2_t3();
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_fixt.hpp b/sycl/test-e2e/syclcompat/atomic/atomic_fixt.hpp
deleted file mode 100644
index 4d34db9cfa102..0000000000000
--- a/sycl/test-e2e/syclcompat/atomic/atomic_fixt.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  atomic_fixt.hpp
- *
- *  Description:
- *    Memory Order helper for the Atomic functionality tests
- **************************************************************************/
-
-#pragma once
-
-#include <algorithm>
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat.hpp>
-
-using atomic_value_type_list =
-    std::tuple<int, unsigned int, long, unsigned long, long long,
-               unsigned long long, float, double>;
-
-using atomic_ptr_type_list =
-    std::tuple<int *, long *, long long *, float *, double *>;
-
-using integral_type_list = std::tuple<int, unsigned int, long, unsigned long,
-                                      long long, unsigned long long>;
-
-using signed_type_list = std::tuple<int, long, long long, float, double>;
-
-bool is_supported(std::vector<sycl::memory_order> capabilities,
-                  sycl::memory_order mem_order) {
-  return std::find(capabilities.begin(), capabilities.end(), mem_order) !=
-         capabilities.end();
-}
-
-template <typename T> bool should_skip(const sycl::device &dev) {
-  if constexpr (sizeof(T) == 8) {
-    if (!dev.has(sycl::aspect::atomic64)) {
-      return true;
-    }
-  }
-  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, double *>) {
-    if (!dev.has(sycl::aspect::fp64)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-template <auto F, typename T> class AtomicLauncher {
-protected:
-  syclcompat::dim3 grid_;
-  syclcompat::dim3 threads_;
-  T *data_;
-  sycl::queue q_;
-  bool skip_;
-
-public:
-  AtomicLauncher(syclcompat::dim3 grid, syclcompat::dim3 threads,
-                 sycl::queue q = syclcompat::get_default_queue())
-      : grid_{grid}, threads_{threads}, q_{q},
-        skip_{should_skip<T>(q.get_device())} {
-    data_ = (T *)syclcompat::malloc(sizeof(T), q_);
-  };
-  ~AtomicLauncher() { syclcompat::free(data_); }
-  template <typename... Args>
-  void launch_test(T init_val, T expected_result, Args... args) {
-    if (skip_)
-      return;
-
-    syclcompat::memcpy(data_, &init_val, sizeof(T), q_);
-    syclcompat::launch<F>(grid_, threads_, q_, data_, args...);
-    T result_val;
-    syclcompat::memcpy(&result_val, data_, sizeof(T), q_);
-    syclcompat::wait();
-    assert(result_val == expected_result);
-  }
-};
-
-template <typename T> class AtomicClassLauncher {
-protected:
-  syclcompat::dim3 grid_;
-  syclcompat::dim3 threads_;
-  size_t data_len_;
-  T *atom_arr_device_;
-  T *atom_arr_host_;
-  bool skip_;
-
-  void verify() {
-    bool result = true;
-    for (int i = 0; i < data_len_; ++i) {
-      if (atom_arr_device_[i] != atom_arr_host_[i]) {
-        std::cout << "-- Failure at " << i << std::endl << std::flush;
-        result = false;
-      }
-    }
-    assert(result);
-  }
-
-public:
-  AtomicClassLauncher(const syclcompat::dim3 &grid,
-                      const syclcompat::dim3 &threads, const size_t data_len)
-      : grid_{grid}, threads_{threads}, data_len_{data_len},
-        skip_{should_skip<T>(syclcompat::get_current_device())} {
-    atom_arr_device_ = syclcompat::malloc_shared<T>(data_len_);
-    atom_arr_host_ = syclcompat::malloc_shared<T>(data_len_);
-
-    for (size_t i = 0; i < data_len_; i++) {
-      atom_arr_device_[i] = 0;
-      atom_arr_host_[i] = 0;
-    }
-  };
-  virtual ~AtomicClassLauncher() {
-    syclcompat::free(atom_arr_device_);
-    syclcompat::free(atom_arr_host_);
-  }
-
-  template <auto Kernel, auto HostFunc> void launch_test() {
-    if (skip_)
-      return; // skip
-    syclcompat::launch<Kernel>(grid_, threads_, atom_arr_device_);
-    syclcompat::wait();
-    HostFunc(atom_arr_host_);
-
-    verify();
-  }
-};
-
-template <typename T>
-class AtomicClassPtrTypeLauncher : public AtomicClassLauncher<T> {
-protected:
-  using ValType = std::remove_pointer_t<T>;
-
-  T *atom_arr_shared_in_;
-
-public:
-  AtomicClassPtrTypeLauncher(const syclcompat::dim3 &grid,
-                             const syclcompat::dim3 &threads,
-                             const size_t data_len)
-      : AtomicClassLauncher<T>(grid, threads, data_len) {
-
-    atom_arr_shared_in_ = syclcompat::malloc_shared<T>(this->data_len_);
-
-    for (size_t i = 0; i < this->data_len_; i++) {
-      atom_arr_shared_in_[i] = syclcompat::malloc_shared<ValType>(1);
-    }
-  };
-
-  virtual ~AtomicClassPtrTypeLauncher() {
-    for (size_t i = 0; i < this->data_len_; i++) {
-      syclcompat::free(atom_arr_shared_in_[i]);
-    }
-    syclcompat::free(atom_arr_shared_in_);
-  }
-
-  template <auto Kernel, auto HostFunc> void launch_test() {
-    if (this->skip_)
-      return;
-    syclcompat::launch<Kernel>(this->grid_, this->threads_,
-                               this->atom_arr_device_, atom_arr_shared_in_);
-    syclcompat::wait();
-    HostFunc(this->atom_arr_host_, atom_arr_shared_in_);
-
-    this->verify();
-  }
-};
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp
deleted file mode 100644
index 765440e4175ff..0000000000000
--- a/sycl/test-e2e/syclcompat/atomic/atomic_memory_acq_rel.cpp
+++ /dev/null
@@ -1,283 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  atomic_memory_acq_rel.cpp
- *
- *  Description:
- *    Tests fetch_add for acquire and release memory ordering
- **************************************************************************/
-
-// The original source was under the license below:
-// ====-------------------------------------------------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: target-amd
-
-// RUN: %{build} %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %} -o %t.out
-// RUN: %{run} %t.out
-
-#include <iostream>
-#include <numeric>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/atomic.hpp>
-
-#include "atomic_fixt.hpp"
-
-using namespace sycl;
-
-using address_space = sycl::access::address_space;
-
-template <memory_order order> void test_acquire_global() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  const size_t N_items = 256;
-  const size_t N_iters = 1000;
-
-  int error = 0;
-  int val[] = {0, 0};
-
-  queue q;
-  {
-    buffer<int> error_buf(&error, 1);
-    buffer<int> val_buf(val, 2);
-
-    q.submit([&](handler &cgh) {
-       auto error =
-           error_buf.template get_access<access::mode::read_write>(cgh);
-       auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-       cgh.parallel_for(range<1>(N_items), [=](item<1> it) {
-         volatile int *val_p =
-             val.get_multi_ptr<sycl::access::decorated::no>().get();
-         auto atm0 =
-             atomic_ref<int, memory_order::relaxed, memory_scope::device,
-                        address_space::global_space>(val[0]);
-         auto atm1 =
-             atomic_ref<int, memory_order::relaxed, memory_scope::device,
-                        address_space::global_space>(val[1]);
-         for (int i = 0; i < N_iters; i++) {
-           if (it.get_id(0) == 0) {
-
-             syclcompat::atomic_fetch_add<address_space::global_space, order>(
-                 &val[0], 1);
-             val_p[1]++;
-           } else {
-             // syclcompat:: doesn't offer load/store so using sycl::atomic_ref
-             // here
-             int tmp1 = atm1.load(memory_order::acquire);
-             int tmp0 = atm0.load(memory_order::relaxed);
-             if (tmp0 < tmp1) {
-               error[0] = 1;
-             }
-           }
-         }
-       });
-     }).wait_and_throw();
-  }
-  assert(error == 0);
-}
-
-template <memory_order order> void test_acquire_local() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  const size_t local_size = 256;
-  const size_t N_wgs = 16;
-  const size_t global_size = local_size * N_wgs;
-  const size_t N_iters = 1000;
-
-  int error = 0;
-  int val[] = {0, 0};
-
-  queue q;
-  {
-    buffer<int> error_buf(&error, 1);
-    buffer<int> val_buf(val, 2);
-
-    q.submit([&](handler &cgh) {
-       auto error =
-           error_buf.template get_access<access::mode::read_write>(cgh);
-       local_accessor<int, 1> val(2, cgh);
-       cgh.parallel_for(nd_range<1>(global_size, local_size), [=](nd_item<1>
-                                                                      it) {
-         size_t lid = it.get_local_id(0);
-         val[0] = 0;
-         val[1] = 0;
-         it.barrier(access::fence_space::local_space);
-         volatile int *val_p =
-             val.get_multi_ptr<sycl::access::decorated::no>().get();
-         auto atm0 =
-             atomic_ref<int, memory_order::relaxed, memory_scope::device,
-                        address_space::local_space>(val[0]);
-         auto atm1 =
-             atomic_ref<int, memory_order::relaxed, memory_scope::device,
-                        address_space::local_space>(val[1]);
-         for (int i = 0; i < N_iters; i++) {
-           if (it.get_local_id(0) == 0) {
-             syclcompat::atomic_fetch_add<address_space::local_space, order>(
-                 &val[0], 1);
-             val_p[1]++;
-           } else {
-             // syclcompat:: doesn't offer load/store so using
-             // sycl::atomic_ref here
-             int tmp1 = atm1.load(memory_order::acquire);
-             int tmp0 = atm0.load(memory_order::relaxed);
-             if (tmp0 < tmp1) {
-               error[0] = 1;
-             }
-           }
-         }
-       });
-     }).wait_and_throw();
-  }
-  assert(error == 0);
-}
-
-template <memory_order order> void test_release_global() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  const size_t N_items = 256;
-  const size_t N_iters = 1000;
-
-  int error = 0;
-  int val[] = {0, 0};
-
-  queue q;
-  {
-    buffer<int> error_buf(&error, 1);
-    buffer<int> val_buf(val, 2);
-
-    q.submit([&](handler &cgh) {
-       auto error =
-           error_buf.template get_access<access::mode::read_write>(cgh);
-       auto val = val_buf.template get_access<access::mode::read_write>(cgh);
-       cgh.parallel_for(range<1>(N_items), [=](item<1> it) {
-         volatile int *val_p =
-             val.get_multi_ptr<sycl::access::decorated::no>().get();
-         auto atm0 =
-             atomic_ref<int, memory_order::relaxed, memory_scope::device,
-                        address_space::global_space>(val[0]);
-         auto atm1 =
-             atomic_ref<int, memory_order::relaxed, memory_scope::device,
-                        address_space::global_space>(val[1]);
-         for (int i = 0; i < N_iters; i++) {
-           if (it.get_id(0) == 0) {
-             val_p[0]++;
-             syclcompat::atomic_fetch_add<address_space::global_space, order>(
-                 &val[1], 1);
-           } else {
-             // syclcompat:: doesn't offer load/store so using sycl::atomic_ref
-             // here
-             int tmp1 = atm1.load(memory_order::acquire);
-             int tmp0 = atm0.load(memory_order::relaxed);
-             if (tmp0 < tmp1) {
-               error[0] = 1;
-             }
-           }
-         }
-       });
-     }).wait_and_throw();
-  }
-  assert(error == 0);
-}
-
-template <memory_order order> void test_release_local() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  const size_t local_size = 256;
-  const size_t N_wgs = 16;
-  const size_t global_size = local_size * N_wgs;
-  const size_t N_iters = 1000;
-
-  int error = 0;
-  int val[] = {0, 0};
-
-  queue q;
-  {
-    buffer<int> error_buf(&error, 1);
-    buffer<int> val_buf(val, 2);
-
-    q.submit([&](handler &cgh) {
-       auto error =
-           error_buf.template get_access<access::mode::read_write>(cgh);
-       local_accessor<int, 1> val(2, cgh);
-       cgh.parallel_for(nd_range<1>(global_size, local_size), [=](nd_item<1>
-                                                                      it) {
-         size_t lid = it.get_local_id(0);
-         val[0] = 0;
-         val[1] = 0;
-         it.barrier(access::fence_space::local_space);
-         volatile int *val_p =
-             val.get_multi_ptr<sycl::access::decorated::no>().get();
-         auto atm0 =
-             atomic_ref<int, memory_order::relaxed, memory_scope::device,
-                        address_space::local_space>(val[0]);
-         auto atm1 =
-             atomic_ref<int, memory_order::relaxed, memory_scope::device,
-                        address_space::local_space>(val[1]);
-         for (int i = 0; i < N_iters; i++) {
-           if (it.get_local_id(0) == 0) {
-             val_p[0]++;
-             syclcompat::atomic_fetch_add<address_space::local_space, order>(
-                 &val[1], 1);
-           } else {
-             // syclcompat:: doesn't offer load/store so using
-             // sycl::atomic_ref here
-             int tmp1 = atm1.load(memory_order::acquire);
-             int tmp0 = atm0.load(memory_order::relaxed);
-             if (tmp0 < tmp1) {
-               error[0] = 1;
-             }
-           }
-         }
-       });
-     }).wait_and_throw();
-  }
-  assert(error == 0);
-}
-
-int main() {
-  queue q;
-  std::vector<memory_order> supported_memory_orders =
-      q.get_device()
-          .get_info<sycl::info::device::atomic_memory_order_capabilities>();
-
-  if (is_supported(supported_memory_orders, memory_order::acq_rel)) {
-    // Acquire-release memory order must also support both acquire and release
-    // orderings.
-    assert(is_supported(supported_memory_orders, memory_order::acquire) &&
-           is_supported(supported_memory_orders, memory_order::release));
-    test_acquire_global<memory_order::acq_rel>();
-    test_acquire_local<memory_order::acq_rel>();
-    test_release_global<memory_order::acq_rel>();
-    test_release_local<memory_order::acq_rel>();
-  }
-
-  if (is_supported(supported_memory_orders, memory_order::seq_cst)) {
-    test_acquire_global<memory_order::seq_cst>();
-    test_acquire_local<memory_order::seq_cst>();
-    test_release_global<memory_order::seq_cst>();
-    test_release_local<memory_order::seq_cst>();
-  }
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_minmax.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_minmax.cpp
deleted file mode 100644
index fcf207d47bb0c..0000000000000
--- a/sycl/test-e2e/syclcompat/atomic/atomic_minmax.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  atomic_minmax.cpp
- *
- *  Description:
- *    atomic operations API tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ Atomic.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: hip
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <type_traits>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/atomic.hpp>
-#include <syclcompat/device.hpp>
-#include <syclcompat/dims.hpp>
-#include <syclcompat/id_query.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "atomic_fixt.hpp"
-
-// Simple atomic kernels for testing
-// In every case we test two API overloads, one taking an explicit runtime
-// memory_order argument. We use `relaxed` in every case because these tests
-// are *not* checking the memory_order semantics, just the API.
-template <typename T1, typename T2>
-inline void atomic_fetch_min_kernel(T1 *data, T2 operand, T2 operand0) {
-  syclcompat::atomic_fetch_min(
-      data, (syclcompat::global_id::x() == 0 ? operand0 : operand));
-}
-template <typename T1, typename T2>
-inline void atomic_fetch_max_kernel(T1 *data, T2 operand, T2 operand0) {
-  syclcompat::atomic_fetch_max(
-      data, (syclcompat::global_id::x() == 0 ? operand0 : operand));
-}
-
-template <typename T> void test_atomic_minmax() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  AtomicLauncher<atomic_fetch_min_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(100), static_cast<T>(1), static_cast<T>(200),
-                   static_cast<T>(1));
-  AtomicLauncher<atomic_fetch_max_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(100), static_cast<T>(200),
-                   static_cast<T>(200), static_cast<T>(1));
-}
-
-template <typename T> void test_signed_atomic_minmax() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  AtomicLauncher<atomic_fetch_min_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(-1), static_cast<T>(-4), static_cast<T>(-4),
-                   static_cast<T>(100));
-  AtomicLauncher<atomic_fetch_max_kernel<T, T>, T>(grid, threads)
-      .launch_test(static_cast<T>(-40), static_cast<T>(-30),
-                   static_cast<T>(-30), static_cast<T>(-100));
-}
-
-void test_signed_atomic_minmax_t1_t2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  AtomicLauncher<atomic_fetch_min_kernel<float, int>, float>(grid, threads)
-      .launch_test(static_cast<float>(-1), static_cast<float>(-4),
-                   static_cast<int>(-4), static_cast<int>(100));
-  AtomicLauncher<atomic_fetch_max_kernel<float, int>, float>(grid, threads)
-      .launch_test(static_cast<float>(-40), static_cast<float>(-30),
-                   static_cast<int>(-30), static_cast<int>(-100));
-}
-
-int main() {
-  INSTANTIATE_ALL_TYPES(atomic_value_type_list, test_atomic_minmax);
-  INSTANTIATE_ALL_TYPES(signed_type_list, test_signed_atomic_minmax);
-  test_signed_atomic_minmax_t1_t2();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/atomic/atomics_verification_usmnone.cpp b/sycl/test-e2e/syclcompat/atomic/atomics_verification_usmnone.cpp
deleted file mode 100644
index 5fa6e48a49cab..0000000000000
--- a/sycl/test-e2e/syclcompat/atomic/atomics_verification_usmnone.cpp
+++ /dev/null
@@ -1,310 +0,0 @@
-// ====------ atomics_noneusm_verification.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: target-native_cpu
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20142
-
-#define SYCLCOMPAT_USM_LEVEL_NONE
-#include <sycl/detail/core.hpp>
-#include <cstdio>
-#include <ctime>
-#include <syclcompat/syclcompat.hpp>
-#include <math.h>
-#include <stdint.h>
-
-#define min(a, b) (a) < (b) ? (a) : (b)
-#define max(a, b) (a) > (b) ? (a) : (b)
-
-#define LOOP_NUM 5
-
-void atomicKernel(int *atom_arr, sycl::nd_item<3> item_ct1) {
-  unsigned int tid = item_ct1.get_local_range().get(2) * item_ct1.get_group(2) +
-                     item_ct1.get_local_id(2);
-
-  for (int i = 0; i < LOOP_NUM; i++) {
-    // Atomic addition
-    syclcompat::atomic_fetch_add(&atom_arr[0], 10);
-
-    // Atomic exchange
-    syclcompat::atomic_exchange(&atom_arr[1], (int)tid);
-
-    // Atomic maximum
-    syclcompat::atomic_fetch_max(&atom_arr[2], (int)tid);
-
-    // Atomic minimum
-    syclcompat::atomic_fetch_min(&atom_arr[3], (int)tid);
-
-    // Atomic increment (modulo 17+1)
-    syclcompat::atomic_fetch_compare_inc((unsigned int *)&atom_arr[4],
-                                   (unsigned int)17);
-
-    // Atomic compare-and-swap
-    syclcompat::atomic_compare_exchange_strong(&atom_arr[6], (int)(tid - 1),
-                                         (int)tid);
-
-    // Bitwise atomic instructions
-
-    // Atomic AND
-    syclcompat::atomic_fetch_and(&atom_arr[7], (int)(2 * tid + 7));
-
-    // Atomic OR
-    syclcompat::atomic_fetch_or(&atom_arr[8], 1 << tid);
-
-    // Atomic XOR
-    syclcompat::atomic_fetch_xor(&atom_arr[9], (int)tid);
-  }
-}
-
-void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
-
-  for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
-
-    for (int j = 0; j < LOOP_NUM; j++) {
-      // Atomic addition
-      __sync_fetch_and_add(&atom_arr[0], 10);
-
-      // Atomic exchange
-      __sync_lock_test_and_set(&atom_arr[1], i);
-
-      // Atomic maximum
-      int old, expected;
-      do {
-        expected = atom_arr[2];
-        old = __sync_val_compare_and_swap(&atom_arr[2], expected,
-                                          max(expected, i));
-      } while (old != expected);
-
-      // Atomic minimum
-      do {
-        expected = atom_arr[3];
-        old = __sync_val_compare_and_swap(&atom_arr[3], expected,
-                                          min(expected, i));
-      } while (old != expected);
-
-      // Atomic increment (modulo 17+1)
-      int limit = 17;
-      do {
-        expected = atom_arr[4];
-        old = __sync_val_compare_and_swap(
-            &atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
-      } while (old != expected);
-
-      // Atomic decrement
-      limit = 137;
-      do {
-        expected = atom_arr[5];
-        old = __sync_val_compare_and_swap(
-            &atom_arr[5], expected,
-            ((expected == 0) || (expected > limit)) ? limit : expected - 1);
-      } while (old != expected);
-
-      // Atomic compare-and-swap
-      __sync_val_compare_and_swap(&atom_arr[6], i - 1, i);
-
-      // Bitwise atomic instructions
-
-      // Atomic AND
-      __sync_fetch_and_and(&atom_arr[7], 2 * i + 7);
-
-      // Atomic OR
-      __sync_fetch_and_or(&atom_arr[8], 1 << i);
-
-      // Atomic XOR
-      // 11th element should be 0xff
-      __sync_fetch_and_xor(&atom_arr[9], i);
-    }
-  }
-}
-
-int verify(int *testData, const int len) {
-  int val = 0;
-
-  for (int i = 0; i < len * LOOP_NUM; ++i) {
-    val += 10;
-  }
-
-  if (val != testData[0]) {
-    printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
-    return false;
-  }
-
-  val = 0;
-
-  bool found = false;
-
-  for (int i = 0; i < len; ++i) {
-    // second element should be a member of [0, len)
-    if (i == testData[1]) {
-      found = true;
-      break;
-    }
-  }
-
-  if (!found) {
-    printf("atomicExch failed\n");
-    return false;
-  }
-
-  val = -(1 << 8);
-
-  for (int i = 0; i < len; ++i) {
-    // third element should be len-1
-    val = max(val, i);
-  }
-
-  if (val != testData[2]) {
-    printf("atomicMax failed\n");
-    return false;
-  }
-
-  val = 1 << 8;
-
-  for (int i = 0; i < len; ++i) {
-    val = min(val, i);
-  }
-
-  if (val != testData[3]) {
-    printf("atomicMin failed\n");
-    return false;
-  }
-
-  int limit = 17;
-  val = 0;
-
-  for (int i = 0; i < len * LOOP_NUM; ++i) {
-    val = (val >= limit) ? 0 : val + 1;
-  }
-
-  if (val != testData[4]) {
-    printf("atomicInc failed\n");
-    return false;
-  }
-
-  limit = 137;
-  val = 0;
-
-  for (int i = 0; i < len * LOOP_NUM; ++i) {
-    val = ((val == 0) || (val > limit)) ? limit : val - 1;
-  }
-
-  found = false;
-
-  for (int i = 0; i < len; ++i) {
-    // seventh element should be a member of [0, len)
-    if (i == testData[6]) {
-      found = true;
-      break;
-    }
-  }
-
-  if (!found) {
-    printf("atomicCAS failed\n");
-    return false;
-  }
-
-  val = 0xff;
-
-  for (int i = 0; i < len; ++i) {
-    // 8th element should be 1
-    val &= (2 * i + 7);
-  }
-
-  if (val != testData[7]) {
-    printf("atomicAnd failed\n");
-    return false;
-  }
-
-  val = 0;
-
-  for (int i = 0; i < len; ++i) {
-    // 9th element should be 0xff
-    val |= (1 << i);
-  }
-
-  if (val != testData[8]) {
-    printf("atomicOr failed\n");
-    return false;
-  }
-
-  val = 0xff;
-
-  for (int i = 0; i < len; ++i) {
-    // 11th element should be 0xff
-    val ^= i;
-  }
-
-  if (val != testData[9]) {
-    printf("atomicXor failed\n");
-    return false;
-  }
-
-  return true;
-}
-
-int main(int argc, char **argv) {
-
-  unsigned int numThreads = 256;
-  unsigned int numBlocks = 64;
-  unsigned int numData = 10;
-
-  int *atom_arr;
-
-  atom_arr = (int *)syclcompat::malloc(sizeof(int) * numData);
-
-  for (unsigned int i = 0; i < numData; i++) {
-    *syclcompat::get_host_ptr<unsigned int>(atom_arr + i) = 0;
-  }
-
-  // To make the AND and XOR tests generate something other than 0...
-  *syclcompat::get_host_ptr<unsigned int>(atom_arr + 7) =
-      *syclcompat::get_host_ptr<unsigned int>(atom_arr + 9) = 0xff;
-
-  std::cout << "Selected device: "
-            << syclcompat::get_default_queue()
-                   .get_device()
-                   .get_info<sycl::info::device::name>()
-            << "\n";
-
-  {
-    std::pair<syclcompat::buffer_t, size_t> atom_arr_buf_ct0 =
-        syclcompat::get_buffer_and_offset(atom_arr);
-    size_t atom_arr_offset_ct0 = atom_arr_buf_ct0.second;
-    syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
-      auto atom_arr_acc_ct0 =
-          atom_arr_buf_ct0.first.get_access<sycl::access::mode::read_write>(
-              cgh);
-
-      cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, numBlocks) *
-                                             sycl::range<3>(1, 1, numThreads),
-                                         sycl::range<3>(1, 1, numThreads)),
-                       [=](sycl::nd_item<3> item_ct1) {
-                         int *atom_arr_ct0 = (int *)(&atom_arr_acc_ct0[0] +
-                                                     atom_arr_offset_ct0);
-                         atomicKernel(atom_arr_ct0, item_ct1);
-                       });
-    });
-  }
-
-  atomicKernel_CPU(syclcompat::get_host_ptr<int>(atom_arr), numBlocks * numThreads);
-
-  syclcompat::get_current_device().queues_wait_and_throw();
-
-  // Compute & verify reference solution
-  int testResult =
-      verify(syclcompat::get_host_ptr<int>(atom_arr), 2 * numThreads * numBlocks);
-
-  syclcompat::free(atom_arr);
-
-  printf("Atomics test completed, returned %s \n",
-         testResult ? "OK" : "ERROR!");
-  exit(testResult ? 0 : -1);
-}
diff --git a/sycl/test-e2e/syclcompat/common.hpp b/sycl/test-e2e/syclcompat/common.hpp
deleted file mode 100644
index ff840c98209bd..0000000000000
--- a/sycl/test-e2e/syclcompat/common.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  common.hpp
- *
- *  Description:
- *     Common helpers to help with syclcompat functionality tests
- **************************************************************************/
-
-#pragma once
-
-#include <sycl/feature_test.hpp>
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-#include <sycl/ext/oneapi/bfloat16.hpp>
-#endif
-#include <sycl/half_type.hpp>
-#include <tuple>
-
-constexpr double ERROR_TOLERANCE = 1e-5;
-
-template <typename Tuple, typename Func, std::size_t... Is>
-void for_each_type_call(Func &&f, std::index_sequence<Is...>) {
-  (f(std::integral_constant<std::size_t, Is>{}), ...);
-}
-
-template <typename Tuple, typename Func> void instantiate_all_types(Func &&f) {
-  for_each_type_call<Tuple>(
-      std::forward<Func>(f),
-      std::make_index_sequence<std::tuple_size_v<Tuple>>{});
-}
-
-#define INSTANTIATE_ALL_TYPES(tuple, f)                                        \
-  instantiate_all_types<tuple>([](auto index) {                                \
-    using T = std::tuple_element_t<decltype(index)::value, tuple>;             \
-    f<T>();                                                                    \
-  });
-
-#define INSTANTIATE_ALL_CONTAINER_TYPES(tuple, container, f)                   \
-  instantiate_all_types<tuple>([](auto index) {                                \
-    using T = std::tuple_element_t<decltype(index)::value, tuple>;             \
-    f<container, T>();                                                         \
-  });
-
-using value_type_list =
-    std::tuple<char, signed char, unsigned char, int, unsigned int, short,
-               unsigned short, long, unsigned long, long long,
-               unsigned long long, float, double, sycl::half
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-               ,sycl::ext::oneapi::bfloat16
-#endif
->;
-
-using fp_type_list_no_bfloat16 = std::tuple<float, double, sycl::half>;
-
-using fp_type_list = std::tuple<float, double, sycl::half
-
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-                ,sycl::ext::oneapi::bfloat16
-#endif
->;
-
-using marray_type_list =
-    std::tuple<char, signed char, short, int, long, long long, unsigned char,
-               unsigned short, unsigned int, unsigned long, unsigned long long,
-               float, double, sycl::half
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-              , sycl::ext::oneapi::bfloat16
-#endif
->;
-using vec_type_list = std::tuple<int8_t, int16_t, int32_t, int64_t, uint8_t,
-                                 uint16_t, uint32_t, uint64_t, float, double,
-                                 sycl::half
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-              , sycl::ext::oneapi::bfloat16
-#endif
->;
diff --git a/sycl/test-e2e/syclcompat/defs.cpp b/sycl/test-e2e/syclcompat/defs.cpp
deleted file mode 100644
index ea151c21283be..0000000000000
--- a/sycl/test-e2e/syclcompat/defs.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  Defs.cpp
- *
- *  Description:
- *     Syclcompat macros tests
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <cassert>
-#include <iostream>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/defs.hpp>
-
-void test_align() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr std::size_t expected_size = 16;
-  struct __syclcompat_align__(expected_size) {
-    int a;
-    char c;
-  }
-  s;
-  assert(sizeof(s) == expected_size);
-}
-
-void test_check_error() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  auto sycl_error_throw = []() {
-    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
-                          "Expected invalid exception in test_check_error");
-  };
-
-  auto runtime_error_throw = []() {
-    throw std::runtime_error("Expected invalid exception in test_check_error");
-  };
-
-  assert(syclcompat::error_code::success == SYCLCOMPAT_CHECK_ERROR());
-  assert(syclcompat::error_code::backend_error ==
-         SYCLCOMPAT_CHECK_ERROR(sycl_error_throw()));
-  assert(syclcompat::error_code::default_error ==
-         SYCLCOMPAT_CHECK_ERROR(runtime_error_throw()));
-}
-
-void test_version() {
-  // Check the composition of the version int
-  assert(SYCLCOMPAT_MAKE_VERSION(1, 1, 1) == 1001001);
-  assert(SYCLCOMPAT_MAKE_VERSION(9, 0, 0) == 9000000);
-
-  // Check some inequalities
-  assert(SYCLCOMPAT_MAKE_VERSION(0, 1, 1) > SYCLCOMPAT_MAKE_VERSION(0, 1, 0));
-  assert(SYCLCOMPAT_MAKE_VERSION(1, 0, 0) > SYCLCOMPAT_MAKE_VERSION(0, 9, 0));
-}
-
-int main() {
-  test_align();
-  test_check_error();
-  test_version();
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/device/device.cpp b/sycl/test-e2e/syclcompat/device/device.cpp
deleted file mode 100644
index d84e4d9c001bf..0000000000000
--- a/sycl/test-e2e/syclcompat/device/device.cpp
+++ /dev/null
@@ -1,428 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  device.cpp
- *
- *  Description:
- *    Device info and selection tests
- **************************************************************************/
-
-// The original source was under the license below:
-//===-- Device.cpp -  -*- C++ -* ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// RUN: %{build} -Wno-error=user-defined-warnings -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: target-native_cpu
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20142
-
-#include <syclcompat/device.hpp>
-
-#include "device_fixt.hpp"
-
-void test_set_default_queue() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  DeviceExtFixt dev_ext;
-  auto &dev_ = dev_ext.get_dev_ext();
-  sycl::queue old_default_queue = syclcompat::get_default_queue();
-  dev_.set_default_queue(syclcompat::create_queue());
-  assert(*dev_.default_queue() == *dev_.get_saved_queue());
-  assert(*dev_.default_queue() != old_default_queue);
-}
-
-/*
-  Device Tests
-*/
-void test_at_least_one_device() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  DeviceTestsFixt dtf;
-  assert(dtf.get_n_devices() > 0);
-}
-
-// Check the device returned matches the device ID
-void test_matches_id() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  assert(syclcompat::get_device(syclcompat::get_current_device_id()) ==
-         syclcompat::get_current_device());
-}
-
-// Check error on insufficient devices
-void test_not_enough_devices() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  DeviceTestsFixt dtf;
-  try {
-    syclcompat::select_device(dtf.get_n_devices());
-  } catch (std::runtime_error const &e) {
-    std::cout << "Expected SYCL exception caught: " << e.what() << std::endl;
-  }
-}
-
-// Check the default context matches default queue's context
-void test_default_context() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  DeviceTestsFixt dtf;
-  assert(dtf.get_queue().get_context() == syclcompat::get_default_context());
-}
-
-/*
-  Queue Tests
-*/
-void test_make_in_order_queue() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  sycl::queue q = syclcompat::get_default_queue();
-  assert(q.is_in_order());
-}
-
-void test_check_default_device() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  sycl::queue q = syclcompat::get_default_queue();
-  assert(q.get_device() == sycl::device{sycl::default_selector_v});
-}
-
-// Check behaviour of in order & out of order queue construction
-void test_create_queue_arguments() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  sycl::queue q_create_def{syclcompat::create_queue()};
-  assert(q_create_def.is_in_order());
-  sycl::queue q_in_order{syclcompat::create_queue(false, true)};
-  assert(q_in_order.is_in_order());
-  sycl::queue q_out_order{syclcompat::create_queue(false, false)};
-  assert(!q_out_order.is_in_order());
-}
-
-void test_version_parsing_case(const std::string &ver_string,
-                               int expected_major, int expected_minor) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  int major;
-  int minor;
-  syclcompat::detail::parse_version_string(ver_string, major, minor);
-  if (major != expected_major || minor != expected_minor) {
-    std::cout << "Failed comparing " << ver_string << " major " << major
-              << " expected_major " << expected_major << " minor " << minor
-              << " expected_minor " << expected_minor << std::endl;
-    assert(false);
-  }
-  assert(major == expected_major);
-  assert(minor == expected_minor);
-}
-
-void test_version_parsing() {
-  test_version_parsing_case("3.0", 3, 0);
-  test_version_parsing_case("3.0 NEO", 3, 0);
-  test_version_parsing_case("OpenCL 3.0 NEO", 3, 0);
-  test_version_parsing_case("OpenCL 3.0 (Build 0)", 3, 0);
-  test_version_parsing_case("8.6", 8, 6);
-  test_version_parsing_case("8.0", 8, 0);
-  test_version_parsing_case("7.5", 7, 5);
-  test_version_parsing_case("1.3", 1, 3);
-  test_version_parsing_case("11.4", 11, 4);
-  test_version_parsing_case("0.1", 0, 1);
-  test_version_parsing_case("gfx1030", 1030, 0);
-}
-
-// We have *some* constraints on the major version that we can check
-void test_major_version(sycl::device &dev, int major) {
-  auto backend = dev.get_backend();
-  if (backend == sycl::backend::opencl) {
-    assert(major == 1 || major == 3);
-  } else if (backend == sycl::backend::ext_oneapi_level_zero ||
-             backend == sycl::backend::ext_oneapi_cuda) {
-    assert(major < 99);
-  }
-}
-
-/*
-  Device Extension Tests
-*/
-void test_device_ext_api() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  DeviceExtFixt dev_ext;
-  auto &dev_ = dev_ext.get_dev_ext();
-  dev_.is_native_host_atomic_supported();
-  auto major = dev_.get_major_version();
-  test_major_version(dev_, major);
-  dev_.get_minor_version();
-  dev_.get_max_clock_frequency();
-  dev_.get_integrated();
-
-  int max_cu = dev_.get_max_compute_units();
-  int max_wg_size = dev_.get_max_work_group_size();
-  size_t global_mem_size = dev_.get_global_mem_size();
-
-  syclcompat::device_info Info;
-  dev_.get_device_info(Info);
-  assert(Info.get_max_compute_units() == max_cu);
-  assert(Info.get_max_work_group_size() == max_wg_size);
-  assert(Info.get_global_mem_size() == global_mem_size);
-
-  dev_.reset();
-  auto QueuePtr = dev_.default_queue();
-  dev_.queues_wait_and_throw();
-  QueuePtr = dev_.create_queue();
-  dev_.destroy_queue(QueuePtr);
-  QueuePtr = dev_.create_queue();
-  dev_.set_saved_queue(QueuePtr);
-  QueuePtr = dev_.get_saved_queue();
-  auto Context = dev_.get_context();
-}
-
-void test_device_api() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  DeviceExtFixt dev_ext;
-  auto &dev_ = dev_ext.get_dev_ext();
-  auto major = get_major_version(dev_);
-  test_major_version(dev_, major);
-  get_minor_version(dev_);
-}
-
-void test_default_saved_queue() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  DeviceExtFixt dev_ext;
-  auto &dev_ = dev_ext.get_dev_ext();
-  assert(*dev_.default_queue() == *dev_.get_saved_queue());
-}
-
-void test_saved_queue() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  DeviceExtFixt dev_ext;
-  auto &dev_ = dev_ext.get_dev_ext();
-  auto q = *dev_.create_queue();
-  dev_.set_saved_queue(&q);
-  assert(q == *dev_.get_saved_queue());
-}
-
-// Check reset() resets the queues etc
-void test_reset() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  DeviceExtFixt dev_ext;
-  auto &dev_ = dev_ext.get_dev_ext();
-  auto q = *dev_.create_queue();
-  dev_.set_saved_queue(&q);
-  dev_.reset();
-  assert(q != *dev_.get_saved_queue());
-}
-
-void test_reset_arguments() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  DeviceExtFixt dev_ext;
-  auto &dev_ = dev_ext.get_dev_ext();
-
-  dev_.reset(false, false);
-  assert(!dev_.get_saved_queue()->is_in_order());
-
-  dev_.reset(false, true);
-  assert(dev_.get_saved_queue()->is_in_order());
-}
-
-void test_device_info_api() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  syclcompat::device_info Info;
-  const char *Name = "DEVNAME";
-  std::array<unsigned char, 16> uuid;
-  uuid.fill('0');
-  sycl::range<3> max_work_item_sizes;
-
-  Info.set_name(Name);
-  Info.set_max_work_item_sizes(max_work_item_sizes);
-  Info.set_major_version(1);
-  Info.set_minor_version(1);
-  Info.set_integrated(1);
-  Info.set_max_clock_frequency(1000);
-  Info.set_max_compute_units(32);
-  Info.set_global_mem_size(1000);
-  Info.set_local_mem_size(1000);
-  Info.set_max_work_group_size(32);
-  Info.set_max_sub_group_size(16);
-  Info.set_max_work_items_per_compute_unit(16);
-
-  Info.set_host_unified_memory(true);
-  Info.set_memory_clock_rate(1000);
-  Info.set_max_register_size_per_work_group(1000);
-  Info.set_device_id(0);
-  Info.set_uuid(uuid);
-  Info.set_global_mem_cache_size(1000);
-
-  assert(!strcmp(Info.get_name(), Name));
-  assert(Info.get_max_work_item_sizes() == max_work_item_sizes);
-  assert(Info.get_minor_version() == 1);
-  assert(Info.get_integrated() == 1);
-  assert(Info.get_max_clock_frequency() == 1000);
-  assert(Info.get_max_compute_units() == 32);
-  assert(Info.get_max_work_group_size() == 32);
-  assert(Info.get_max_sub_group_size() == 16);
-  assert(Info.get_max_work_items_per_compute_unit() == 16);
-  assert(Info.get_global_mem_size() == 1000);
-  assert(Info.get_local_mem_size() == 1000);
-
-  uuid.fill('0'); // set_uuid uses std::move
-  assert(Info.get_host_unified_memory());
-  assert(Info.get_memory_clock_rate() == 1000);
-  assert(Info.get_max_register_size_per_work_group() == 1000);
-  assert(Info.get_device_id() == 0);
-  assert(Info.get_uuid() == uuid);
-  assert(Info.get_global_mem_cache_size() == 1000);
-}
-
-void test_image_max_attrs() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  syclcompat::device_info info;
-
-  int _image1d_max = 1;
-  int _image2d_max[2] = {2, 3};
-  int _image3d_max[3] = {4, 5, 6};
-
-  info.set_image1d_max(_image1d_max);
-  info.set_image2d_max(_image2d_max[0], _image2d_max[1]);
-  info.set_image3d_max(_image3d_max[0], _image3d_max[1], _image3d_max[2]);
-
-  assert(info.get_image1d_max() == _image1d_max);
-  assert(info.get_image2d_max()[0] == _image2d_max[0]);
-  assert(info.get_image2d_max()[1] == _image2d_max[1]);
-  assert(info.get_image3d_max()[0] == _image3d_max[0]);
-  assert(info.get_image3d_max()[1] == _image3d_max[1]);
-  assert(info.get_image3d_max()[2] == _image3d_max[2]);
-
-  DeviceExtFixt dev_ext;
-  auto &dev_ = dev_ext.get_dev_ext();
-
-  info.set_image1d_max(0);
-  info.set_image2d_max(0, 0);
-  info.set_image3d_max(0, 0, 0);
-
-  // SYCL guarantees at least a certain minimum value if the device has
-  // aspect::image
-  if (!dev_.has(sycl::aspect::image)) {
-    std::cout << "  Partial skip: device does not have sycl::aspect::image."
-              << std::endl;
-    return;
-  }
-  dev_.get_device_info(info);
-  // We only need to ensure the value is modified.
-  assert(info.get_image1d_max() > 0);
-  assert(info.get_image2d_max()[0] > 0);
-  assert(info.get_image2d_max()[1] > 0);
-  assert(info.get_image3d_max()[0] > 0);
-  assert(info.get_image3d_max()[1] > 0);
-  assert(info.get_image3d_max()[2] > 0);
-}
-
-void test_max_nd_range() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  syclcompat::device_info info;
-
-  int size_array[3] = {1, 2, 3};
-  info.set_max_nd_range_size(size_array);
-
-  assert(info.get_max_nd_range_size()[0] == size_array[0]);
-  assert(info.get_max_nd_range_size()[1] == size_array[1]);
-  assert(info.get_max_nd_range_size()[2] == size_array[2]);
-
-  DeviceExtFixt dev_ext;
-  auto &dev = dev_ext.get_dev_ext();
-  dev.get_device_info(info);
-
-  int size_array_zeros[3] = {0, 0, 0};
-  info.set_max_nd_range_size(size_array_zeros);
-
-#ifdef SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY
-  // According to the extension values are > 1 unless info::device_type is
-  // info::device_type::custom.
-  if (dev.get_info<sycl::info::device::device_type>() ==
-      sycl::info::device_type::custom) {
-    std::cout << "  Skipping due to custom sycl::info::device_type::custom."
-              << std::endl;
-    return;
-  }
-
-  info.set_max_nd_range_size(
-      dev.get_info<
-          sycl::ext::oneapi::experimental::info::device::max_work_groups<3>>());
-  assert(info.get_max_nd_range_size()[0] > 0);
-  assert(info.get_max_nd_range_size()[1] > 0);
-  assert(info.get_max_nd_range_size()[2] > 0);
-#else
-  int expected = 0x7FFFFFFF;
-  assert(info.get_max_nd_range_size()[0] == expected);
-  assert(info.get_max_nd_range_size()[1] == expected);
-  assert(info.get_max_nd_range_size()[2] == expected);
-#endif
-}
-
-void test_list_devices() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  DeviceTestsFixt dtf;
-
-  // Redirect std::cout to count new lines
-  CountingStream countingBuf(std::cout.rdbuf());
-  std::streambuf *orig_buf = std::cout.rdbuf();
-  std::cout.rdbuf(&countingBuf);
-
-  syclcompat::list_devices();
-
-  // Restore back std::cout
-  std::cout.rdbuf(orig_buf);
-
-  // Expected one line per device
-  assert(countingBuf.get_line_count() == dtf.get_n_devices());
-}
-
-void test_device_count() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  unsigned int count = syclcompat::device_count();
-  assert(count > 0);
-}
-
-void test_get_device_id() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::device dev = syclcompat::get_device(0);
-  unsigned int id = syclcompat::get_device_id(dev);
-  assert(id == 0);
-}
-
-int main() {
-  test_at_least_one_device();
-  test_matches_id();
-  test_not_enough_devices();
-  test_set_default_queue();
-  test_default_context();
-  test_make_in_order_queue();
-  test_check_default_device();
-  test_create_queue_arguments();
-  test_device_ext_api();
-  test_device_api();
-  test_default_saved_queue();
-  test_saved_queue();
-  test_reset();
-  test_device_info_api();
-  test_version_parsing();
-  test_image_max_attrs();
-  test_max_nd_range();
-  test_list_devices();
-  test_device_count();
-  test_get_device_id();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/device/device_filter.cpp b/sycl/test-e2e/syclcompat/device/device_filter.cpp
deleted file mode 100644
index d3e7967bc4243..0000000000000
--- a/sycl/test-e2e/syclcompat/device/device_filter.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  device_filter.cpp
- *
- *  Description:
- *    Device filtering tests
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <syclcompat/device.hpp>
-
-void test_filtering_existing_device() {
-  auto &dev = syclcompat::get_current_device();
-  std::string dev_name = dev.get_info<sycl::info::device::name>();
-
-  syclcompat::filter_device({dev_name});
-  try {
-    syclcompat::get_device_id(dev);
-  } catch (std::runtime_error const &e) {
-    std::cout << "  Unexpected SYCL exception caught: " << e.what()
-              << std::endl;
-    assert(0);
-  }
-
-  // Checks for a substring of the device as well
-  std::string dev_substr = dev_name.substr(1, dev_name.find(" ") + 2);
-  syclcompat::filter_device({dev_substr});
-  try {
-    syclcompat::get_device_id(dev);
-  } catch (std::runtime_error const &e) {
-    std::cout << "  Unexpected SYCL exception caught: " << e.what()
-              << std::endl;
-    assert(0);
-  }
-}
-
-void test_filter_devices() {
-  auto &dev = syclcompat::get_current_device();
-
-  assert(syclcompat::detail::dev_mgr::instance().device_count() > 0);
-
-  syclcompat::filter_device({"NON-EXISTENT DEVICE"});
-  assert(syclcompat::detail::dev_mgr::instance().device_count() == 0);
-
-  try {
-    syclcompat::get_device_id(dev);
-    assert(0);
-  } catch (std::runtime_error const &e) {
-    std::cout << "  Expected SYCL exception caught: " << e.what() << std::endl;
-  }
-}
-
-int main() {
-  // syclcompat::dev_mgr is a singleton, so any changes to the device list is
-  // permanent between tests. Test isolated instead of relying on it being the
-  // last test in a different test suite.
-  test_filtering_existing_device();
-
-  test_filter_devices();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/device/device_fixt.hpp b/sycl/test-e2e/syclcompat/device/device_fixt.hpp
deleted file mode 100644
index 7588eb71dd5b9..0000000000000
--- a/sycl/test-e2e/syclcompat/device/device_fixt.hpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  device_fixt.h
- *
- *  Description:
- *     Fixture helpers for to tests the extended device functionality
- **************************************************************************/
-
-#pragma once
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat/device.hpp>
-
-class DeviceTestsFixt {
-protected:
-  unsigned int n_devices{};
-  sycl::queue def_q_;
-
-public:
-  DeviceTestsFixt()
-      : n_devices{syclcompat::device_count()},
-        def_q_{syclcompat::get_default_queue()} {}
-
-  unsigned int get_n_devices() { return n_devices; }
-  sycl::queue get_queue() { return def_q_; }
-};
-
-class DeviceExtFixt {
-protected:
-  syclcompat::device_ext &dev_;
-
-public:
-  DeviceExtFixt() : dev_{syclcompat::get_current_device()} { SetUp(); }
-
-  void SetUp() { dev_.reset(); }
-
-  syclcompat::device_ext &get_dev_ext() { return dev_; }
-};
-
-// Helper for counting the output lines of syclcompat::list_devices
-// Used to override std::cout
-class CountingStream : public std::streambuf {
-public:
-  CountingStream(std::streambuf *buf) : buf(buf), line_count(0) {}
-
-  int overflow(int c) override {
-    if (c == '\n') {
-      ++line_count;
-    }
-    return buf->sputc(c);
-  }
-
-  std::streamsize xsputn(const char_type *s, std::streamsize count) override {
-    for (std::streamsize i = 0; i < count; ++i) {
-      if (s[i] == '\n') {
-        ++line_count;
-      }
-    }
-    return buf->sputn(s, count);
-  }
-
-  int get_line_count() const { return line_count; }
-
-private:
-  std::streambuf *buf;
-  int line_count;
-};
diff --git a/sycl/test-e2e/syclcompat/device/device_profiling.cpp b/sycl/test-e2e/syclcompat/device/device_profiling.cpp
deleted file mode 100644
index b6f7aefc33f7e..0000000000000
--- a/sycl/test-e2e/syclcompat/device/device_profiling.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  device_profiling.cpp
- *
- *  Description:
- *    Tests for the enable_profiling property paths
- **************************************************************************/
-
-// RUN: %{build} -DSYCLCOMPAT_PROFILING_ENABLED=1 -o %t-profiling.out
-// RUN: %{run} %t-profiling.out
-// RUN: %{build} -o %t-no-profiling.out
-// RUN: %{run} %t-no-profiling.out
-
-#include <syclcompat/device.hpp>
-
-#ifdef SYCLCOMPAT_PROFILING_ENABLED
-void test_event_profiling() {
-  sycl::queue q = syclcompat::get_default_queue();
-
-  if (!q.get_device().has(sycl::aspect::queue_profiling)) {
-    std::cout << "Device does not have aspect::queue_profiling, skipping."
-              << std::endl;
-    return;
-  }
-
-  assert(q.has_property<sycl::property::queue::enable_profiling>());
-
-  q = sycl::queue{q.get_device(), sycl::property::queue::enable_profiling()};
-  auto event = q.submit([&](sycl::handler &cgh) { cgh.single_task([=]() {}); });
-  event.get_profiling_info<sycl::info::event_profiling::command_end>();
-}
-#else
-void test_no_event_profiling() {
-  sycl::queue q = syclcompat::get_default_queue();
-
-  if (!q.get_device().has(sycl::aspect::queue_profiling)) {
-    std::cout << "Device does not have aspect::queue_profiling, skipping."
-              << std::endl;
-    return;
-  }
-
-  assert(!q.has_property<sycl::property::queue::enable_profiling>());
-}
-#endif
-
-int main() {
-#ifdef SYCLCOMPAT_PROFILING_ENABLED
-  test_event_profiling();
-#else
-  test_no_event_profiling();
-#endif
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/device/device_threaded.cpp b/sycl/test-e2e/syclcompat/device/device_threaded.cpp
deleted file mode 100644
index 37735548168c7..0000000000000
--- a/sycl/test-e2e/syclcompat/device/device_threaded.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  device_threaded.cpp
- *
- *  Description:
- *    Device info and selection tests
- **************************************************************************/
-
-// The original source was under the license below:
-//===-- Device.cpp -  -*- C++ -* ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// RUN: %{build} %threads_lib -o %t.out
-// RUN: %{run} %t.out
-
-#include <syclcompat/device.hpp>
-
-#include "device_fixt.hpp"
-
-// Check a thread is able to select a non-default device
-void test_device_select() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  DeviceTestsFixt dtf;
-  if (dtf.get_n_devices() > 1) {
-    constexpr unsigned int TARGET_DEV = 1;
-    unsigned int thread_dev_id{};
-    std::thread other_thread{[&]() {
-      syclcompat::select_device(TARGET_DEV);
-      thread_dev_id = syclcompat::get_current_device_id();
-    }};
-    other_thread.join();
-    assert(thread_dev_id == TARGET_DEV);
-  } else {
-    std::cout << "  Skipping, only doable with multiple devices" << std::endl;
-  }
-}
-
-// Check multiple threads get same device by default
-void test_threads() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  unsigned int thread_dev_id{};
-  std::thread other_thread{
-      [&]() { thread_dev_id = syclcompat::get_current_device_id(); }};
-  other_thread.join();
-  assert(thread_dev_id == syclcompat::get_current_device_id());
-}
-
-int main() {
-  test_device_select();
-  test_threads();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/dim.cpp b/sycl/test-e2e/syclcompat/dim.cpp
deleted file mode 100644
index 32bb4534f0da7..0000000000000
--- a/sycl/test-e2e/syclcompat/dim.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  Dim.cpp
- *
- *  Description:
- *     dim3 tests
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <cassert>
-#include <sycl/detail/core.hpp>
-#include <syclcompat/dims.hpp>
-
-int main() {
-  std::cout << "Testing Construct" << std::endl;
-  {
-    syclcompat::dim3 d3(512);
-    assert(d3.x == 512);
-    assert(d3.y == 1);
-    assert(d3.z == 1);
-  }
-  std::cout << "Testing Empty Construct" << std::endl;
-  {
-    syclcompat::dim3 d3;
-    assert(d3.x == 1);
-    assert(d3.y == 1);
-    assert(d3.z == 1);
-  }
-  std::cout << "Testing Empty Construct & Update" << std::endl;
-  {
-    syclcompat::dim3 d3;
-    d3.x = 1;
-    d3.y = 2;
-    d3.z = 3;
-
-    assert(d3.x == 1);
-    assert(d3.y == 2);
-    assert(d3.z == 3);
-  }
-  std::cout << "Testing Empty Construct & Update 2" << std::endl;
-  {
-    syclcompat::dim3 d3;
-    d3.x = 32;
-
-    assert(d3.x == 32);
-    assert(d3.y == 1);
-    assert(d3.z == 1);
-  }
-  std::cout << "Testing Convert" << std::endl;
-  {
-    syclcompat::dim3 d3(512);
-    sycl::range<3> r3 = d3;
-    assert(d3.x == r3[2]);
-    assert(d3.y == r3[1]);
-    assert(d3.z == r3[0]);
-
-    sycl::range<2> r2{1, 2};
-    syclcompat::dim3 d3_from_range2(r2);
-    assert(d3_from_range2.x == 2);
-    assert(d3_from_range2.y == 1);
-    assert(d3_from_range2.z == 1);
-
-    sycl::range<1> r1{2};
-    syclcompat::dim3 d3_from_range1(r1);
-    assert(d3_from_range2.x == 2);
-    assert(d3_from_range2.y == 1);
-    assert(d3_from_range2.z == 1);
-  }
-  std::cout << "Testing ConvertBack" << std::endl;
-  // Dimension-dependent conversions and
-  // check that exceptions are thrown when trying to convert
-  // higher dimensional dim3 to sycl::range
-  {
-    syclcompat::dim3 dim_3D(512, 4, 2);
-
-    sycl::range<3> range_3D{dim_3D};
-    sycl::range<3> exp_3D{2, 4, 512};
-    assert(range_3D == exp_3D);
-
-    try {
-      sycl::range<2> range_2D{dim_3D};
-    } catch (std::invalid_argument const &e) {
-      std::cout << "Expected SYCL exception caught: " << e.what();
-    }
-
-    try {
-      sycl::range<1> range_1D{dim_3D};
-    } catch (std::invalid_argument const &e) {
-      std::cout << "Expected SYCL exception caught: " << e.what();
-    }
-  }
-  {
-    syclcompat::dim3 dim_2D(512, 2);
-
-    sycl::range<3> range_3D{dim_2D};
-    sycl::range<3> exp_3D{1, 2, 512};
-    assert(range_3D == exp_3D);
-
-    sycl::range<2> range_2D{dim_2D};
-    sycl::range<2> exp_2D{2, 512};
-    assert(range_2D == exp_2D);
-
-    try {
-      sycl::range<1> range_1D{dim_2D};
-    } catch (std::invalid_argument const &e) {
-      std::cout << "Expected SYCL exception caught: " << e.what();
-    }
-  }
-  {
-    syclcompat::dim3 dim_1D{512};
-    sycl::range<3> range_3D{dim_1D};
-    sycl::range<3> exp_3D{1, 1, 512};
-    assert(range_3D == exp_3D);
-
-    sycl::range<2> range_2D{dim_1D};
-    sycl::range<2> exp_2D{1, 512};
-    assert(range_2D == exp_2D);
-
-    sycl::range<1> range_1D{dim_1D};
-    sycl::range<1> exp_1D{512};
-    assert(range_1D == exp_1D);
-  }
-
-  // Check that an nd_range is correctly constructed
-  // from pair of dim3
-  std::cout << "Testing ConvertMulti" << std::endl;
-  {
-    syclcompat::dim3 threads(32, 4, 2);
-    syclcompat::dim3 grid(4, 1, 1);
-
-    sycl::nd_range<3> range{grid * threads, threads};
-
-    assert(range.get_global_range()[0] == 2);
-    assert(range.get_global_range()[1] == 4);
-    assert(range.get_global_range()[2] == 128);
-    assert(range.get_local_range()[0] == 2);
-    assert(range.get_local_range()[1] == 4);
-    assert(range.get_local_range()[2] == 32);
-  }
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/group_utils/exchange.cpp b/sycl/test-e2e/syclcompat/group_utils/exchange.cpp
deleted file mode 100644
index 430dc4e3a2c8f..0000000000000
--- a/sycl/test-e2e/syclcompat/group_utils/exchange.cpp
+++ /dev/null
@@ -1,305 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  exchange.cpp
- *
- *  Description:
- *    Group exchange API tests
- **************************************************************************/
-
-// ===------- exchange.cpp---------------------- *- C++ -* ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <iostream>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/group_utils.hpp>
-
-void StripedToBlockedKernel(int *d_data, const sycl::nd_item<3> &item_ct1,
-                            uint8_t *load_temp_storage,
-                            uint8_t *store_temp_storage,
-                            uint8_t *temp_storage) {
-  using BlockLoadT = syclcompat::group::group_load<
-      int, 4, syclcompat::group::group_load_algorithm::striped>;
-  using BlockStoreT = syclcompat::group::group_store<
-      int, 4, syclcompat::group::group_store_algorithm::striped>;
-  typedef syclcompat::group::exchange<int, 4> BlockExchange;
-
-  int thread_data[4];
-  BlockLoadT(load_temp_storage).load(item_ct1, d_data, thread_data);
-  BlockExchange(temp_storage)
-      .striped_to_blocked(item_ct1, thread_data, thread_data);
-  BlockStoreT(store_temp_storage).store(item_ct1, d_data, thread_data);
-}
-
-void BlockedToStripedKernel(int *d_data, const sycl::nd_item<3> &item_ct1,
-                            uint8_t *temp_storage) {
-
-  typedef syclcompat::group::exchange<int, 4> BlockExchange;
-
-  int thread_data[4];
-  syclcompat::group::load_direct_striped(item_ct1, d_data, thread_data);
-  BlockExchange(temp_storage)
-      .blocked_to_striped(item_ct1, thread_data, thread_data);
-  syclcompat::group::store_direct_striped(item_ct1, d_data, thread_data);
-}
-
-void ScatterToBlockedKernel(int *d_data, int *d_rank,
-                            const sycl::nd_item<3> &item_ct1,
-                            uint8_t *temp_storage) {
-
-  using BlockExchange = syclcompat::group::exchange<int, 4>;
-
-  int thread_data[4], thread_rank[4];
-  syclcompat::group::load_direct_striped(item_ct1, d_data, thread_data);
-  syclcompat::group::load_direct_striped(item_ct1, d_rank, thread_rank);
-  BlockExchange(temp_storage)
-      .scatter_to_blocked(item_ct1, thread_data, thread_rank);
-  syclcompat::group::store_direct_striped(item_ct1, d_data, thread_data);
-}
-
-void ScatterToStripedKernel(int *d_data, int *d_rank,
-                            const sycl::nd_item<3> &item_ct1,
-                            uint8_t *temp_storage) {
-
-  using BlockExchange = syclcompat::group::exchange<int, 4>;
-
-  int thread_data[4], thread_rank[4];
-  syclcompat::group::load_direct_striped(item_ct1, d_data, thread_data);
-  syclcompat::group::load_direct_striped(item_ct1, d_rank, thread_rank);
-  BlockExchange(temp_storage)
-      .scatter_to_striped(item_ct1, thread_data, thread_rank);
-  syclcompat::group::store_direct_striped(item_ct1, d_data, thread_data);
-}
-
-bool test_striped_to_blocked() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int *d_data;
-  d_data = sycl::malloc_shared<int>(512, q_ct1);
-  for (int i = 0; i < 128; i++) {
-    d_data[4 * i + 0] = i;
-    d_data[4 * i + 1] = i + 1 * 128;
-    d_data[4 * i + 2] = i + 2 * 128;
-    d_data[4 * i + 3] = i + 3 * 128;
-  }
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> load_temp_storage_acc(
-        syclcompat::group::group_load<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-    sycl::local_accessor<uint8_t, 1> store_temp_storage_acc(
-        syclcompat::group::group_store<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::exchange<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          StripedToBlockedKernel(d_data, item_ct1, &load_temp_storage_acc[0],
-                                 &store_temp_storage_acc[0],
-                                 &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-
-  for (int i = 0; i < 512; ++i) {
-    if (d_data[i] != i) {
-      std::cout << "test_striped_to_blocked failed\n";
-      std::ostream_iterator<int> Iter(std::cout, ", ");
-      std::copy(d_data, d_data + 512, Iter);
-      std::cout << std::endl;
-      sycl::free(d_data, q_ct1);
-      return false;
-    }
-  }
-  sycl::free(d_data, q_ct1);
-  std::cout << "test_striped_to_blocked pass\n";
-  return true;
-}
-
-bool test_blocked_to_striped() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int *d_data, expected[512];
-  d_data = sycl::malloc_shared<int>(512, q_ct1);
-  for (int i = 0; i < 512; ++i)
-    d_data[i] = i;
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::exchange<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          BlockedToStripedKernel(d_data, item_ct1, &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-
-  for (int i = 0; i < 128; i++) {
-    expected[4 * i + 0] = i;
-    expected[4 * i + 1] = i + 1 * 128;
-    expected[4 * i + 2] = i + 2 * 128;
-    expected[4 * i + 3] = i + 3 * 128;
-  }
-
-  for (int i = 0; i < 512; ++i) {
-    if (expected[i] != d_data[i]) {
-      std::cout << "test_blocked_to_striped failed\n";
-      std::ostream_iterator<int> Iter(std::cout, ", ");
-      std::copy(d_data, d_data + 512, Iter);
-      std::cout << std::endl;
-      sycl::free(d_data, q_ct1);
-      return false;
-    }
-  }
-  sycl::free(d_data, q_ct1);
-  std::cout << "test_blocked_to_striped pass\n";
-  return true;
-}
-
-bool test_scatter_to_blocked() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int *d_data, *d_rank;
-  d_data = sycl::malloc_shared<int>(512, q_ct1);
-  d_rank = sycl::malloc_shared<int>(512, q_ct1);
-  for (int i = 0; i < 128; i++) {
-    d_data[4 * i + 0] = i;
-    d_data[4 * i + 1] = i + 1 * 128;
-    d_data[4 * i + 2] = i + 2 * 128;
-    d_data[4 * i + 3] = i + 3 * 128;
-    d_rank[4 * i + 0] = i * 4 + 0;
-    d_rank[4 * i + 1] = i * 4 + 1;
-    d_rank[4 * i + 2] = i * 4 + 2;
-    d_rank[4 * i + 3] = i * 4 + 3;
-  }
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::exchange<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          ScatterToBlockedKernel(d_data, d_rank, item_ct1,
-                                 &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-
-  for (int i = 0; i < 512; ++i) {
-    if (d_data[i] != i) {
-      std::cout << "test_scatter_to_blocked failed\n";
-      std::ostream_iterator<int> Iter(std::cout, ", ");
-      std::copy(d_data, d_data + 512, Iter);
-      std::cout << std::endl;
-      sycl::free(d_data, q_ct1);
-      sycl::free(d_rank, q_ct1);
-      return false;
-    }
-  }
-  sycl::free(d_data, q_ct1);
-  sycl::free(d_rank, q_ct1);
-  std::cout << "test_scatter_to_blocked pass\n";
-  return true;
-}
-
-bool test_scatter_to_striped() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int *d_data, *d_rank, expected[512];
-  d_data = sycl::malloc_shared<int>(512, q_ct1);
-  d_rank = sycl::malloc_shared<int>(512, q_ct1);
-  for (int i = 0; i < 512; ++i)
-    d_data[i] = i;
-
-  d_rank[0] = 0;
-  d_rank[128] = 1;
-  d_rank[256] = 2;
-  d_rank[384] = 3;
-  for (int i = 1; i < 128; i++) {
-    d_rank[0 * 128 + i] = d_rank[0 * 128 + i - 1] + 4;
-    d_rank[1 * 128 + i] = d_rank[1 * 128 + i - 1] + 4;
-    d_rank[2 * 128 + i] = d_rank[2 * 128 + i - 1] + 4;
-    d_rank[3 * 128 + i] = d_rank[3 * 128 + i - 1] + 4;
-  }
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::exchange<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          ScatterToStripedKernel(d_data, d_rank, item_ct1,
-                                 &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-
-  for (int i = 0; i < 128; i++) {
-    expected[4 * i + 0] = i + 0 * 128;
-    expected[4 * i + 1] = i + 1 * 128;
-    expected[4 * i + 2] = i + 2 * 128;
-    expected[4 * i + 3] = i + 3 * 128;
-  }
-
-  for (int i = 0; i < 512; ++i) {
-    if (expected[i] != d_data[i]) {
-      std::cout << "test_blocked_to_striped failed\n";
-      std::ostream_iterator<int> Iter(std::cout, ", ");
-      std::copy(d_data, d_data + 512, Iter);
-      std::cout << std::endl;
-      sycl::free(d_data, q_ct1);
-      sycl::free(d_rank, q_ct1);
-      return false;
-    }
-  }
-  sycl::free(d_data, q_ct1);
-  sycl::free(d_rank, q_ct1);
-  std::cout << "test_blocked_to_striped pass\n";
-  return true;
-}
-
-int main() {
-  return !(test_blocked_to_striped() && test_striped_to_blocked() &&
-           test_scatter_to_blocked() && test_scatter_to_striped());
-}
diff --git a/sycl/test-e2e/syclcompat/group_utils/radix_sort.cpp b/sycl/test-e2e/syclcompat/group_utils/radix_sort.cpp
deleted file mode 100644
index 949fc45042f6d..0000000000000
--- a/sycl/test-e2e/syclcompat/group_utils/radix_sort.cpp
+++ /dev/null
@@ -1,640 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  radix_sort.cpp
- *
- *  Description:
- *    Group radix sort API tests
- **************************************************************************/
-
-// ===------- radix_sort.cpp-------------------- *- C++ -* ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// UNSUPPORTED: spirv-backend
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/17400
-
-#include <iostream>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/group_utils.hpp>
-#include <syclcompat/memory.hpp>
-
-void Sort(int *data, const sycl::nd_item<3> &item_ct1, uint8_t *temp_storage) {
-
-  using BlockRadixSort = syclcompat::group::group_radix_sort<int, 4>;
-
-  int thread_keys[4];
-  syclcompat::group::load_direct_blocked(item_ct1, data, thread_keys);
-  BlockRadixSort(temp_storage).sort(item_ct1, thread_keys);
-  syclcompat::group::store_direct_blocked(item_ct1, data, thread_keys);
-}
-
-void SortDescending(int *data, const sycl::nd_item<3> &item_ct1,
-                    uint8_t *temp_storage) {
-
-  using BlockRadixSort = syclcompat::group::group_radix_sort<int, 4>;
-
-  int thread_keys[4];
-  syclcompat::group::load_direct_blocked(item_ct1, data, thread_keys);
-  BlockRadixSort(temp_storage).sort_descending(item_ct1, thread_keys);
-  syclcompat::group::store_direct_blocked(item_ct1, data, thread_keys);
-}
-
-void SortBlockedToStriped(int *data, const sycl::nd_item<3> &item_ct1,
-                          uint8_t *load_temp_storage,
-                          uint8_t *store_temp_storage, uint8_t *temp_storage) {
-  using BlockLoadT = syclcompat::group::group_load<int, 4>;
-  using BlockStoreT = syclcompat::group::group_store<int, 4>;
-  using BlockRadixSort = syclcompat::group::group_radix_sort<int, 4>;
-
-  int thread_keys[4];
-  BlockLoadT(load_temp_storage).load(item_ct1, data, thread_keys);
-  BlockRadixSort(temp_storage).sort_blocked_to_striped(item_ct1, thread_keys);
-  BlockStoreT(store_temp_storage).store(item_ct1, data, thread_keys);
-}
-
-void SortDescendingBlockedToStriped(int *data, const sycl::nd_item<3> &item_ct1,
-                                    uint8_t *load_temp_storage,
-                                    uint8_t *store_temp_storage,
-                                    uint8_t *temp_storage) {
-  using BlockLoadT = syclcompat::group::group_load<
-      int, 4, syclcompat::group::group_load_algorithm::blocked>;
-  using BlockStoreT = syclcompat::group::group_store<
-      int, 4, syclcompat::group::group_store_algorithm::blocked>;
-  using BlockRadixSort = syclcompat::group::group_radix_sort<int, 4>;
-
-  int thread_keys[4];
-  BlockLoadT(load_temp_storage).load(item_ct1, data, thread_keys);
-  BlockRadixSort(temp_storage)
-      .sort_descending_blocked_to_striped(item_ct1, thread_keys);
-  BlockStoreT(store_temp_storage).store(item_ct1, data, thread_keys);
-}
-
-void SortBit(int *data, const sycl::nd_item<3> &item_ct1,
-             uint8_t *temp_storage) {
-
-  using BlockRadixSort = syclcompat::group::group_radix_sort<int, 4>;
-
-  int thread_keys[4];
-  syclcompat::group::load_direct_blocked(item_ct1, data, thread_keys);
-  BlockRadixSort(temp_storage).sort(item_ct1, thread_keys, 4, 16);
-  syclcompat::group::store_direct_blocked(item_ct1, data, thread_keys);
-}
-
-void SortDescendingBit(int *data, const sycl::nd_item<3> &item_ct1,
-                       uint8_t *temp_storage) {
-
-  using BlockRadixSort = syclcompat::group::group_radix_sort<int, 4>;
-
-  int thread_keys[4];
-  syclcompat::group::load_direct_blocked(item_ct1, data, thread_keys);
-  BlockRadixSort(temp_storage).sort_descending(item_ct1, thread_keys, 4, 16);
-  syclcompat::group::store_direct_blocked(item_ct1, data, thread_keys);
-}
-
-void SortBlockedToStripedBit(int *data, const sycl::nd_item<3> &item_ct1,
-                             uint8_t *temp_storage) {
-
-  using BlockRadixSort = syclcompat::group::group_radix_sort<int, 4>;
-
-  int thread_keys[4];
-  syclcompat::group::load_direct_blocked(item_ct1, data, thread_keys);
-
-  BlockRadixSort(temp_storage)
-      .sort_blocked_to_striped(item_ct1, thread_keys, 4, 16);
-  syclcompat::group::store_direct_blocked(item_ct1, data, thread_keys);
-}
-
-void SortDescendingBlockedToStripedBit(int *data,
-                                       const sycl::nd_item<3> &item_ct1,
-                                       uint8_t *temp_storage) {
-
-  using BlockRadixSort = syclcompat::group::group_radix_sort<int, 4>;
-
-  int thread_keys[4];
-  syclcompat::group::load_direct_blocked(item_ct1, data, thread_keys);
-
-  BlockRadixSort(temp_storage)
-      .sort_descending_blocked_to_striped(item_ct1, thread_keys, 4, 16);
-  syclcompat::group::store_direct_blocked(item_ct1, data, thread_keys);
-}
-
-template <typename T, int N> void print_array(T (&arr)[N]) {
-  for (int i = 0; i < N; ++i)
-    printf("%d%c", arr[i], (i == N - 1 ? '\n' : ','));
-}
-
-bool test_sort() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int data[512] = {0}, *d_data = nullptr;
-  d_data = sycl::malloc_device<int>(512, q_ct1);
-  for (int i = 0, x = 0, y = 511; i < 128; ++i) {
-    data[i * 4 + 0] = x++;
-    data[i * 4 + 1] = y--;
-    data[i * 4 + 2] = x++;
-    data[i * 4 + 3] = y--;
-  }
-  q_ct1.memcpy(d_data, data, sizeof(data)).wait();
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_radix_sort<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          Sort(d_data, item_ct1, &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-  q_ct1.memcpy(data, d_data, sizeof(data)).wait();
-  syclcompat::wait_and_free(d_data, q_ct1);
-  for (int i = 0; i < 512; ++i)
-    if (data[i] != i) {
-      printf("test_sort failed\n");
-      print_array(data);
-      return false;
-    }
-  printf("test_sort pass\n");
-  return true;
-}
-
-bool test_sort_descending() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int data[512] = {0}, *d_data = nullptr;
-  d_data = sycl::malloc_device<int>(512, q_ct1);
-  for (int i = 0, x = 0, y = 511; i < 128; ++i) {
-    data[i * 4 + 0] = x++;
-    data[i * 4 + 1] = y--;
-    data[i * 4 + 2] = x++;
-    data[i * 4 + 3] = y--;
-  }
-  q_ct1.memcpy(d_data, data, sizeof(data)).wait();
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_radix_sort<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          SortDescending(d_data, item_ct1, &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-  q_ct1.memcpy(data, d_data, sizeof(data)).wait();
-  syclcompat::wait_and_free(d_data, q_ct1);
-  for (int i = 0; i < 512; ++i)
-    if (data[i] != 511 - i) {
-      printf("test_sort_descending failed\n");
-      print_array(data);
-      return false;
-    }
-  printf("test_sort_descending pass\n");
-  return true;
-}
-
-bool test_sort_blocked_to_striped() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int data[512] = {0}, *d_data = nullptr;
-  d_data = sycl::malloc_device<int>(512, q_ct1);
-  for (int i = 0, x = 0, y = 511; i < 128; ++i) {
-    data[i * 4 + 0] = x++;
-    data[i * 4 + 1] = y--;
-    data[i * 4 + 2] = x++;
-    data[i * 4 + 3] = y--;
-  }
-  q_ct1.memcpy(d_data, data, sizeof(data)).wait();
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> load_temp_storage_acc(
-        syclcompat::group::group_load<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-    sycl::local_accessor<uint8_t, 1> store_temp_storage_acc(
-        syclcompat::group::group_store<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_radix_sort<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          SortBlockedToStriped(d_data, item_ct1, &load_temp_storage_acc[0],
-                               &store_temp_storage_acc[0],
-                               &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-  q_ct1.memcpy(data, d_data, sizeof(data)).wait();
-  syclcompat::wait_and_free(d_data, q_ct1);
-  int expected[512];
-  for (int i = 0; i < 128; ++i) {
-    expected[4 * i + 0] = i;
-    expected[4 * i + 1] = i + 1 * 128;
-    expected[4 * i + 2] = i + 2 * 128;
-    expected[4 * i + 3] = i + 3 * 128;
-  }
-  for (int i = 0; i < 512; ++i)
-    if (data[i] != expected[i]) {
-      printf("test_sort_blocked_to_striped failed\n");
-      print_array(data);
-      return false;
-    }
-  printf("test_sort_blocked_to_striped pass\n");
-  return true;
-}
-
-bool test_sort_descending_blocked_to_striped() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int data[512] = {0}, *d_data = nullptr;
-  d_data = sycl::malloc_device<int>(512, q_ct1);
-  for (int i = 0, x = 0, y = 511; i < 128; ++i) {
-    data[i * 4 + 0] = x++;
-    data[i * 4 + 1] = y--;
-    data[i * 4 + 2] = x++;
-    data[i * 4 + 3] = y--;
-  }
-  q_ct1.memcpy(d_data, data, sizeof(data)).wait();
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> load_temp_storage_acc(
-        syclcompat::group::group_load<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-    sycl::local_accessor<uint8_t, 1> store_temp_storage_acc(
-        syclcompat::group::group_store<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_radix_sort<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          SortDescendingBlockedToStriped(
-              d_data, item_ct1, &load_temp_storage_acc[0],
-              &store_temp_storage_acc[0], &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-  q_ct1.memcpy(data, d_data, sizeof(data)).wait();
-  syclcompat::wait_and_free(d_data, q_ct1);
-  int expected[512];
-  for (int i = 0; i < 128; ++i) {
-    expected[4 * i + 0] = 511 - i;
-    expected[4 * i + 1] = 511 - i - 1 * 128;
-    expected[4 * i + 2] = 511 - i - 2 * 128;
-    expected[4 * i + 3] = 511 - i - 3 * 128;
-  }
-  for (int i = 0; i < 512; ++i)
-    if (data[i] != expected[i]) {
-      printf("test_sort_descending_blocked_to_striped failed\n");
-      print_array(data);
-      return false;
-    }
-  printf("test_sort_descending_blocked_to_striped pass\n");
-  return true;
-}
-
-bool test_sort_bit() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int data[512] = {0}, *d_data = nullptr;
-  int expected[512] = {
-      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,
-      15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,
-      30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
-      45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
-      60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
-      75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
-      90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104,
-      105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
-      120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
-      135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
-      150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
-      165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
-      180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
-      195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
-      210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
-      225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
-      240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
-      255, 271, 270, 269, 268, 267, 266, 265, 264, 263, 262, 261, 260, 259, 258,
-      257, 256, 287, 286, 285, 284, 283, 282, 281, 280, 279, 278, 277, 276, 275,
-      274, 273, 272, 303, 302, 301, 300, 299, 298, 297, 296, 295, 294, 293, 292,
-      291, 290, 289, 288, 319, 318, 317, 316, 315, 314, 313, 312, 311, 310, 309,
-      308, 307, 306, 305, 304, 335, 334, 333, 332, 331, 330, 329, 328, 327, 326,
-      325, 324, 323, 322, 321, 320, 351, 350, 349, 348, 347, 346, 345, 344, 343,
-      342, 341, 340, 339, 338, 337, 336, 367, 366, 365, 364, 363, 362, 361, 360,
-      359, 358, 357, 356, 355, 354, 353, 352, 383, 382, 381, 380, 379, 378, 377,
-      376, 375, 374, 373, 372, 371, 370, 369, 368, 399, 398, 397, 396, 395, 394,
-      393, 392, 391, 390, 389, 388, 387, 386, 385, 384, 415, 414, 413, 412, 411,
-      410, 409, 408, 407, 406, 405, 404, 403, 402, 401, 400, 431, 430, 429, 428,
-      427, 426, 425, 424, 423, 422, 421, 420, 419, 418, 417, 416, 447, 446, 445,
-      444, 443, 442, 441, 440, 439, 438, 437, 436, 435, 434, 433, 432, 463, 462,
-      461, 460, 459, 458, 457, 456, 455, 454, 453, 452, 451, 450, 449, 448, 479,
-      478, 477, 476, 475, 474, 473, 472, 471, 470, 469, 468, 467, 466, 465, 464,
-      495, 494, 493, 492, 491, 490, 489, 488, 487, 486, 485, 484, 483, 482, 481,
-      480, 511, 510, 509, 508, 507, 506, 505, 504, 503, 502, 501, 500, 499, 498,
-      497, 496};
-  d_data = sycl::malloc_device<int>(512, q_ct1);
-  for (int i = 0, x = 0, y = 511; i < 128; ++i) {
-    data[i * 4 + 0] = x++;
-    data[i * 4 + 1] = y--;
-    data[i * 4 + 2] = x++;
-    data[i * 4 + 3] = y--;
-  }
-  q_ct1.memcpy(d_data, data, sizeof(data)).wait();
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_radix_sort<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          SortBit(d_data, item_ct1, &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-  q_ct1.memcpy(data, d_data, sizeof(data)).wait();
-  syclcompat::wait_and_free(d_data, q_ct1);
-  for (int i = 0; i < 512; ++i)
-    if (data[i] != expected[i]) {
-      printf("test_sort_bit failed\n");
-      print_array(data);
-      return false;
-    }
-  printf("test_sort_bit pass\n");
-  return true;
-}
-
-bool test_sort_descending_bit() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int data[512] = {0}, *d_data = nullptr;
-  int expected[512] = {
-      511, 510, 509, 508, 507, 506, 505, 504, 503, 502, 501, 500, 499, 498, 497,
-      496, 495, 494, 493, 492, 491, 490, 489, 488, 487, 486, 485, 484, 483, 482,
-      481, 480, 479, 478, 477, 476, 475, 474, 473, 472, 471, 470, 469, 468, 467,
-      466, 465, 464, 463, 462, 461, 460, 459, 458, 457, 456, 455, 454, 453, 452,
-      451, 450, 449, 448, 447, 446, 445, 444, 443, 442, 441, 440, 439, 438, 437,
-      436, 435, 434, 433, 432, 431, 430, 429, 428, 427, 426, 425, 424, 423, 422,
-      421, 420, 419, 418, 417, 416, 415, 414, 413, 412, 411, 410, 409, 408, 407,
-      406, 405, 404, 403, 402, 401, 400, 399, 398, 397, 396, 395, 394, 393, 392,
-      391, 390, 389, 388, 387, 386, 385, 384, 383, 382, 381, 380, 379, 378, 377,
-      376, 375, 374, 373, 372, 371, 370, 369, 368, 367, 366, 365, 364, 363, 362,
-      361, 360, 359, 358, 357, 356, 355, 354, 353, 352, 351, 350, 349, 348, 347,
-      346, 345, 344, 343, 342, 341, 340, 339, 338, 337, 336, 335, 334, 333, 332,
-      331, 330, 329, 328, 327, 326, 325, 324, 323, 322, 321, 320, 319, 318, 317,
-      316, 315, 314, 313, 312, 311, 310, 309, 308, 307, 306, 305, 304, 303, 302,
-      301, 300, 299, 298, 297, 296, 295, 294, 293, 292, 291, 290, 289, 288, 287,
-      286, 285, 284, 283, 282, 281, 280, 279, 278, 277, 276, 275, 274, 273, 272,
-      271, 270, 269, 268, 267, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257,
-      256, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
-      254, 255, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236,
-      237, 238, 239, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
-      220, 221, 222, 223, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202,
-      203, 204, 205, 206, 207, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
-      186, 187, 188, 189, 190, 191, 160, 161, 162, 163, 164, 165, 166, 167, 168,
-      169, 170, 171, 172, 173, 174, 175, 144, 145, 146, 147, 148, 149, 150, 151,
-      152, 153, 154, 155, 156, 157, 158, 159, 128, 129, 130, 131, 132, 133, 134,
-      135, 136, 137, 138, 139, 140, 141, 142, 143, 112, 113, 114, 115, 116, 117,
-      118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 96,  97,  98,  99,  100,
-      101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 80,  81,  82,  83,
-      84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  64,  65,  66,
-      67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  48,  49,
-      50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  32,
-      33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
-      16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,
-      31,  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,
-      14,  15};
-  d_data = sycl::malloc_device<int>(512, q_ct1);
-  for (int i = 0, x = 0, y = 511; i < 128; ++i) {
-    data[i * 4 + 0] = x++;
-    data[i * 4 + 1] = y--;
-    data[i * 4 + 2] = x++;
-    data[i * 4 + 3] = y--;
-  }
-  q_ct1.memcpy(d_data, data, sizeof(data)).wait();
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_radix_sort<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          SortDescendingBit(d_data, item_ct1, &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-  q_ct1.memcpy(data, d_data, sizeof(data)).wait();
-  syclcompat::wait_and_free(d_data, q_ct1);
-  for (int i = 0; i < 512; ++i)
-    if (data[i] != expected[i]) {
-      printf("test_sort_descending_bit failed\n");
-      print_array(data);
-      return false;
-    }
-  printf("test_sort_descending_bit pass\n");
-  return true;
-}
-
-bool test_sort_blocked_to_striped_bit() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int data[512] = {0}, *d_data = nullptr;
-  d_data = sycl::malloc_device<int>(512, q_ct1);
-  for (int i = 0, x = 0, y = 511; i < 128; ++i) {
-    data[i * 4 + 0] = x++;
-    data[i * 4 + 1] = y--;
-    data[i * 4 + 2] = x++;
-    data[i * 4 + 3] = y--;
-  }
-  q_ct1.memcpy(d_data, data, sizeof(data)).wait();
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_radix_sort<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          SortBlockedToStripedBit(d_data, item_ct1, &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-  q_ct1.memcpy(data, d_data, sizeof(data)).wait();
-  syclcompat::wait_and_free(d_data, q_ct1);
-  int expected[512] = {
-      0,   128, 271, 399, 1,   129, 270, 398, 2,   130, 269, 397, 3,   131, 268,
-      396, 4,   132, 267, 395, 5,   133, 266, 394, 6,   134, 265, 393, 7,   135,
-      264, 392, 8,   136, 263, 391, 9,   137, 262, 390, 10,  138, 261, 389, 11,
-      139, 260, 388, 12,  140, 259, 387, 13,  141, 258, 386, 14,  142, 257, 385,
-      15,  143, 256, 384, 16,  144, 287, 415, 17,  145, 286, 414, 18,  146, 285,
-      413, 19,  147, 284, 412, 20,  148, 283, 411, 21,  149, 282, 410, 22,  150,
-      281, 409, 23,  151, 280, 408, 24,  152, 279, 407, 25,  153, 278, 406, 26,
-      154, 277, 405, 27,  155, 276, 404, 28,  156, 275, 403, 29,  157, 274, 402,
-      30,  158, 273, 401, 31,  159, 272, 400, 32,  160, 303, 431, 33,  161, 302,
-      430, 34,  162, 301, 429, 35,  163, 300, 428, 36,  164, 299, 427, 37,  165,
-      298, 426, 38,  166, 297, 425, 39,  167, 296, 424, 40,  168, 295, 423, 41,
-      169, 294, 422, 42,  170, 293, 421, 43,  171, 292, 420, 44,  172, 291, 419,
-      45,  173, 290, 418, 46,  174, 289, 417, 47,  175, 288, 416, 48,  176, 319,
-      447, 49,  177, 318, 446, 50,  178, 317, 445, 51,  179, 316, 444, 52,  180,
-      315, 443, 53,  181, 314, 442, 54,  182, 313, 441, 55,  183, 312, 440, 56,
-      184, 311, 439, 57,  185, 310, 438, 58,  186, 309, 437, 59,  187, 308, 436,
-      60,  188, 307, 435, 61,  189, 306, 434, 62,  190, 305, 433, 63,  191, 304,
-      432, 64,  192, 335, 463, 65,  193, 334, 462, 66,  194, 333, 461, 67,  195,
-      332, 460, 68,  196, 331, 459, 69,  197, 330, 458, 70,  198, 329, 457, 71,
-      199, 328, 456, 72,  200, 327, 455, 73,  201, 326, 454, 74,  202, 325, 453,
-      75,  203, 324, 452, 76,  204, 323, 451, 77,  205, 322, 450, 78,  206, 321,
-      449, 79,  207, 320, 448, 80,  208, 351, 479, 81,  209, 350, 478, 82,  210,
-      349, 477, 83,  211, 348, 476, 84,  212, 347, 475, 85,  213, 346, 474, 86,
-      214, 345, 473, 87,  215, 344, 472, 88,  216, 343, 471, 89,  217, 342, 470,
-      90,  218, 341, 469, 91,  219, 340, 468, 92,  220, 339, 467, 93,  221, 338,
-      466, 94,  222, 337, 465, 95,  223, 336, 464, 96,  224, 367, 495, 97,  225,
-      366, 494, 98,  226, 365, 493, 99,  227, 364, 492, 100, 228, 363, 491, 101,
-      229, 362, 490, 102, 230, 361, 489, 103, 231, 360, 488, 104, 232, 359, 487,
-      105, 233, 358, 486, 106, 234, 357, 485, 107, 235, 356, 484, 108, 236, 355,
-      483, 109, 237, 354, 482, 110, 238, 353, 481, 111, 239, 352, 480, 112, 240,
-      383, 511, 113, 241, 382, 510, 114, 242, 381, 509, 115, 243, 380, 508, 116,
-      244, 379, 507, 117, 245, 378, 506, 118, 246, 377, 505, 119, 247, 376, 504,
-      120, 248, 375, 503, 121, 249, 374, 502, 122, 250, 373, 501, 123, 251, 372,
-      500, 124, 252, 371, 499, 125, 253, 370, 498, 126, 254, 369, 497, 127, 255,
-      368, 496};
-  for (int i = 0; i < 512; ++i)
-    if (data[i] != expected[i]) {
-      printf("test_sort_blocked_to_striped_bit failed\n");
-      print_array(data);
-      return false;
-    }
-  printf("test_sort_blocked_to_striped_bit pass\n");
-  return true;
-}
-
-bool test_sort_descending_blocked_to_striped_bit() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int data[512] = {0}, *d_data = nullptr;
-  d_data = sycl::malloc_device<int>(512, q_ct1);
-  for (int i = 0, x = 0, y = 511; i < 128; ++i) {
-    data[i * 4 + 0] = x++;
-    data[i * 4 + 1] = y--;
-    data[i * 4 + 2] = x++;
-    data[i * 4 + 3] = y--;
-  }
-  q_ct1.memcpy(d_data, data, sizeof(data)).wait();
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_radix_sort<int, 4>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          SortDescendingBlockedToStripedBit(d_data, item_ct1,
-                                            &temp_storage_acc[0]);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-  q_ct1.memcpy(data, d_data, sizeof(data)).wait();
-  syclcompat::wait_and_free(d_data, q_ct1);
-  int expected[512] = {
-      511, 383, 240, 112, 510, 382, 241, 113, 509, 381, 242, 114, 508, 380, 243,
-      115, 507, 379, 244, 116, 506, 378, 245, 117, 505, 377, 246, 118, 504, 376,
-      247, 119, 503, 375, 248, 120, 502, 374, 249, 121, 501, 373, 250, 122, 500,
-      372, 251, 123, 499, 371, 252, 124, 498, 370, 253, 125, 497, 369, 254, 126,
-      496, 368, 255, 127, 495, 367, 224, 96,  494, 366, 225, 97,  493, 365, 226,
-      98,  492, 364, 227, 99,  491, 363, 228, 100, 490, 362, 229, 101, 489, 361,
-      230, 102, 488, 360, 231, 103, 487, 359, 232, 104, 486, 358, 233, 105, 485,
-      357, 234, 106, 484, 356, 235, 107, 483, 355, 236, 108, 482, 354, 237, 109,
-      481, 353, 238, 110, 480, 352, 239, 111, 479, 351, 208, 80,  478, 350, 209,
-      81,  477, 349, 210, 82,  476, 348, 211, 83,  475, 347, 212, 84,  474, 346,
-      213, 85,  473, 345, 214, 86,  472, 344, 215, 87,  471, 343, 216, 88,  470,
-      342, 217, 89,  469, 341, 218, 90,  468, 340, 219, 91,  467, 339, 220, 92,
-      466, 338, 221, 93,  465, 337, 222, 94,  464, 336, 223, 95,  463, 335, 192,
-      64,  462, 334, 193, 65,  461, 333, 194, 66,  460, 332, 195, 67,  459, 331,
-      196, 68,  458, 330, 197, 69,  457, 329, 198, 70,  456, 328, 199, 71,  455,
-      327, 200, 72,  454, 326, 201, 73,  453, 325, 202, 74,  452, 324, 203, 75,
-      451, 323, 204, 76,  450, 322, 205, 77,  449, 321, 206, 78,  448, 320, 207,
-      79,  447, 319, 176, 48,  446, 318, 177, 49,  445, 317, 178, 50,  444, 316,
-      179, 51,  443, 315, 180, 52,  442, 314, 181, 53,  441, 313, 182, 54,  440,
-      312, 183, 55,  439, 311, 184, 56,  438, 310, 185, 57,  437, 309, 186, 58,
-      436, 308, 187, 59,  435, 307, 188, 60,  434, 306, 189, 61,  433, 305, 190,
-      62,  432, 304, 191, 63,  431, 303, 160, 32,  430, 302, 161, 33,  429, 301,
-      162, 34,  428, 300, 163, 35,  427, 299, 164, 36,  426, 298, 165, 37,  425,
-      297, 166, 38,  424, 296, 167, 39,  423, 295, 168, 40,  422, 294, 169, 41,
-      421, 293, 170, 42,  420, 292, 171, 43,  419, 291, 172, 44,  418, 290, 173,
-      45,  417, 289, 174, 46,  416, 288, 175, 47,  415, 287, 144, 16,  414, 286,
-      145, 17,  413, 285, 146, 18,  412, 284, 147, 19,  411, 283, 148, 20,  410,
-      282, 149, 21,  409, 281, 150, 22,  408, 280, 151, 23,  407, 279, 152, 24,
-      406, 278, 153, 25,  405, 277, 154, 26,  404, 276, 155, 27,  403, 275, 156,
-      28,  402, 274, 157, 29,  401, 273, 158, 30,  400, 272, 159, 31,  399, 271,
-      128, 0,   398, 270, 129, 1,   397, 269, 130, 2,   396, 268, 131, 3,   395,
-      267, 132, 4,   394, 266, 133, 5,   393, 265, 134, 6,   392, 264, 135, 7,
-      391, 263, 136, 8,   390, 262, 137, 9,   389, 261, 138, 10,  388, 260, 139,
-      11,  387, 259, 140, 12,  386, 258, 141, 13,  385, 257, 142, 14,  384, 256,
-      143, 15};
-  for (int i = 0; i < 512; ++i)
-    if (data[i] != expected[i]) {
-      printf("test_sort_descending_blocked_to_striped_bit failed\n");
-      print_array(data);
-      return false;
-    }
-  printf("test_sort_descending_blocked_to_striped_bit pass\n");
-  return true;
-}
-
-int main() {
-  return !(test_sort() && test_sort_descending() &&
-           test_sort_blocked_to_striped() &&
-           test_sort_descending_blocked_to_striped() && test_sort_bit() &&
-           test_sort_descending_bit() && test_sort_blocked_to_striped_bit() &&
-           test_sort_descending_blocked_to_striped_bit());
-}
diff --git a/sycl/test-e2e/syclcompat/group_utils/shuffle.cpp b/sycl/test-e2e/syclcompat/group_utils/shuffle.cpp
deleted file mode 100644
index 2fc3121d039cb..0000000000000
--- a/sycl/test-e2e/syclcompat/group_utils/shuffle.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  shuffle.cpp
- *
- *  Description:
- *    Group shuffle API tests
- **************************************************************************/
-
-// ===------- shuffle.cpp -------------------- *- C++ -* ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <iostream>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/group_utils.hpp>
-
-int expect1[128] = {
-    2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  16,
-    17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
-    32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,
-    47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
-    62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
-    77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
-    92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106,
-    107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
-    122, 123, 124, 125, 126, 127, 0,   0};
-
-int expect2[128] = {
-    2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,  16,
-    17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
-    32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,
-    47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
-    62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
-    77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
-    92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104, 105, 106,
-    107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
-    122, 123, 124, 125, 126, 127, 0,   1};
-
-int expect3[513] = {
-    0,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,
-    14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
-    29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
-    44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,
-    59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,
-    74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
-    89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103,
-    104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
-    119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
-    134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
-    149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163,
-    164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178,
-    179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193,
-    194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
-    209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
-    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
-    239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
-    254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
-    269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283,
-    284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
-    299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313,
-    314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328,
-    329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343,
-    344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358,
-    359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373,
-    374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388,
-    389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403,
-    404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418,
-    419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433,
-    434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448,
-    449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463,
-    464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478,
-    479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
-    494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508,
-    509, 510, 511};
-
-int expect4[513] = {
-    1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,
-    16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,
-    31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
-    46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,
-    61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
-    76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
-    91,  92,  93,  94,  95,  96,  97,  98,  99,  100, 101, 102, 103, 104, 105,
-    106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-    121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
-    136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
-    151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165,
-    166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
-    181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
-    196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210,
-    211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225,
-    226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240,
-    241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
-    256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270,
-    271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
-    286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300,
-    301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315,
-    316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330,
-    331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345,
-    346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360,
-    361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375,
-    376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390,
-    391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405,
-    406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420,
-    421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435,
-    436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450,
-    451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465,
-    466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
-    481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495,
-    496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510,
-    511, 510, 0};
-
-void BlockShuffleKernel1(int *input, int *output,
-                         const sycl::nd_item<3> &item_ct1,
-                         uint8_t *temp_storage) {
-
-  typedef syclcompat::group::group_shuffle<int, 128> BS;
-
-  BS(temp_storage)
-      .select(item_ct1, input[item_ct1.get_local_id(2)],
-              output[item_ct1.get_local_id(2)], 2);
-}
-
-void BlockShuffleKernel2(int *input, int *output,
-                         const sycl::nd_item<3> &item_ct1,
-                         uint8_t *temp_storage) {
-  typedef syclcompat::group::group_shuffle<int, 128> BS;
-
-  BS(temp_storage)
-      .select2(item_ct1, input[item_ct1.get_local_id(2)],
-               output[item_ct1.get_local_id(2)], 2);
-}
-
-void BlockShuffleKernel3(int *input, int *output, int *extra,
-                         const sycl::nd_item<3> &item_ct1,
-                         uint8_t *temp_storage) {
-  typedef syclcompat::group::group_shuffle<int, 128> BS;
-
-  BS(temp_storage)
-      .shuffle_right(
-          item_ct1,
-          *reinterpret_cast<int(*)[4]>(input + item_ct1.get_local_id(2) * 4),
-          *reinterpret_cast<int(*)[4]>(output + item_ct1.get_local_id(2) * 4),
-          *extra);
-}
-
-void BlockShuffleKernel4(int *input, int *output, int *extra,
-                         const sycl::nd_item<3> &item_ct1,
-                         uint8_t *temp_storage) {
-  typedef syclcompat::group::group_shuffle<int, 128> BS;
-
-  BS(temp_storage)
-      .shuffle_left(
-          item_ct1,
-          *reinterpret_cast<int(*)[4]>(input + item_ct1.get_local_id(2) * 4),
-          *reinterpret_cast<int(*)[4]>(output + item_ct1.get_local_id(2) * 4),
-          *extra);
-}
-
-int main() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  int *input1, *output1;
-  int *input4, *output4;
-  int *extra;
-  int host_input1[128];
-  int host_output1[128];
-  int host_input4[128 * 4];
-  int host_output4[128 * 4];
-  int host_extra = 0;
-  input1 = sycl::malloc_device<int>(128, q_ct1);
-  output1 = sycl::malloc_device<int>(128, q_ct1);
-  input4 = (int *)sycl::malloc_device(sizeof(int) * 128 * 4, q_ct1);
-  output4 = (int *)sycl::malloc_device(sizeof(int) * 128 * 4, q_ct1);
-  extra = sycl::malloc_device<int>(1, q_ct1);
-
-  for (int i = 0; i < 128; i++) {
-    host_input1[i] = i;
-    host_output1[i] = 0;
-  }
-
-  for (int i = 0; i < 128 * 4; i++) {
-    host_input4[i] = i;
-    host_output4[i] = 0;
-  }
-
-  q_ct1.memcpy(input1, host_input1, sizeof(int) * 128);
-  q_ct1.memcpy(input4, host_input4, sizeof(int) * 128 * 4);
-  q_ct1.memcpy(output1, host_output1, sizeof(int) * 128);
-  q_ct1.memcpy(output4, host_output4, sizeof(int) * 128 * 4);
-  q_ct1.memcpy(extra, &host_extra, sizeof(int));
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_shuffle<int, 128>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          BlockShuffleKernel1(input1, output1, item_ct1, &temp_storage_acc[0]);
-        });
-  });
-
-  q_ct1.memcpy(host_output1, output1, sizeof(int) * 128).wait();
-  dev_ct1.queues_wait_and_throw();
-  for (int i = 0; i < 128; i++) {
-    if (host_output1[i] != expect1[i]) {
-      std::cout << "test 1 failed" << std::endl;
-      exit(-1);
-    }
-  }
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_shuffle<int, 128>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          BlockShuffleKernel2(input1, output1, item_ct1, &temp_storage_acc[0]);
-        });
-  });
-
-  q_ct1.memcpy(host_output1, output1, sizeof(int) * 128).wait();
-  dev_ct1.queues_wait_and_throw();
-  for (int i = 0; i < 128; i++) {
-    if (host_output1[i] != expect2[i]) {
-      std::cout << "test 2 failed" << std::endl;
-      exit(-1);
-    }
-  }
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_shuffle<int, 128>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          BlockShuffleKernel3(input4, output4, extra, item_ct1,
-                              &temp_storage_acc[0]);
-        });
-  });
-
-  q_ct1.memcpy(host_output4, output4, sizeof(int) * 128 * 4);
-  q_ct1.memcpy(&host_extra, extra, sizeof(int)).wait();
-  dev_ct1.queues_wait_and_throw();
-
-  for (int i = 0; i < 128 * 4; i++) {
-    if (host_output4[i] != expect3[i]) {
-      std::cout << "test 3 failed" << std::endl;
-      exit(-1);
-    }
-  }
-  if (host_extra != expect3[512]) {
-    std::cout << "test 3 failed" << std::endl;
-    exit(-1);
-  }
-
-  q_ct1.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> temp_storage_acc(
-        syclcompat::group::group_shuffle<int, 128>::get_local_memory_size(
-            sycl::range<3>(1, 1, 128).size()),
-        cgh);
-
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
-        [=](sycl::nd_item<3> item_ct1) {
-          BlockShuffleKernel4(input4, output4, extra, item_ct1,
-                              &temp_storage_acc[0]);
-        });
-  });
-
-  q_ct1.memcpy(host_output4, output4, sizeof(int) * 128 * 4);
-  q_ct1.memcpy(&host_extra, extra, sizeof(int)).wait();
-  dev_ct1.queues_wait_and_throw();
-  for (int i = 0; i < 128 * 4; i++) {
-    if (host_output4[i] != expect4[i]) {
-      std::cout << "test 4 failed" << std::endl;
-      exit(-1);
-    }
-  }
-  if (host_extra != expect4[512]) {
-    std::cout << "test 4 failed" << std::endl;
-    exit(-1);
-  }
-  std::cout << "test pass" << std::endl;
-  return 0;
-};
diff --git a/sycl/test-e2e/syclcompat/helloworld.cpp b/sycl/test-e2e/syclcompat/helloworld.cpp
deleted file mode 100644
index 3e32f8a965eb6..0000000000000
--- a/sycl/test-e2e/syclcompat/helloworld.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  helloworld.cpp
- *
- *  Description:
- *    Checks that the SYCLcompat example program compiles and runs
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-
-// The example uses specific headers but the user can
-// simple include <syclcompat/syclcompat.hpp> to get all the
-// functionality with a single header
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/id_query.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-#include <cstdlib>
-#include <iostream>
-
-#define CHECK_MEMORY(ptr)                                                      \
-  if ((ptr) == nullptr) {                                                      \
-    std::cerr << "Failed to allocate memory: " << (#ptr) << "\n";              \
-    exit(EXIT_FAILURE);                                                        \
-  }
-
-/**
- * Slope intercept form of a straight line equation: Y = m * X + b
- */
-template <int BLOCK_SIZE>
-void slope_intercept(float *Y, float *X, float m, float b, size_t n) {
-
-  // Block index
-  size_t bx = syclcompat::work_group_id::x();
-  // Thread index
-  size_t tx = syclcompat::local_id::x();
-
-  size_t i = bx * BLOCK_SIZE + tx;
-  // or  i = syclcompat::global_id::x();
-  if (i < n)
-    Y[i] = m * X[i] + b;
-}
-
-/**
- * Program main
- */
-int main(int argc, char **argv) {
-  std::cout << "Simple Kernel example" << "\n";
-
-  constexpr size_t n_points = 32;
-  constexpr float m = 1.5f;
-  constexpr float b = 0.5f;
-
-  int block_size = 32;
-  if (block_size > syclcompat::get_current_device()
-                       .get_info<sycl::info::device::max_work_group_size>()) {
-    block_size = 16;
-  }
-
-  std::cout << "block_size = " << block_size << ", n_points = " << n_points
-            << "\n";
-
-  // Allocate host memory for vectors X and Y
-  size_t mem_size = n_points * sizeof(float);
-  float *h_X = (float *)syclcompat::malloc_host(mem_size);
-  float *h_Y = (float *)syclcompat::malloc_host(mem_size);
-  CHECK_MEMORY(h_X);
-  CHECK_MEMORY(h_Y);
-
-  // Alternative templated allocation for the expected output
-  float *h_expected = syclcompat::malloc_host<float>(n_points);
-  CHECK_MEMORY(h_expected);
-
-  // Initialize host memory & expected output
-  for (size_t i = 0; i < n_points; i++) {
-    h_X[i] = i + 1;
-    h_expected[i] = m * h_X[i] + b;
-  }
-
-  // Allocate device memory
-  float *d_X = (float *)syclcompat::malloc(mem_size);
-  float *d_Y = (float *)syclcompat::malloc(mem_size);
-  CHECK_MEMORY(d_X);
-  CHECK_MEMORY(d_Y);
-
-  // copy host memory to device
-  syclcompat::memcpy(d_X, h_X, mem_size);
-
-  size_t threads = block_size;
-  size_t grid = n_points / block_size;
-
-  std::cout << "Computing result using SYCL Kernel... ";
-  if (block_size == 16) {
-    syclcompat::launch<slope_intercept<16>>(grid, threads, d_Y, d_X, m, b,
-                                            n_points);
-  } else {
-    syclcompat::launch<slope_intercept<32>>(grid, threads, d_Y, d_X, m, b,
-                                            n_points);
-  }
-  syclcompat::wait();
-  std::cout << "DONE" << "\n";
-
-  // Async copy result from device to host
-  syclcompat::memcpy_async(h_Y, d_Y, mem_size).wait();
-
-  // Check output
-  for (size_t i = 0; i < n_points; i++) {
-    if (std::abs(h_Y[i] - h_expected[i]) >= 1e-6) {
-      std::cerr << "Mismatch at index " << i << ": expected " << h_expected[i]
-                << ", but got " << h_Y[i] << "\n";
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  // Clean up memory
-  syclcompat::free(h_X);
-  syclcompat::free(h_Y);
-  syclcompat::free(h_expected);
-  syclcompat::free(d_X);
-  syclcompat::free(d_Y);
-
-  return EXIT_SUCCESS;
-}
diff --git a/sycl/test-e2e/syclcompat/id_query/id_query.cpp b/sycl/test-e2e/syclcompat/id_query/id_query.cpp
deleted file mode 100644
index d5d74e3e48ce8..0000000000000
--- a/sycl/test-e2e/syclcompat/id_query/id_query.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  id_query.cpp
- *
- *  Description:
- *    global_id query tests
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <algorithm>
-#include <numeric>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/launch.hpp>
-
-#include "id_query_fixt.hpp"
-
-void global_id_x_query(int *data) {
-  data[syclcompat::global_id::x()] = syclcompat::global_id::x();
-}
-void global_id_y_query(int *data) {
-  data[syclcompat::global_id::y()] = syclcompat::global_id::y();
-}
-void global_id_z_query(int *data) {
-  data[syclcompat::global_id::z()] = syclcompat::global_id::z();
-}
-
-void test_global_id_query() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  auto checker = [](std::vector<int> input) {
-    std::vector<int> expected(input.size());
-    std::iota(expected.begin(), expected.end(), 0);
-    assert(std::equal(expected.begin(), expected.end(), input.begin()));
-  };
-  // Check we can query x, y, z components of global_id
-  QueryLauncher<global_id_x_query>({4, 1, 1}, {32, 1, 1}).launch_dim3(checker);
-  QueryLauncher<global_id_y_query>({1, 4, 1}, {1, 32, 1}).launch_dim3(checker);
-  QueryLauncher<global_id_z_query>({1, 1, 4}, {1, 1, 32}).launch_dim3(checker);
-
-  // Check that we can query x component, irrespective of the kernel dimension
-  QueryLauncher<global_id_x_query>({4, 1, 1}, {32, 1, 1})
-      .launch_ndrange<3>(checker);
-  QueryLauncher<global_id_x_query>({4, 1, 1}, {32, 1, 1})
-      .launch_ndrange<2>(checker);
-  QueryLauncher<global_id_x_query>({4, 1, 1}, {32, 1, 1})
-      .launch_ndrange<1>(checker);
-
-  // Check we can query y component for 2D kernel
-  QueryLauncher<global_id_y_query>({1, 4, 1}, {1, 32, 1})
-      .launch_ndrange<2>(checker);
-}
-
-void range_x_query(int *data) {
-  data[syclcompat::global_id::x()] = syclcompat::global_range::x() *
-                                     syclcompat::work_group_range::x() *
-                                     syclcompat::local_range::x();
-}
-
-void test_ranges_query() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  // global_range::x() * work_group_range::x() * local_range::x();
-  int target = grid.x * threads.x * grid.x * threads.x;
-
-  auto checker = [&](std::vector<int> input) {
-    assert(std::all_of(input.begin(), input.end(),
-                       [=](int a) { return a == target; }));
-  };
-  QueryLauncher<range_x_query>(grid, threads).launch_dim3(checker);
-}
-
-void wgroup_id_x_query(int *data) {
-  data[syclcompat::global_id::x()] = syclcompat::work_group_id::x();
-}
-void local_id_x_query(int *data) {
-  data[syclcompat::global_id::x()] = syclcompat::local_id::x();
-}
-
-void test_ids_query() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{4};
-  constexpr syclcompat::dim3 threads{32};
-
-  auto wgroup_checker = [&](std::vector<int> input) {
-    for (int i = 0; i < input.size(); ++i) {
-      assert(input[i] == i / threads.x);
-    }
-  };
-  QueryLauncher<wgroup_id_x_query>(grid, threads).launch_dim3(wgroup_checker);
-
-  auto local_checker = [&](std::vector<int> input) {
-    for (int i = 0; i < input.size(); ++i) {
-      assert(input[i] == i % threads.x);
-    }
-  };
-  QueryLauncher<local_id_x_query>(grid, threads).launch_dim3(local_checker);
-}
-
-int main() {
-  test_global_id_query();
-  test_ranges_query();
-  test_ids_query();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/id_query/id_query_fixt.hpp b/sycl/test-e2e/syclcompat/id_query/id_query_fixt.hpp
deleted file mode 100644
index 18e51d4913cb7..0000000000000
--- a/sycl/test-e2e/syclcompat/id_query/id_query_fixt.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  id_query_fixt.hpp
- *
- *  Description:
- *     Fixtures and helpers for to tests the id_query functionality
- **************************************************************************/
-
-#pragma once
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/id_query.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-// Class to launch a kernel and run a lambda on output data
-template <auto F> class QueryLauncher {
-protected:
-  syclcompat::dim3 grid_;
-  syclcompat::dim3 threads_;
-  size_t size_;
-  int *data_;
-  std::vector<int> host_data_;
-  using CheckLambda = std::function<void(std::vector<int>)>;
-
-public:
-  QueryLauncher(syclcompat::dim3 grid, syclcompat::dim3 threads)
-      : grid_{grid}, threads_{threads}, size_{grid_.size() * threads_.size()},
-        host_data_(size_) {
-    data_ = (int *)syclcompat::malloc(size_ * sizeof(int));
-    syclcompat::memset(data_, 0, size_ * sizeof(int));
-  };
-  ~QueryLauncher() { syclcompat::free(data_); }
-  template <typename... Args>
-  void launch_dim3(CheckLambda checker, Args... args) {
-    syclcompat::launch<F>(grid_, threads_, data_, args...);
-    syclcompat::memcpy(host_data_.data(), data_, size_ * sizeof(int));
-    syclcompat::wait();
-    checker(host_data_);
-  }
-  template <int Dim, typename... Args>
-  void launch_ndrange(CheckLambda checker, Args... args) {
-    sycl::nd_range<Dim> range = {grid_ * threads_, grid_};
-    syclcompat::launch<F>(range, data_, args...);
-    syclcompat::memcpy(host_data_.data(), data_, size_ * sizeof(int));
-    syclcompat::wait();
-    checker(host_data_);
-  }
-};
diff --git a/sycl/test-e2e/syclcompat/kernel/Inputs/kernel_function.cpp b/sycl/test-e2e/syclcompat/kernel/Inputs/kernel_function.cpp
deleted file mode 100644
index 5674ccf651a16..0000000000000
--- a/sycl/test-e2e/syclcompat/kernel/Inputs/kernel_function.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  kernel_function.cpp
- *
- *  Description:
- *    kernel_function header API tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ kernel_function_lin.cpp---------- -*- C++ -* ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-#ifdef _WIN32
-#define NOMINMAX
-#include <windows.h>
-#undef NOMINMAX
-#else
-#include <dlfcn.h>
-#endif
-
-#include <iostream>
-#include <string>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/defs.hpp>
-#include <syclcompat/device.hpp>
-#include <syclcompat/kernel.hpp>
-#include <syclcompat/memory.hpp>
-
-template <class T> void testTemplateKernel(T *data) {}
-
-void testKernel(void *data) {}
-
-template <class T> int getTemplateFuncAttrs() {
-  syclcompat::kernel_function_info attrs;
-  syclcompat::get_kernel_function_info(&attrs,
-                                       (const void *)testTemplateKernel<T>);
-  int threadPerBlock = attrs.max_work_group_size;
-  return threadPerBlock;
-}
-
-int getFuncAttrs() {
-  syclcompat::kernel_function_info attrs;
-  syclcompat::get_kernel_function_info(&attrs, (const void *)testKernel);
-  int threadPerBlock = attrs.max_work_group_size;
-  return threadPerBlock;
-}
-
-void test_get_func_attrs() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-
-  int size = dev_ct1.get_info<sycl::info::device::max_work_group_size>();
-  assert(getTemplateFuncAttrs<int>() == size);
-  assert(getFuncAttrs() == size);
-}
-
-void call_library_func(syclcompat::kernel_library kernel_lib) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-
-  std::string FunctionName = "foo";
-  syclcompat::kernel_function func;
-  SYCLCOMPAT_CHECK_ERROR(
-      func = syclcompat::get_kernel_function(kernel_lib, FunctionName.c_str()));
-
-  if (func == nullptr) {
-    std::cout << "Could not load function pointer" << std::endl << std::flush;
-    syclcompat::unload_kernel_library(kernel_lib);
-    assert(false); // FAIL
-  }
-
-  int sharedSize = 10;
-  void **param = nullptr, **extra = nullptr;
-
-  constexpr size_t NUM_ELEMENTS = 16;
-  int *dev = syclcompat::malloc<int>(NUM_ELEMENTS);
-  syclcompat::fill<int>(dev, 0, NUM_ELEMENTS);
-
-  param = (void **)(&dev);
-  SYCLCOMPAT_CHECK_ERROR(syclcompat::invoke_kernel_function(
-      func, q_ct1, sycl::range<3>(1, 1, 2), sycl::range<3>(1, 1, 8), sharedSize,
-      param, extra));
-  syclcompat::wait_and_throw();
-
-  int *host_mem = syclcompat::malloc_host<int>(NUM_ELEMENTS);
-  syclcompat::memcpy<int>(host_mem, dev, NUM_ELEMENTS);
-  for (int i = 0; i < NUM_ELEMENTS; i++) {
-    assert(host_mem[i] == i);
-  }
-
-  SYCLCOMPAT_CHECK_ERROR(syclcompat::unload_kernel_library(kernel_lib));
-
-  syclcompat::free(dev);
-  syclcompat::free(host_mem);
-}
-
-void test_kernel_functor_ptr() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::kernel_library kernel_lib;
-  SYCLCOMPAT_CHECK_ERROR(kernel_lib =
-                             syclcompat::load_kernel_library(TEST_SHARED_LIB));
-
-  if (kernel_lib == nullptr) {
-    std::cout << "Could not load the library" << std::endl;
-    std::cout << "  " << TEST_SHARED_LIB << std::endl << std::flush;
-    assert(false); // FAIL
-  }
-
-  call_library_func(kernel_lib);
-}
-
-void test_kernel_functor_ptr_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-
-  std::ifstream ifs;
-  ifs.open(TEST_SHARED_LIB, std::ios::in | std::ios::binary);
-
-  std::stringstream buffer;
-  buffer << ifs.rdbuf();
-
-  syclcompat::kernel_library kernel_lib;
-  SYCLCOMPAT_CHECK_ERROR(
-      kernel_lib = syclcompat::load_kernel_library_mem(buffer.str().c_str()));
-
-  if (kernel_lib == nullptr) {
-    std::cout << "Could not load the library" << std::endl;
-    std::cout << "  " << TEST_SHARED_LIB << std::endl << std::flush;
-    assert(false);
-  }
-
-  call_library_func(kernel_lib);
-}
-
-int main() {
-  test_get_func_attrs();
-  test_kernel_functor_ptr();
-  test_kernel_functor_ptr_memory();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/kernel/Inputs/kernel_module.cpp b/sycl/test-e2e/syclcompat/kernel/Inputs/kernel_module.cpp
deleted file mode 100644
index 10ecafe62b745..0000000000000
--- a/sycl/test-e2e/syclcompat/kernel/Inputs/kernel_module.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  kernel_module.cpp
- *
- *  Description:
- *    function implementation used in kernel_function header API tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ kernel_module_lin.cpp------------------------ -*- C++ -* ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/defs.hpp>
-
-void foo(int *k, sycl::nd_item<3> item_ct1, uint8_t *local_mem) {
-  k[item_ct1.get_global_linear_id()] = item_ct1.get_global_linear_id();
-}
-
-extern "C" {
-SYCLCOMPAT_EXPORT void foo_wrapper(sycl::queue &queue,
-                                   const sycl::nd_range<3> &nr,
-                                   unsigned int local_mem_size,
-                                   void **kernel_params, void **extra) {
-  int *k;
-  k = (int *)kernel_params[0];
-  queue.submit([&](sycl::handler &cgh) {
-    sycl::local_accessor<uint8_t, 1> local_acc_ct1(
-        sycl::range<1>(local_mem_size), cgh);
-    cgh.parallel_for(nr, [=](sycl::nd_item<3> item_ct1) {
-      foo(k, item_ct1,
-          local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>().get());
-    });
-  });
-}
-}
diff --git a/sycl/test-e2e/syclcompat/kernel/kernel_lin.cpp b/sycl/test-e2e/syclcompat/kernel/kernel_lin.cpp
deleted file mode 100644
index d93a7880d404e..0000000000000
--- a/sycl/test-e2e/syclcompat/kernel/kernel_lin.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// REQUIRES: linux
-// TODO: Supported for ROCM 5. Further development required to support AMDGPU.
-// UNSUPPORTED: hip
-
-// RUN: %clangxx -fPIC -shared -fsycl %{sycl_target_opts} %S/Inputs/kernel_module.cpp -o %t.so
-// RUN: %clangxx -DTEST_SHARED_LIB='"%t.so"' -ldl -fsycl %{sycl_target_opts} %S/Inputs/kernel_function.cpp -o %t.out
-// RUN: %{run} %t.out
diff --git a/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp b/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp
deleted file mode 100644
index 51843096a721a..0000000000000
--- a/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// REQUIRES: windows
-
-// Currently disabled due to flaky failures caused by Windows runtime not
-// unregistering the binaries when runtime-loaded .dll files with SYCL binaries
-// are unloaded.
-// UNSUPPORTED: windows
-// UNSUPPORTED-TRACKER: CMPLRLLVM-68687
-
-// DEFINE: %{sharedflag} = %if cl_options %{/clang:-shared%} %else %{-shared%}
-
-// This test is sensitive to the absolute path of the dll file produced, so we
-// run the test completely on the run system to avoid issues.
-
-// RUN: %{run-aux} %clangxx %{sharedflag} -fsycl %{sycl_target_opts} %S\Inputs\kernel_module.cpp -o %t.dll
-// RUN: %{run-aux} %clangxx -DTEST_SHARED_LIB='"%/t.dll"' -fsycl %{sycl_target_opts} %S\Inputs\kernel_function.cpp -o %t.out
-// RUN: %{run} %t.out
diff --git a/sycl/test-e2e/syclcompat/launch/launch.cpp b/sycl/test-e2e/syclcompat/launch/launch.cpp
deleted file mode 100644
index b2f0a2def40ec..0000000000000
--- a/sycl/test-e2e/syclcompat/launch/launch.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  launch.cpp
- *
- *  Description:
- *     launch<F> and launch<F> with dinamyc local memory tests
- **************************************************************************/
-// UNSUPPORTED: gpu-intel-dg2
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/14387
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <sycl/group_barrier.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/id_query.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-#include "launch_fixt.hpp"
-
-// Dummy kernel functions for testing
-inline void empty_kernel(){};
-inline void int_kernel(int a){};
-inline void int_ptr_kernel(int *a){};
-
-template <int Dim>
-void compute_nd_range_3d(RangeParams<Dim> range_param, std::string test_name) {
-  std::cout << __PRETTY_FUNCTION__ << " " << test_name << std::endl;
-
-  try {
-    auto g_out = syclcompat::compute_nd_range(range_param.global_range_in_,
-                                              range_param.local_range_in_);
-    sycl::nd_range<Dim> x_out = {range_param.expect_global_range_out_,
-                                 range_param.local_range_in_};
-    if (range_param.shouldPass_) {
-      assert(g_out == x_out);
-    } else {
-      assert(false); // Trigger failure, expected std::invalid_argument
-    }
-  } catch (std::invalid_argument const &err) {
-    if (range_param.shouldPass_) {
-      assert(false); // Trigger failure, unexpected std::invalid_argument
-    }
-  }
-}
-
-void test_launch_compute_nd_range_3d() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  compute_nd_range_3d(RangeParams<3>{{11, 1, 1}, {2, 1, 1}, {12, 1, 1}, true},
-                      "Round up");
-  compute_nd_range_3d(
-      RangeParams<3>{{320, 1, 1}, {32, 1, 1}, {320, 1, 1}, true}, "Even size");
-  compute_nd_range_3d(
-      RangeParams<3>{{32, 193, 1}, {16, 32, 1}, {32, 224, 1}, true},
-      "Round up 2");
-  compute_nd_range_3d(RangeParams<3>{{10, 0, 0}, {1, 0, 0}, {10, 0, 0}, false},
-                      "zero size");
-  compute_nd_range_3d(
-      RangeParams<3>{{0, 10, 10}, {0, 10, 10}, {0, 10, 10}, false},
-      "zero size 2");
-  compute_nd_range_3d(RangeParams<3>{{2, 1, 1}, {32, 1, 1}, {32, 1, 1}, false},
-                      "local > global");
-  compute_nd_range_3d(RangeParams<3>{{1, 2, 1}, {1, 32, 1}, {1, 32, 1}, false},
-                      "local > global 2");
-  compute_nd_range_3d(RangeParams<3>{{1, 1, 2}, {1, 1, 32}, {1, 1, 32}, false},
-                      "local > global 3");
-}
-
-void test_no_arg_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  LaunchTest lt;
-
-  syclcompat::launch<empty_kernel>(lt.range_1_);
-  syclcompat::launch<empty_kernel>(lt.range_2_);
-  syclcompat::launch<empty_kernel>(lt.range_3_);
-  syclcompat::launch<empty_kernel>(lt.grid_, lt.thread_);
-
-  syclcompat::launch<empty_kernel>(lt.range_1_, lt.q_);
-  syclcompat::launch<empty_kernel>(lt.range_2_, lt.q_);
-  syclcompat::launch<empty_kernel>(lt.range_3_, lt.q_);
-  syclcompat::launch<empty_kernel>(lt.grid_, lt.thread_, lt.q_);
-}
-
-void test_one_arg_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  LaunchTest lt;
-
-  int my_int;
-
-  syclcompat::launch<int_kernel>(lt.range_1_, my_int);
-  syclcompat::launch<int_kernel>(lt.range_2_, my_int);
-  syclcompat::launch<int_kernel>(lt.range_3_, my_int);
-  syclcompat::launch<int_kernel>(lt.grid_, lt.thread_, my_int);
-
-  syclcompat::launch<int_kernel>(lt.range_1_, lt.q_, my_int);
-  syclcompat::launch<int_kernel>(lt.range_2_, lt.q_, my_int);
-  syclcompat::launch<int_kernel>(lt.range_3_, lt.q_, my_int);
-  syclcompat::launch<int_kernel>(lt.grid_, lt.thread_, lt.q_, my_int);
-}
-
-void test_ptr_arg_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  LaunchTest lt;
-
-  int *int_ptr = nullptr;
-
-  syclcompat::launch<int_ptr_kernel>(lt.range_1_, int_ptr);
-  syclcompat::launch<int_ptr_kernel>(lt.range_2_, int_ptr);
-  syclcompat::launch<int_ptr_kernel>(lt.range_3_, int_ptr);
-  syclcompat::launch<int_ptr_kernel>(lt.grid_, lt.thread_, int_ptr);
-
-  syclcompat::launch<int_ptr_kernel>(lt.range_1_, lt.q_, int_ptr);
-  syclcompat::launch<int_ptr_kernel>(lt.range_2_, lt.q_, int_ptr);
-  syclcompat::launch<int_ptr_kernel>(lt.range_3_, lt.q_, int_ptr);
-  syclcompat::launch<int_ptr_kernel>(lt.grid_, lt.thread_, lt.q_, int_ptr);
-}
-
-int main() {
-  test_launch_compute_nd_range_3d();
-  test_no_arg_launch();
-  test_one_arg_launch();
-  test_ptr_arg_launch();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/launch/launch_fixt.hpp b/sycl/test-e2e/syclcompat/launch/launch_fixt.hpp
deleted file mode 100644
index a98f86971b31c..0000000000000
--- a/sycl/test-e2e/syclcompat/launch/launch_fixt.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  launch_fixt.hpp
- *
- *  Description:
- *     Fixtures and helpers for to tests the launch functionality
- **************************************************************************/
-
-#pragma once
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/dims.hpp>
-
-// Struct containing test case data (local & global ranges)
-template <int Dim> struct RangeParams {
-  RangeParams(sycl::range<Dim> global_range_in, sycl::range<Dim> local_range,
-              sycl::range<Dim> expect_global_range_out, bool pass)
-      : global_range_in_{global_range_in}, local_range_in_{local_range},
-        expect_global_range_out_{expect_global_range_out}, shouldPass_{pass} {}
-
-  sycl::range<Dim> local_range_in_;
-  sycl::range<Dim> global_range_in_;
-  sycl::range<Dim> expect_global_range_out_;
-  bool shouldPass_;
-
-  // Pretty printing of RangeParams
-  friend std::ostream &operator<<(std::ostream &os, const RangeParams &range) {
-    auto print_range = [](std::ostream &os, const sycl::range<Dim> range) {
-      os << " {";
-      for (int i = 0; i < Dim; ++i) {
-        os << range[i];
-        os << ((Dim - i == 1) ? "} " : ", ");
-      }
-    };
-    os << "Local:";
-    print_range(os, range.local_range_in_);
-    os << "Global (in): ";
-    print_range(os, range.global_range_in_);
-    os << "Global (out): ";
-    print_range(os, range.expect_global_range_out_);
-    os << (range.shouldPass_ ? "Should Work" : "Should Throw");
-    return os;
-  }
-};
-
-// Fixture for launch tests - initializes a few different
-// range-like members & a queue.
-struct LaunchTest {
-  LaunchTest()
-      : q_{syclcompat::get_default_queue()}, grid_{4, 2, 2}, thread_{32, 2, 2},
-        range_1_{128, 32}, range_2_{{4, 128}, {2, 32}},
-        range_3_{{2, 4, 64}, {2, 2, 32}} {}
-  sycl::queue const q_;
-  syclcompat::dim3 const grid_;
-  syclcompat::dim3 const thread_;
-  sycl::nd_range<1> const range_1_;
-  sycl::nd_range<2> const range_2_;
-  sycl::nd_range<3> const range_3_;
-};
-
-// Typed tests
-template <typename T> struct LaunchTestWithArgs : public LaunchTest {
-  LaunchTestWithArgs()
-      : LaunchTest(), memsize_{LOCAL_MEM_SIZE},
-        in_order_q_{{sycl::property::queue::in_order()}}, skip_{false} {
-    should_skip();
-  }
-
-  void should_skip() {
-    if (!syclcompat::get_current_device().has(sycl::aspect::fp64) &&
-        std::is_same_v<T, double>) {
-      std::cout << "  sycl::aspect::fp64 not supported by the SYCL device."
-                << std::endl;
-      skip_ = true;
-    }
-    if (!syclcompat::get_current_device().has(sycl::aspect::fp16) &&
-        std::is_same_v<T, sycl::half>) {
-
-      std::cout << "  sycl::aspect::fp16 not supported by the SYCL device."
-                << std::endl;
-      skip_ = true;
-    }
-  }
-
-  constexpr static size_t LOCAL_MEM_SIZE = 64;
-
-  size_t const memsize_;
-  sycl::queue const in_order_q_;
-  bool skip_;
-};
-
-using memsize_type_list =
-    std::tuple<int, unsigned int, short, unsigned short, long, unsigned long>;
diff --git a/sycl/test-e2e/syclcompat/launch/launch_policy.cpp b/sycl/test-e2e/syclcompat/launch/launch_policy.cpp
deleted file mode 100644
index 5879b6c9466dc..0000000000000
--- a/sycl/test-e2e/syclcompat/launch/launch_policy.cpp
+++ /dev/null
@@ -1,363 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  launch_config.cpp
- *
- *  Description:
- *     launch<F> with config tests
- **************************************************************************/
-// REQUIRES: sg-32
-
-// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: target-native_cpu
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20142
-
-#include <sycl/ext/intel/experimental/kernel_execution_properties.hpp>
-#include <sycl/ext/oneapi/kernel_properties/properties.hpp>
-#include <syclcompat/device.hpp>
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/properties/properties.hpp>
-#include <sycl/group_barrier.hpp>
-
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "launch_fixt.hpp"
-
-namespace compat_exp = syclcompat::experimental;
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-namespace sycl_intel_exp = sycl::ext::intel::experimental;
-
-// Dummy kernel functions for testing
-// =======================================================================
-
-static constexpr int LOCAL_MEM_SIZE = 1024;
-
-using sycl::ext::oneapi::experimental::empty_properties_t;
-
-inline void empty_kernel(){};
-inline void int_kernel(int a){};
-inline void int_ptr_kernel(int *a){};
-
-inline void dynamic_local_mem_empty_kernel(char *a){};
-
-template <typename T>
-inline void dynamic_local_mem_basicdt_kernel(T value, char *local_mem){};
-
-template <typename T> void write_mem_kernel(T *data, int num_elements) {
-  const int id =
-      sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
-  if (id < num_elements) {
-    data[id] = static_cast<T>(id);
-  }
-};
-
-template <typename T>
-void dynamic_local_mem_typed_kernel(T *data, char *local_mem) {
-  constexpr size_t num_elements = LOCAL_MEM_SIZE / sizeof(T);
-  T *typed_local_mem = reinterpret_cast<T *>(local_mem);
-
-  const int id =
-      sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
-  if (id < num_elements) {
-    typed_local_mem[id] = static_cast<T>(id);
-  }
-  sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>());
-  if (id < num_elements) {
-    data[id] = typed_local_mem[num_elements - id - 1];
-  }
-};
-// =======================================================================
-
-int test_variadic_config_ctor() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  // nd_range and kernel_properties
-  {
-    compat_exp::launch_policy my_config(
-        sycl::nd_range<1>{{32}, {32}},
-        compat_exp::kernel_properties{sycl_exp::sub_group_size<32>});
-    static_assert(
-        std::is_same_v<decltype(my_config),
-                       compat_exp::launch_policy<
-                           sycl::nd_range<1>,
-                           decltype(sycl::ext::oneapi::experimental::properties{
-                               sycl_exp::sub_group_size<32>}),
-                           empty_properties_t, false>>);
-  }
-
-  // range and kernel_properties
-  {
-    compat_exp::launch_policy my_config(
-        sycl::range<3>{1, 1, 32},
-        compat_exp::kernel_properties{sycl_exp::sub_group_size<32>});
-    static_assert(
-        std::is_same_v<decltype(my_config),
-                       compat_exp::launch_policy<
-                           sycl::range<3>,
-                           decltype(sycl::ext::oneapi::experimental::properties{
-                               sycl_exp::sub_group_size<32>}),
-                           empty_properties_t, false>>);
-  }
-
-  // nd_range and kernel_properties properties ctor
-  {
-    sycl_exp::properties my_props{sycl_exp::sub_group_size<32>};
-    compat_exp::launch_policy my_config(
-        sycl::nd_range<1>{{32}, {32}},
-        compat_exp::kernel_properties(my_props));
-    static_assert(
-        std::is_same_v<decltype(my_config),
-                       compat_exp::launch_policy<
-                           sycl::nd_range<1>,
-                           decltype(sycl::ext::oneapi::experimental::properties{
-                               sycl_exp::sub_group_size<32>}),
-                           empty_properties_t, false>>);
-  }
-  // Empty kernel properties
-  {
-    compat_exp::launch_policy my_config(sycl::nd_range<1>{{32}, {32}},
-                                        compat_exp::kernel_properties{});
-    static_assert(
-        std::is_same_v<
-            decltype(my_config),
-            compat_exp::launch_policy<sycl::nd_range<1>, empty_properties_t,
-                                      empty_properties_t, false>>);
-  }
-
-  // Empty launch properties
-  {
-    compat_exp::launch_policy my_config(sycl::nd_range<1>{{32}, {32}},
-                                        compat_exp::launch_properties{});
-    static_assert(
-        std::is_same_v<
-            decltype(my_config),
-            compat_exp::launch_policy<sycl::nd_range<1>, empty_properties_t,
-                                      empty_properties_t, false>>);
-  }
-
-  // nd_range and launch_properties properties ctor
-  {
-
-    sycl_exp::cuda::cluster_size<1> ClusterDims(sycl::range<1>{32});
-    sycl_exp::properties my_props{ClusterDims};
-
-    compat_exp::launch_policy my_config(
-        sycl::nd_range<1>{{32}, {32}},
-        compat_exp::launch_properties(my_props));
-    static_assert(
-        std::is_same_v<decltype(my_config),
-                       compat_exp::launch_policy<
-                           sycl::nd_range<1>, empty_properties_t,
-                           decltype(sycl::ext::oneapi::experimental::properties{
-                               sycl_exp::cuda::cluster_size<1>{32}}),
-                           false>>);
-  }
-
-  // Just local mem
-  {
-    compat_exp::launch_policy my_config(sycl::nd_range<1>{{32}, {32}},
-                                        compat_exp::local_mem_size{1024});
-    static_assert(
-        std::is_same_v<
-            decltype(my_config),
-            compat_exp::launch_policy<sycl::nd_range<1>, empty_properties_t,
-                                      empty_properties_t, true>>);
-  }
-
-  // Just 0 local mem
-  {
-    compat_exp::launch_policy my_config(sycl::nd_range<1>{{32}, {32}},
-                                        compat_exp::local_mem_size{0});
-    static_assert(
-        std::is_same_v<
-            decltype(my_config),
-            compat_exp::launch_policy<sycl::nd_range<1>, empty_properties_t,
-                                      empty_properties_t, true>>);
-  }
-
-  return 0;
-}
-
-int test_basic_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl_intel_exp::cache_config my_cache_config{
-      sycl_intel_exp::large_slm}; // constructed at runtime
-
-  compat_exp::kernel_properties my_k_props{
-      sycl_exp::sub_group_size<32>, sycl_exp::use_root_sync, my_cache_config};
-
-  compat_exp::launch_properties my_l_props{};
-
-  compat_exp::launch_policy my_config(sycl::nd_range<1>{{32}, {32}}, my_k_props,
-                                      my_l_props);
-
-  sycl::queue q = syclcompat::get_default_queue();
-
-  int dummy_int{1};
-
-  compat_exp::launch<empty_kernel>(my_config);
-  compat_exp::launch<int_kernel>(my_config, dummy_int);
-
-  compat_exp::launch<empty_kernel>(my_config, q);
-  compat_exp::launch<int_kernel>(my_config, q, dummy_int);
-
-  return 0;
-}
-
-int test_range_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  compat_exp::launch_policy my_config(sycl::range<1>{32});
-
-  sycl::queue q = syclcompat::get_default_queue();
-
-  int dummy_int{1};
-
-  compat_exp::launch<empty_kernel>(my_config);
-  compat_exp::launch<int_kernel>(my_config, dummy_int);
-
-  compat_exp::launch<empty_kernel>(my_config, q);
-  compat_exp::launch<int_kernel>(my_config, q, dummy_int);
-
-  return 0;
-}
-
-int test_lmem_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  using T = int;
-  // A property constructed at runtime:
-  sycl_intel_exp::cache_config my_cache_config{sycl_intel_exp::large_slm};
-
-  int local_mem_size = LOCAL_MEM_SIZE; // rt value
-
-  size_t num_elements = local_mem_size / sizeof(T);
-  T *h_a = (T *)syclcompat::malloc_host(local_mem_size);
-  T *d_a = (T *)syclcompat::malloc(local_mem_size);
-
-  compat_exp::launch_policy my_config(
-      sycl::nd_range<1>{{256}, {256}},
-      compat_exp::kernel_properties{sycl_exp::sub_group_size<32>,
-                                    sycl_exp::use_root_sync, my_cache_config},
-      compat_exp::launch_properties{},
-      compat_exp::local_mem_size(local_mem_size));
-
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(my_config).wait();
-  std::cout << "Launched 1 succesfully" << std::endl;
-
-  compat_exp::launch<dynamic_local_mem_typed_kernel<int>>(my_config, d_a)
-      .wait();
-  std::cout << "Launched 2 succesfully" << std::endl;
-
-  syclcompat::memcpy(h_a, d_a, local_mem_size);
-  syclcompat::free(d_a);
-
-  for (int i = 0; i < num_elements; i++) {
-    assert(h_a[i] == static_cast<T>(num_elements - i - 1));
-  }
-
-  syclcompat::free(h_a);
-  return 0;
-}
-
-int test_dim3_launch_policy() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32});
-
-  static_assert(
-      std::is_same_v<decltype(my_dim3_config)::RangeT, sycl::range<3>>);
-
-  compat_exp::launch_policy my_dim3_dim3_config(syclcompat::dim3{32},
-                                                syclcompat::dim3{32});
-
-  static_assert(
-      std::is_same_v<decltype(my_dim3_dim3_config)::RangeT, sycl::nd_range<3>>);
-
-  compat_exp::launch_policy my_nd_range_config(syclcompat::dim3{32},
-                                               syclcompat::dim3{32});
-
-  compat_exp::launch<empty_kernel>(my_dim3_config).wait();
-  std::cout << "Launched 1 succesfully" << std::endl;
-  compat_exp::launch<empty_kernel>(my_dim3_dim3_config).wait();
-  std::cout << "Launched 2 succesfully" << std::endl;
-
-  return 0;
-}
-
-int test_dim3_lmem_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  compat_exp::launch_policy my_dim3_dim3_config(syclcompat::dim3{32},
-                                                syclcompat::dim3{32},
-                                                compat_exp::local_mem_size{0});
-
-  static_assert(
-      std::is_same_v<decltype(my_dim3_dim3_config)::RangeT, sycl::nd_range<3>>);
-
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(my_dim3_dim3_config)
-      .wait();
-  std::cout << "Launched 1 succesfully" << std::endl;
-
-  return 0;
-}
-
-int test_dim3_props_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32},
-                                           compat_exp::kernel_properties{});
-
-  static_assert(
-      std::is_same_v<decltype(my_dim3_config)::RangeT, sycl::range<3>>);
-
-  compat_exp::launch<int_kernel>(my_dim3_config, 9001);
-  return 0;
-}
-
-template <typename T> int test_write_mem() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  LaunchTestWithArgs<T> ltt;
-  if (ltt.skip_) // Unsupported aspect
-    return 0;
-
-  compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32});
-
-  const int memsize = 1024;
-  T *d_a = (T *)syclcompat::malloc(memsize);
-  compat_exp::launch<write_mem_kernel<T>>(my_dim3_config, d_a,
-                                              memsize / sizeof(T))
-      .wait();
-
-  syclcompat::free(d_a);
-  return 0;
-}
-
-int main() {
-  test_variadic_config_ctor();
-  test_basic_launch();
-  test_range_launch();
-  test_lmem_launch();
-  test_dim3_launch_policy();
-  test_dim3_lmem_launch();
-  test_dim3_props_launch();
-  INSTANTIATE_ALL_TYPES(value_type_list, test_write_mem);
-}
diff --git a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp b/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
deleted file mode 100644
index 78501a8d1a348..0000000000000
--- a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
+++ /dev/null
@@ -1,290 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  launch_policy_lmem.cpp
- *
- *  Description:
- *     launch<F> with policy & use local memory tests
- **************************************************************************/
-
-// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
-// RUN: %{run} %t.out
-
-// UNSUPPORTED: linux && opencl && (gpu-intel-gen12 || gpu-intel-dg2 || arch-intel_gpu_pvc)
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15275
-
-// Flaky pass/fail behaviour.
-// UNSUPPORTED: spirv-backend
-// UNSUPPORTED-TRACKER: CMPLRLLVM-64705
-
-// XFAIL: target-native_cpu
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20142
-
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/properties/properties.hpp>
-#include <sycl/group_barrier.hpp>
-
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-#include <syclcompat/id_query.hpp>
-
-#include "../common.hpp"
-#include "launch_fixt.hpp"
-
-namespace compat_exp = syclcompat::experimental;
-
-using compat_exp::launch_policy;
-using compat_exp::local_mem_size;
-
-// Kernel functions for testing
-// =======================================================================
-inline void dynamic_local_mem_empty_kernel(char *a){};
-
-template <typename T>
-inline void dynamic_local_mem_basicdt_kernel(T value, char *local_mem){};
-
-template <typename T>
-void dynamic_local_mem_typed_kernel(T *data, char *local_mem) {
-  constexpr size_t memsize = LaunchTestWithArgs<T>::LOCAL_MEM_SIZE;
-  constexpr size_t num_elements = memsize / sizeof(T);
-  T *typed_local_mem = reinterpret_cast<T *>(local_mem);
-
-  const int local_id =
-      sycl::ext::oneapi::this_work_item::get_nd_item<3>().get_local_linear_id();
-  const int group_id =
-      sycl::ext::oneapi::this_work_item::get_nd_item<3>().get_group_linear_id();
-  // Only operate in first work-group
-  if (group_id == 0) {
-    if (local_id < num_elements) {
-      typed_local_mem[local_id] = static_cast<T>(local_id);
-    }
-    syclcompat::wg_barrier();
-    if (local_id < num_elements) {
-      data[local_id] = typed_local_mem[num_elements - local_id - 1];
-    }
-  }
-};
-
-// =======================================================================
-
-void test_dynamic_mem_no_arg_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  LaunchTest lt;
-
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(
-      launch_policy{lt.range_1_, local_mem_size{1}});
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(
-      launch_policy{lt.range_2_, local_mem_size{1}});
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(
-      launch_policy{lt.range_3_, local_mem_size{1}});
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(
-      launch_policy{lt.grid_, lt.thread_, local_mem_size{1}});
-}
-
-void test_dynamic_mem_no_arg_launch_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  LaunchTest lt;
-
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(
-      launch_policy{lt.range_1_, local_mem_size{1}}, lt.q_);
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(
-      launch_policy{lt.range_2_, local_mem_size{1}}, lt.q_);
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(
-      launch_policy{lt.range_3_, local_mem_size{1}}, lt.q_);
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(
-      launch_policy{lt.grid_, lt.thread_, local_mem_size{1}}, lt.q_);
-}
-
-template <typename T> void test_basic_dt_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  T d_a = T(1);
-  LaunchTestWithArgs<T> ltt;
-
-  if (ltt.skip_) // Unsupported aspect
-    return;
-
-  compat_exp::launch<dynamic_local_mem_basicdt_kernel<T>>(
-      launch_policy{ltt.range_1_, local_mem_size{ltt.memsize_}}, d_a);
-  compat_exp::launch<dynamic_local_mem_basicdt_kernel<T>>(
-      launch_policy{ltt.range_2_, local_mem_size{ltt.memsize_}}, d_a);
-  compat_exp::launch<dynamic_local_mem_basicdt_kernel<T>>(
-      launch_policy{ltt.range_3_, local_mem_size{ltt.memsize_}}, d_a);
-  compat_exp::launch<dynamic_local_mem_basicdt_kernel<T>>(
-      launch_policy{ltt.grid_, ltt.thread_, local_mem_size{ltt.memsize_}}, d_a);
-}
-
-template <typename T> void test_basic_dt_launch_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  T d_a = T(1);
-  LaunchTestWithArgs<T> ltt;
-
-  if (ltt.skip_) // Unsupported aspect
-    return;
-
-  compat_exp::launch<dynamic_local_mem_basicdt_kernel<T>>(
-      launch_policy{ltt.range_1_, local_mem_size{ltt.memsize_}},
-      ltt.in_order_q_, d_a);
-  compat_exp::launch<dynamic_local_mem_basicdt_kernel<T>>(
-      launch_policy{ltt.range_2_, local_mem_size{ltt.memsize_}},
-      ltt.in_order_q_, d_a);
-  compat_exp::launch<dynamic_local_mem_basicdt_kernel<T>>(
-      launch_policy{ltt.range_3_, local_mem_size{ltt.memsize_}},
-      ltt.in_order_q_, d_a);
-  compat_exp::launch<dynamic_local_mem_basicdt_kernel<T>>(
-      launch_policy{ltt.grid_, ltt.thread_, local_mem_size{ltt.memsize_}},
-      ltt.in_order_q_, d_a);
-}
-
-template <typename T> void test_arg_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  LaunchTestWithArgs<T> ltt;
-  if (ltt.skip_) // Unsupported aspect
-    return;
-
-  T *d_a = (T *)syclcompat::malloc(ltt.memsize_);
-
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.range_1_, local_mem_size{ltt.memsize_}}, d_a);
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.range_2_, local_mem_size{ltt.memsize_}}, d_a);
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.range_3_, local_mem_size{ltt.memsize_}}, d_a);
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.grid_, ltt.thread_, local_mem_size{ltt.memsize_}}, d_a);
-
-  syclcompat::wait();
-  syclcompat::free(d_a);
-}
-
-template <typename T> void test_arg_launch_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  LaunchTestWithArgs<T> ltt;
-  if (ltt.skip_) // Unsupported aspect
-    return;
-
-  T *d_a = (T *)syclcompat::malloc(ltt.memsize_, ltt.in_order_q_);
-
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.range_1_, local_mem_size{ltt.memsize_}},
-      ltt.in_order_q_, d_a);
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.range_2_, local_mem_size{ltt.memsize_}},
-      ltt.in_order_q_, d_a);
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.range_3_, local_mem_size{ltt.memsize_}},
-      ltt.in_order_q_, d_a);
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.grid_, ltt.thread_, local_mem_size{ltt.memsize_}},
-      ltt.in_order_q_, d_a);
-
-  syclcompat::wait(ltt.in_order_q_);
-  syclcompat::free(d_a, ltt.in_order_q_);
-}
-
-template <typename T> void test_local_mem_usage() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  LaunchTestWithArgs<T> ltt;
-  if (ltt.skip_) // Unsupported aspect
-    return;
-
-  size_t num_elements = ltt.memsize_ / sizeof(T);
-
-  T *h_a = (T *)syclcompat::malloc_host(ltt.memsize_);
-  T *d_a = (T *)syclcompat::malloc(ltt.memsize_);
-
-  // d_a is the kernel output, no memcpy needed
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.grid_, ltt.thread_, local_mem_size{ltt.memsize_}}, d_a);
-
-  syclcompat::memcpy(h_a, d_a, ltt.memsize_);
-  syclcompat::free(d_a);
-
-  for (int i = 0; i < num_elements; i++) {
-    assert(h_a[i] == static_cast<T>(num_elements - i - 1));
-  }
-  syclcompat::free(h_a);
-}
-
-template <typename T> void test_local_mem_usage_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  LaunchTestWithArgs<T> ltt;
-  if (ltt.skip_) // Unsupported aspect
-    return;
-
-  size_t num_elements = ltt.memsize_ / sizeof(T);
-  auto &q = ltt.in_order_q_;
-
-  T *h_a = (T *)syclcompat::malloc_host(ltt.memsize_);
-  T *d_a = (T *)syclcompat::malloc(ltt.memsize_, q);
-
-  // d_a is the kernel output, no memcpy needed
-  compat_exp::launch<dynamic_local_mem_typed_kernel<T>>(
-      launch_policy{ltt.grid_, ltt.thread_, local_mem_size{ltt.memsize_}}, q,
-      d_a);
-
-  syclcompat::memcpy(h_a, d_a, ltt.memsize_, q);
-  syclcompat::free(d_a, q);
-
-  for (size_t i = 0; i < num_elements; i++) {
-    assert(h_a[i] == static_cast<T>(num_elements - i - 1));
-  }
-
-  syclcompat::free(h_a);
-}
-
-template <typename T> void test_memsize_no_arg_launch() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  LaunchTest lt;
-  T memsize = static_cast<T>(8);
-
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(launch_policy{lt.grid_, lt.thread_,
-                                                     local_mem_size(memsize)});
-}
-
-template <typename T> void test_memsize_no_arg_launch_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  LaunchTest lt;
-  T memsize = static_cast<T>(8);
-
-  compat_exp::launch<dynamic_local_mem_empty_kernel>(launch_policy{lt.grid_, lt.thread_,
-                                                     local_mem_size(memsize)}, lt.q_);
-}
-
-int main() {
-
-  test_dynamic_mem_no_arg_launch();
-  test_dynamic_mem_no_arg_launch_q();
-
-  INSTANTIATE_ALL_TYPES(value_type_list, test_basic_dt_launch);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_basic_dt_launch_q);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_arg_launch);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_arg_launch_q);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_local_mem_usage);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_local_mem_usage_q);
-
-  INSTANTIATE_ALL_TYPES(memsize_type_list, test_memsize_no_arg_launch);
-  INSTANTIATE_ALL_TYPES(memsize_type_list, test_memsize_no_arg_launch_q);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/launch/launch_properties.cpp b/sycl/test-e2e/syclcompat/launch/launch_properties.cpp
deleted file mode 100644
index 19176388b33e8..0000000000000
--- a/sycl/test-e2e/syclcompat/launch/launch_properties.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  launch_properties.cpp
- *
- *  Description:
- *     launch<F> with launch properties tests - test cluster_dims passed
- *     correctly. Adapted from
- *     sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp
- **************************************************************************/
-
-// REQUIRES: target-nvidia, aspect-ext_oneapi_cuda_cluster_group
-// XFAIL: *
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16794
-// RUN: %{build} -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_90 -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/experimental/cluster_group_prop.hpp>
-#include <sycl/ext/oneapi/properties/properties.hpp>
-
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-namespace compat_exp = syclcompat::experimental;
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-
-template <int Dim>
-void cluster_launch_kernel(sycl::range<Dim> cluster_range,
-                           int *correct_result_flag) {
-  uint32_t cluster_dim_x, cluster_dim_y, cluster_dim_z;
-// Temporary solution till cluster group class is implemented
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) &&            \
-    (__SYCL_CUDA_ARCH__ >= 900)
-  asm volatile("\n\t"
-               "mov.u32 %0, %%cluster_nctaid.x; \n\t"
-               "mov.u32 %1, %%cluster_nctaid.y; \n\t"
-               "mov.u32 %2, %%cluster_nctaid.z; \n\t"
-               : "=r"(cluster_dim_z), "=r"(cluster_dim_y), "=r"(cluster_dim_x));
-#endif
-  if constexpr (Dim == 1) {
-    if (cluster_dim_z == cluster_range[0] && cluster_dim_y == 1 &&
-        cluster_dim_x == 1) {
-      *correct_result_flag = 1;
-    }
-  } else if constexpr (Dim == 2) {
-    if (cluster_dim_z == cluster_range[1] && cluster_dim_y == cluster_range[0] &&
-        cluster_dim_x == 1) {
-      *correct_result_flag = 1;
-    }
-  } else {
-    if (cluster_dim_z == cluster_range[2] && cluster_dim_y == cluster_range[1] &&
-        cluster_dim_x == cluster_range[0]) {
-      *correct_result_flag = 1;
-    }
-  }
-};
-
-template <int Dim>
-int test_cluster_launch_parallel_for(sycl::range<Dim> global_range,
-                                     sycl::range<Dim> local_range,
-                                     sycl::range<Dim> cluster_range) {
-
-  sycl_exp::cuda::cluster_size cluster_dims(cluster_range);
-
-  int *correct_result_flag = syclcompat::malloc<int>(1);
-  syclcompat::memset(correct_result_flag, 0, sizeof(int));
-
-  compat_exp::launch_policy policy{global_range, local_range,
-                                   compat_exp::launch_properties{cluster_dims}};
-  compat_exp::launch<cluster_launch_kernel<Dim>>(policy, cluster_range,
-                                                 correct_result_flag);
-
-  int correct_result_flag_host = 0;
-  syclcompat::memcpy<int>(&correct_result_flag_host, correct_result_flag, 1);
-  return correct_result_flag_host;
-}
-
-int main() {
-
-  sycl::queue Queue;
-
-  int host_correct_flag =
-      test_cluster_launch_parallel_for(sycl::range{128, 128, 128},
-                                       sycl::range{16, 16, 2},
-                                       sycl::range{2, 4, 1}) &&
-      test_cluster_launch_parallel_for(
-          sycl::range{512, 1024}, sycl::range{32, 32}, sycl::range{4, 2}) &&
-      test_cluster_launch_parallel_for(sycl::range{128}, sycl::range{32},
-                                       sycl::range{2}) &&
-      test_cluster_launch_parallel_for(sycl::range{16384}, sycl::range{32},
-                                       sycl::range{16});
-
-  return !host_correct_flag;
-}
diff --git a/sycl/test-e2e/syclcompat/lit.local.cfg b/sycl/test-e2e/syclcompat/lit.local.cfg
deleted file mode 100644
index c881febb2feb4..0000000000000
--- a/sycl/test-e2e/syclcompat/lit.local.cfg
+++ /dev/null
@@ -1,7 +0,0 @@
-# TODO: Remove this once the warnings are resolved
-original_clangxx=""
-for substitution in config.substitutions:
-  if substitution[0] == "%clangxx":
-    original_clangxx=substitution[1]
-config.substitutions.insert(0,
-  ("%clangxx", original_clangxx + ' -Wno-error=#warnings -Wno-error=deprecated-declarations'))
diff --git a/sycl/test-e2e/syclcompat/math/math_bfe.cpp b/sycl/test-e2e/syclcompat/math/math_bfe.cpp
deleted file mode 100644
index 5d280cf870d2e..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_bfe.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_bfe.cpp
- *
- *  Description:
- *    math bitfield extract tests
- **************************************************************************/
-
-// ===----------- math_bfe.cpp ------------------ -*- C++ -* --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <bitset>
-#include <chrono>
-#include <iostream>
-#include <limits.h>
-#include <random>
-#include <stdint.h>
-#include <sycl/detail/core.hpp>
-#include <syclcompat/math.hpp>
-#include <type_traits>
-#include <vector>
-
-template <typename T>
-inline std::enable_if_t<std::is_integral_v<T>, T>
-bfe_slow(const T source, const uint32_t bit_start, const uint32_t num_bits) {
-  const uint32_t msb =
-      std::numeric_limits<unsigned char>::digits * sizeof(T) - 1;
-  const uint32_t pos = bit_start;
-  const uint32_t len = num_bits;
-
-  // If the requested bit field length is zero, the result is zero.
-  if (num_bits == 0)
-    return 0ULL;
-
-  T sbit;
-  std::bitset<std::numeric_limits<unsigned char>::digits * sizeof(T)>
-      source_bitset(source);
-  if (std::is_unsigned_v<T> || len == 0)
-    sbit = 0;
-  else
-    sbit = source_bitset[std::min(pos + len - 1, msb)];
-
-  // If the start position is beyond the msb of the input, the destination d is
-  // filled with the replicated sign bit of the extracted field.
-  // -1 is 1111...
-  if (bit_start > msb)
-    return -sbit;
-
-  std::bitset<std::numeric_limits<unsigned char>::digits * sizeof(T)>
-      result_bitset;
-  for (uint8_t i = 0; i <= msb; ++i)
-    result_bitset[i] =
-        (i < len && pos + i <= msb) ? source_bitset[pos + i] : sbit;
-  return result_bitset.to_ullong();
-}
-
-template <typename T> bool test(const char *Msg, int N) {
-  uint32_t bit_width = std::numeric_limits<unsigned char>::digits * sizeof(T);
-  T min_value = std::numeric_limits<T>::lowest();
-  T max_value = std::numeric_limits<T>::max();
-  std::random_device rd;
-  std::mt19937::result_type seed =
-      rd() ^
-      ((std::mt19937::result_type)
-           std::chrono::duration_cast<std::chrono::seconds>(
-               std::chrono::system_clock::now().time_since_epoch())
-               .count() +
-       (std::mt19937::result_type)
-           std::chrono::duration_cast<std::chrono::microseconds>(
-               std::chrono::high_resolution_clock::now().time_since_epoch())
-               .count());
-
-  std::mt19937 gen(seed);
-  // Support for char type with uniform_int_distribution isn't universal
-  using RandomDataT = std::conditional_t<sizeof(T) == 1, int, T>;
-  std::uniform_int_distribution<RandomDataT> rd_source(min_value, max_value);
-
-  // Define a small overshoot so that we adequately test out-of-range cases
-  // without sacrificing depth of testing of valid start+length combinations
-  constexpr uint32_t overshoot = 2;
-  std::uniform_int_distribution<uint32_t> rd_start(0, bit_width + overshoot);
-  std::uniform_int_distribution<uint32_t> rd_length(0, bit_width + overshoot);
-
-  std::vector<T> sources(N, 0);
-  std::vector<T> compat_results(N, 0);
-  std::vector<T> slow_results(N, 0);
-  std::vector<uint32_t> starts(N, 0);
-  std::vector<uint32_t> lengths(N, 0);
-  for (int i = 0; i < N; ++i) {
-    sources[i] = static_cast<T>(rd_source(gen));
-    starts[i] = rd_start(gen);
-    lengths[i] = rd_length(gen);
-  }
-
-  sycl::buffer<T, 1> source_buffer(sources.data(), N);
-  sycl::buffer<T, 1> compat_results_buffer(compat_results.data(), N);
-  sycl::buffer<T, 1> slow_results_buffer(slow_results.data(), N);
-  sycl::buffer<uint32_t, 1> starts_buffer(starts.data(), N);
-  sycl::buffer<uint32_t, 1> lengths_buffer(lengths.data(), N);
-
-  sycl::queue que;
-  que.submit([&](sycl::handler &handler) {
-    sycl::accessor source_accessor(source_buffer, handler, sycl::read_only);
-    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
-    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
-    sycl::accessor compat_result_accessor(compat_results_buffer, handler,
-                                          sycl::write_only);
-    handler.parallel_for(N, [=](sycl::id<1> i) {
-      compat_result_accessor[i] = syclcompat::bfe_safe<T>(
-          source_accessor[i], start_accessor[i], length_accessor[i]);
-    });
-  });
-
-  que.submit([&](sycl::handler &handler) {
-    sycl::accessor source_accessor(source_buffer, handler, sycl::read_only);
-    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
-    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
-    sycl::accessor slow_result_accessor(slow_results_buffer, handler,
-                                        sycl::write_only);
-    handler.parallel_for(N, [=](sycl::id<1> i) {
-      slow_result_accessor[i] = bfe_slow<T>(
-          source_accessor[i], start_accessor[i], length_accessor[i]);
-    });
-  });
-
-  que.wait_and_throw();
-  sycl::host_accessor source_accessor(source_buffer, sycl::read_only);
-  sycl::host_accessor start_accessor(starts_buffer, sycl::read_only);
-  sycl::host_accessor length_accessor(lengths_buffer, sycl::read_only);
-  sycl::host_accessor compat_result_accessor(compat_results_buffer,
-                                             sycl::read_only);
-  sycl::host_accessor slow_result_accessor(slow_results_buffer,
-                                           sycl::read_only);
-
-  int failed = 0;
-  for (int i = 0; i < N; ++i) {
-    if (compat_result_accessor[i] != slow_result_accessor[i]) {
-      failed++;
-      std::cout << "[source = " << source_accessor[i]
-                << ", bit_start = " << start_accessor[i]
-                << ", num_bits = " << length_accessor[i] << "] failed, expect "
-                << slow_result_accessor[i] << " but got "
-                << compat_result_accessor[i] << std::endl;
-    }
-  }
-  std::cout << "===============" << std::endl;
-  std::cout << "Test: " << Msg << std::endl;
-  std::cout << "Total: " << N << std::endl;
-  std::cout << "Success: " << N - failed << std::endl;
-  std::cout << "Failed: " << failed << std::endl;
-  std::cout << "===============" << std::endl;
-  return !failed;
-}
-
-int main() {
-  const int N = 1000;
-  assert(test<int8_t>("int8", N));
-  assert(test<uint8_t>("uint8", N));
-  assert(test<int16_t>("int16", N));
-  assert(test<uint16_t>("uint16", N));
-  assert(test<int32_t>("int32", N));
-  assert(test<uint32_t>("uint32", N));
-  assert(test<int64_t>("int64", N));
-  assert(test<uint64_t>("uint64", N));
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_bfi.cpp b/sycl/test-e2e/syclcompat/math/math_bfi.cpp
deleted file mode 100644
index 717b63a09b4cf..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_bfi.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_bfi.cpp
- *
- *  Description:
- *    math bitfield insert tests
- **************************************************************************/
-
-// ===----------- math_bfi.cpp ------------------ -*- C++ -* --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <bitset>
-#include <chrono>
-#include <iostream>
-#include <limits.h>
-#include <random>
-#include <stdint.h>
-#include <sycl/detail/core.hpp>
-#include <syclcompat/math.hpp>
-#include <type_traits>
-#include <vector>
-
-template <typename T>
-inline std::enable_if_t<std::is_unsigned_v<T>, T>
-bfi_slow(const T x, const T y, const uint32_t bit_start,
-         const uint32_t num_bits) {
-  const uint32_t msb = CHAR_BIT * sizeof(T) - 1;
-  const uint32_t pos = bit_start & 0xff;
-  const uint32_t len = num_bits & 0xff;
-  std::bitset<CHAR_BIT * sizeof(T)> source_bitset(x), result_bitset(y);
-  for (int i = 0; i < len && pos + i <= msb; i++) {
-    result_bitset[pos + i] = source_bitset[i];
-  }
-  return result_bitset.to_ullong();
-}
-
-template <typename T> bool test(const char *Msg, int N) {
-  uint32_t bit_width = CHAR_BIT * sizeof(T);
-  T min_value = std::numeric_limits<T>::min();
-  T max_value = std::numeric_limits<T>::max();
-  std::random_device rd;
-  std::mt19937::result_type seed =
-      rd() ^
-      ((std::mt19937::result_type)
-           std::chrono::duration_cast<std::chrono::seconds>(
-               std::chrono::system_clock::now().time_since_epoch())
-               .count() +
-       (std::mt19937::result_type)
-           std::chrono::duration_cast<std::chrono::microseconds>(
-               std::chrono::high_resolution_clock::now().time_since_epoch())
-               .count());
-
-  std::mt19937 gen(seed);
-  std::uniform_int_distribution<T> rd_source(min_value, max_value);
-  // Define a small overshoot so that we adequately test out-of-range cases
-  // without sacrificing depth of testing of valid start+length combinations
-  constexpr uint32_t overshoot = 2;
-  std::uniform_int_distribution<uint32_t> rd_start(0, bit_width + overshoot);
-  std::uniform_int_distribution<uint32_t> rd_length(0, bit_width + overshoot);
-
-  std::vector<T> x(N, 0);
-  std::vector<T> y(N, 0);
-  std::vector<T> compat_results(N, 0);
-  std::vector<T> slow_results(N, 0);
-  std::vector<uint32_t> starts(N, 0);
-  std::vector<uint32_t> lengths(N, 0);
-  for (int i = 0; i < N; ++i) {
-    x[i] = rd_source(gen);
-    y[i] = rd_source(gen);
-    starts[i] = rd_start(gen);
-    lengths[i] = rd_length(gen);
-  }
-
-  sycl::buffer<T, 1> x_buffer(x.data(), N);
-  sycl::buffer<T, 1> y_buffer(y.data(), N);
-  sycl::buffer<T, 1> compat_results_buffer(compat_results.data(), N);
-  sycl::buffer<T, 1> slow_results_buffer(slow_results.data(), N);
-  sycl::buffer<uint32_t, 1> starts_buffer(starts.data(), N);
-  sycl::buffer<uint32_t, 1> lengths_buffer(lengths.data(), N);
-
-  sycl::queue que;
-  que.submit([&](sycl::handler &handler) {
-    sycl::accessor x_accessor(x_buffer, handler, sycl::read_only);
-    sycl::accessor y_accessor(y_buffer, handler, sycl::read_only);
-    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
-    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
-    sycl::accessor compat_result_accessor(compat_results_buffer, handler,
-                                          sycl::write_only);
-    handler.parallel_for(N, [=](sycl::id<1> i) {
-      compat_result_accessor[i] = syclcompat::bfi_safe<T>(
-          x_accessor[i], y_accessor[i], start_accessor[i], length_accessor[i]);
-    });
-  });
-
-  que.submit([&](sycl::handler &handler) {
-    sycl::accessor x_accessor(x_buffer, handler, sycl::read_only);
-    sycl::accessor y_accessor(y_buffer, handler, sycl::read_only);
-    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
-    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
-    sycl::accessor slow_result_accessor(slow_results_buffer, handler,
-                                        sycl::write_only);
-    handler.parallel_for(N, [=](sycl::id<1> i) {
-      slow_result_accessor[i] = bfi_slow<T>(
-          x_accessor[i], y_accessor[i], start_accessor[i], length_accessor[i]);
-    });
-  });
-
-  que.wait_and_throw();
-  sycl::host_accessor x_accessor(x_buffer, sycl::read_only);
-  sycl::host_accessor y_accessor(y_buffer, sycl::read_only);
-  sycl::host_accessor start_accessor(starts_buffer, sycl::read_only);
-  sycl::host_accessor length_accessor(lengths_buffer, sycl::read_only);
-  sycl::host_accessor compat_result_accessor(compat_results_buffer,
-                                             sycl::read_only);
-  sycl::host_accessor slow_result_accessor(slow_results_buffer,
-                                           sycl::read_only);
-
-  int failed = 0;
-  for (int i = 0; i < N; ++i) {
-    if (compat_result_accessor[i] != slow_result_accessor[i]) {
-      failed++;
-      std::cout << "[x = " << x_accessor[i] << ", y = " << y_accessor[i]
-                << ", bit_start = " << start_accessor[i]
-                << ", num_bits = " << length_accessor[i] << "] failed, expect "
-                << slow_result_accessor[i] << " but got "
-                << compat_result_accessor[i] << std::endl;
-    }
-  }
-  std::cout << "===============" << std::endl;
-  std::cout << "Test: " << Msg << std::endl;
-  std::cout << "Total: " << N << std::endl;
-  std::cout << "Success: " << N - failed << std::endl;
-  std::cout << "Failed: " << failed << std::endl;
-  std::cout << "===============" << std::endl;
-  return !failed;
-}
-
-int main() {
-  const int N = 1000;
-  assert(test<uint16_t>("uint16", N));
-  assert(test<uint32_t>("uint32", N));
-  assert(test<uint64_t>("uint64", N));
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp b/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp
deleted file mode 100644
index a53408157ecf6..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_byte_dot_product.cpp
- *
- *  Description:
- *    Dp4a and Dp2a tests
- **************************************************************************/
-
-// ===----------- math_dp2a_dp4a.cpp ------------ -*- C++ -* --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-// RUN: %{build} %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_61 %} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-template <typename T, size_t N> constexpr size_t array_size(T (&)[N]) {
-  return N;
-}
-
-// TODO(syclcompat-lib-reviewers): Improve the tests to ensure that the
-// intrinsics are actually used and the implementation is not defaulting to the
-// library implementation in CUDA devices.
-
-template <typename T1, typename T2> struct TestCaseStorage {
-  T1 a;
-  T2 b;
-  syclcompat::dot_product_acc_t<T1, T2> c;
-  syclcompat::dot_product_acc_t<T1, T2> d;
-};
-
-enum TestType { dp2a_lo, dp2a_hi, dp4a };
-
-template <TestType, typename T1, typename T2> struct TestCase;
-
-template <> struct TestCase<dp2a_lo, int32_t, int32_t> {
-  static constexpr TestCaseStorage<int32_t, int32_t> data[] = {
-      {930681129, 370772529, 2010968336, 2009507875},
-      {-182801821, 2018321974, -1344607006, -1345896544},
-      {-1405866995, -56456331, 2028627921, 2032457214},
-      {-2067420235, 667032387, -1549633870, -1551931432},
-      {150264517, 1499579728, 1168148523, 1167250815},
-      {-1488693248, 590983308, -1132841811, -1133637779},
-      {1952352829, 1541328881, -867130079, -868137584},
-      {1402917188, -396551268, 682657336, 684431698},
-      {1060076168, 2095822351, 266994190, 266267760},
-      {-597525506, 329411575, -760256038, -761517342},
-  };
-};
-
-template <> struct TestCase<dp2a_lo, int32_t, uint32_t> {
-  static constexpr TestCaseStorage<int32_t, uint32_t> data[] = {
-      {-1784870143, 3550903701, 929114859, 926130217},
-      {-906522442, 2115573780, -1285980330, -1286882122},
-      {1391650851, 4107608479, 273580150, 273309541},
-      {-1501013502, 3932674350, -905231285, -909141521},
-      {-304683280, 2104603303, -790552087, -792451259},
-      {-1341822015, 615507964, -1323598253, -1321376558},
-      {351927836, 264881689, -495668280, -494617318},
-      {-151229742, 3617293176, 628248961, 631228133},
-      {302881625, 4164956791, -1904446304, -1907281527},
-      {2037447091, 4048192261, -200189002, -196124539},
-  };
-};
-
-template <> struct TestCase<dp2a_lo, uint32_t, int32_t> {
-  static constexpr TestCaseStorage<uint32_t, int32_t> data[] = {
-      {3526794897, 1440743042, 370074542, 364852196},
-      {262513653, 298144108, 1265851732, 1270709221},
-      {1130955292, -963349034, -2078791855, -2076795466},
-      {2514054142, -1350622828, 257209474, 255489619},
-      {2734618833, -2039216829, 1170234974, 1174711303},
-      {2679502652, -552107997, 1516795981, 1513777921},
-      {2178722429, 1706794257, -1207356382, -1209905573},
-      {2938336684, 1853682464, 1478700448, 1479081561},
-      {4131007422, 88852262, 949301283, 946133869},
-      {1426380125, 1310424908, 2110346787, 2117262011},
-  };
-};
-
-template <> struct TestCase<dp2a_lo, uint32_t, uint32_t> {
-  static constexpr TestCaseStorage<uint32_t, uint32_t> data[] = {
-      {261879580, 462533001, 1244651601, 1254025336},
-      {3613440709, 39532914, 3612331201, 3620924635},
-      {2613678921, 3074075559, 2197617435, 2210733821},
-      {3858700825, 2932114399, 651043516, 660246528},
-      {3641490311, 1203902590, 1264123439, 1271505857},
-      {620567, 198432492, 1750593890, 1757851164},
-      {1924357490, 2672674441, 363874491, 372965679},
-      {575741870, 365675828, 4077327301, 4079479666},
-      {779333090, 1461441270, 3936527378, 3949974932},
-      {3047663397, 3117692984, 3095767416, 3100767768},
-  };
-};
-
-template <> struct TestCase<dp2a_hi, int32_t, int32_t> {
-  static constexpr TestCaseStorage<int32_t, int32_t> data[] = {
-      {2033148131, 1987852344, 1836738289, 1843474575},
-      {1854766635, -847369228, 570647947, 573274270},
-      {1221789280, -1504599082, 2039564501, 2038018823},
-      {1815893957, 522593320, -1194398972, -1192686202},
-      {-942058619, -1694947839, -1791401709, -1790085126},
-      {1261876252, -722935661, -401441440, -401822344},
-      {-1276948036, -2045446196, 883626458, 886422108},
-      {-1043904041, 1660095151, 924853314, 923046533},
-      {1873342481, -183952166, 1422494064, 1422142929},
-      {1548579097, 388816020, 1306723060, 1308540459},
-  };
-};
-template <> struct TestCase<dp2a_hi, int32_t, uint32_t> {
-  static constexpr TestCaseStorage<int32_t, uint32_t> data[] = {
-      {925779231, 2297216285, -2134129287, -2128032131},
-      {1226362493, 592978070, 1394319934, 1393859454},
-      {-820606485, 3315032306, -1946036979, -1953068392},
-      {865550467, 2594266420, 684086152, 688778945},
-      {2042373655, 2279820469, 330650825, 337071442},
-      {-803475029, 3557524416, 570180628, 567540937},
-      {-1920282536, 4207418946, -179074286, -188839786},
-      {-1611807508, 2012850000, -45410323, -52103004},
-      {-209217908, 3249694139, -1047805020, -1053226557},
-      {-938134420, 4023147013, -1637223186, -1637906791},
-  };
-};
-
-template <> struct TestCase<dp2a_hi, uint32_t, int32_t> {
-  static constexpr TestCaseStorage<uint32_t, int32_t> data[] = {
-      {1465064346, -987065627, 511196861, 510174688},
-      {423752047, -2037616892, 1367127780, 1359169438},
-      {1732089906, 1660637927, 835046327, 837441559},
-      {3240032526, -687279473, 314878829, 313361935},
-      {2028889232, 453690876, -1579929106, -1578835800},
-      {636106821, 1932111966, -1143803023, -1142096228},
-      {1744753942, 2120462197, 543738507, 552493329},
-      {1952094085, 75134480, -1870017090, -1865165688},
-      {1238028676, -368589994, 400410492, 400370364},
-      {1678354325, 1520837888, 900538674, 898982394},
-  };
-};
-
-template <> struct TestCase<dp2a_hi, uint32_t, uint32_t> {
-  static constexpr TestCaseStorage<uint32_t, uint32_t> data[] = {
-      {3407045239, 1034879260, 1566081712, 1573664144},
-      {1019854071, 319089899, 2048645673, 2049134832},
-      {3484748932, 23066577, 2279969923, 2280327476},
-      {772761490, 593919853, 110217101, 113334214},
-      {3040024654, 3302072533, 3503588845, 3513981095},
-      {247428909, 1708258743, 3414468907, 3421226563},
-      {3214691207, 2264421274, 2096321799, 2107689847},
-      {1978412244, 3523914401, 3482699206, 3489153446},
-      {845968593, 3600665955, 3398632658, 3406090055},
-      {2655885278, 642147090, 953440990, 957702400},
-  };
-};
-
-template <> struct TestCase<dp4a, int32_t, int32_t> {
-  static constexpr TestCaseStorage<int32_t, int32_t> data[] = {
-      {-1190208646, 231822748, 1361188354, 1361171428},
-      {-1897923580, -1660380472, -882257438, -882246232},
-      {-579619596, 1428550082, -686850248, -686847084},
-      {1276672648, 1193117464, 963222686, 963211136},
-      {-1511270552, 346453515, 539470060, 539466436},
-      {-1731107400, 30416897, 1116161329, 1116166641},
-      {314175584, 917356905, 1924209306, 1924227259},
-      {601261287, 461003584, -332185426, -332202489},
-      {451422378, 1069445579, 2077503598, 2077515898},
-      {1601425114, -1009494442, -12279717, -12298140},
-  };
-};
-
-template <> struct TestCase<dp4a, int32_t, uint32_t> {
-  static constexpr TestCaseStorage<int32_t, uint32_t> data[] = {
-      {851192907, 4159889898, -1560201465, -1560178121},
-      {-383662874, 94554831, -1699007777, -1699020048},
-      {319925525, 3224159406, -1636209897, -1636218115},
-      {390273202, 3538403320, 1599902512, 1599908059},
-      {-2133436013, 2204709798, -745513793, -745548526},
-      {-1365042624, 302260610, 1683641121, 1683648451},
-      {839091651, 3945553885, 18130274, 18116990},
-      {-92392216, 2135215000, -886668361, -886653647},
-      {-968453153, 2050948958, 1992996892, 1992963259},
-      {-234768205, 3930595068, -2067724845, -2067749613},
-  };
-};
-
-template <> struct TestCase<dp4a, uint32_t, int32_t> {
-  static constexpr TestCaseStorage<uint32_t, int32_t> data[] = {
-      {908604347, 1279608234, -1450969803, -1450975502},
-      {1784598592, 892171050, -824564831, -824528375},
-      {3414325281, 110856089, 1344013863, 1343984032},
-      {3589641407, 1110466407, 269001016, 269060567},
-      {3064317481, -1629226109, -733249792, -733278528},
-      {3599941523, 2112627078, 1626729914, 1626742113},
-      {1503610658, 885664480, 1900050896, 1900048832},
-      {2314829379, -2127096242, 1568300547, 1568304841},
-      {2817858008, -384307221, 307309401, 307306234},
-      {1408389703, 1080046077, -535563057, -535530708},
-  };
-};
-
-template <> struct TestCase<dp4a, uint32_t, uint32_t> {
-  static constexpr TestCaseStorage<uint32_t, uint32_t> data[] = {
-      {3065883002, 1618319527, 3160878852, 3160964499},
-      {750408200, 2617984089, 2072985277, 2073000475},
-      {1703570544, 1174656448, 1981665359, 1981717351},
-      {2526801072, 968400189, 821887370, 821972228},
-      {4033238565, 2506370972, 1177018849, 1177074623},
-      {2340922922, 2952738658, 316397016, 316469012},
-      {2559339202, 800262553, 1317311402, 1317374242},
-      {991496487, 2323953615, 2007618737, 2007639899},
-      {3918465905, 1041229499, 2826819834, 2826860086},
-      {4028147698, 2068172524, 482675182, 482797872}};
-};
-
-template <TestType Type, typename T1, typename T2> bool test() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  using Case = TestCase<Type, T1, T2>;
-  using CaseElement =
-      std::remove_cv_t<std::remove_extent_t<decltype(Case::data)>>;
-  using ResultT = syclcompat::dot_product_acc_t<T1, T2>;
-  constexpr size_t N = array_size(Case::data);
-  std::vector<ResultT> result(N);
-  std::vector<CaseElement> cases(std::begin(Case::data), std::end(Case::data));
-  sycl::buffer<CaseElement, 1> buffer(cases.data(), N);
-  sycl::buffer<ResultT, 1> result_buffer(result.data(), N);
-  sycl::queue q;
-  q.submit([&](sycl::handler &handler) {
-    sycl::accessor src(buffer, handler, sycl::read_only);
-    sycl::accessor res(result_buffer, handler, sycl::write_only);
-    handler.parallel_for(N, [=](sycl::id<1> i) {
-      if constexpr (Type == dp2a_lo)
-        res[i] = syclcompat::dp2a_lo<T1, T2>(src[i].a, src[i].b, src[i].c);
-      else if constexpr (Type == dp2a_hi)
-        res[i] = syclcompat::dp2a_hi<T1, T2>(src[i].a, src[i].b, src[i].c);
-      else
-        res[i] = syclcompat::dp4a<T1, T2>(src[i].a, src[i].b, src[i].c);
-    });
-  });
-
-  q.wait_and_throw();
-
-  int failed = 0;
-  sycl::host_accessor src(buffer, sycl::read_only);
-  sycl::host_accessor res(result_buffer, sycl::read_only);
-
-  for (int i = 0; i < N; ++i) {
-    if (src[i].d != res[i]) {
-      failed++;
-      std::cout << "  [a = " << src[i].a << ", b = " << src[i].b
-                << ", c = " << src[i].c << "] failed, expect " << src[i].d
-                << " but got " << res[i] << std::endl;
-    }
-  }
-
-  if (failed) {
-    std::cout << "  Total: " << N << std::endl;
-    std::cout << "  Success: " << N - failed << std::endl;
-    std::cout << "  Failed: " << failed << std::endl;
-  }
-
-  return !failed;
-}
-
-int main() {
-  bool passed = true;
-  passed = test<dp2a_lo, int32_t, int32_t>() && passed;
-  passed = test<dp2a_lo, int32_t, uint32_t>() && passed;
-  passed = test<dp2a_lo, uint32_t, int32_t>() && passed;
-  passed = test<dp2a_lo, uint32_t, uint32_t>() && passed;
-
-  passed = test<dp2a_hi, int32_t, int32_t>() && passed;
-  passed = test<dp2a_hi, int32_t, uint32_t>() && passed;
-  passed = test<dp2a_hi, uint32_t, int32_t>() && passed;
-  passed = test<dp2a_hi, uint32_t, uint32_t>() && passed;
-
-  passed = test<dp4a, int32_t, int32_t>() && passed;
-  passed = test<dp4a, int32_t, uint32_t>() && passed;
-  passed = test<dp4a, uint32_t, int32_t>() && passed;
-  passed = test<dp4a, uint32_t, uint32_t>() && passed;
-
-  assert(passed);
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_compare.cpp b/sycl/test-e2e/syclcompat/math/math_compare.cpp
deleted file mode 100644
index d3a93279fe308..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_compare.cpp
+++ /dev/null
@@ -1,386 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_compare.cpp
- *
- *  Description:
- *    math helpers tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ===------------------- math.cpp ---------- -*- C++ -* ------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-// REQUIRES: aspect-fp16
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: target-native_cpu
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20142
-
-#include <sycl/half_type.hpp>
-#include <syclcompat/math.hpp>
-
-#include "../common.hpp"
-#include "math_fixt.hpp"
-
-template <typename ValueT>
-void compare_equal_kernel(ValueT *a, ValueT *b, bool *r) {
-  *r = syclcompat::compare(*a, *b, std::equal_to<>());
-}
-
-template <typename ValueT>
-void compare_not_equal_kernel(ValueT *a, ValueT *b, bool *r) {
-  *r = syclcompat::compare(*a, *b, std::not_equal_to<>());
-}
-
-template <typename ValueT> void test_compare() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const ValueT op1 = static_cast<ValueT>(1.0);
-  ValueT op2 = sycl::nan(static_cast<unsigned int>(0));
-
-  //  1.0 == 1.0 -> True
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<compare_equal_kernel<ValueT>>(op1, op1, true);
-  //  NaN == 1.0 -> False
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<compare_equal_kernel<ValueT>>(op2, op1, false);
-  //  1.0 == NaN -> False
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<compare_equal_kernel<ValueT>>(op1, op2, false);
-  //  NaN == NaN -> False
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<compare_equal_kernel<ValueT>>(op2, op2, false);
-
-  //  1.0 != 1.0 -> False
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<compare_not_equal_kernel<ValueT>>(op1, op1, false);
-  //  NaN != 1.0 -> False
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<compare_not_equal_kernel<ValueT>>(op2, op1, false);
-  //  1.0 != NaN -> False
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<compare_not_equal_kernel<ValueT>>(op1, op2, false);
-  //  NaN != NaN -> False
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<compare_not_equal_kernel<ValueT>>(op2, op2, false);
-}
-
-template <typename Container>
-void compare_equal_vec_kernel(Container *a, Container *b, Container *r) {
-  *r = syclcompat::compare(*a, *b, std::equal_to<>());
-}
-
-template <typename Container>
-void compare_not_equal_vec_kernel(Container *a, Container *b, Container *r) {
-  *r = syclcompat::compare(*a, *b, std::not_equal_to<>());
-}
-
-template <template <typename T, int Dim> typename ContainerT,
-typename ValueT> void test_compare_vec() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = ContainerT<ValueT, 2>;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const Container op1 = {static_cast<ValueT>(1.0),
-                             static_cast<ValueT>(2.0)};
-  Container op2 = {static_cast<ValueT>(1.0),
-                   sycl::nan(static_cast<unsigned int>(0))};
-
-  // bool2 does not exist, 1.0 and 0.0 floats are used for true
-  // and false instead.
-  //  1.0 == 1.0, 2.0 == NaN -> {true, false}
-  const Container res1 = {1.0, 0.0};
-  BinaryOpTestLauncher<Container, Container>(grid, threads)
-      .template launch_test<compare_equal_vec_kernel<Container>>(op1, op2,
-                                                                 res1);
-  //  1.0 != 1.0, 2.0 != NaN -> {false, false}
-  const Container res2 = {0.0, 0.0};
-  BinaryOpTestLauncher<Container, Container>(grid, threads)
-      .template launch_test<compare_not_equal_vec_kernel<Container>>(op1, op2,
-                                                                     res2);
-}
-
-template <typename ValueT>
-void unordered_compare_equal_kernel(ValueT *a, ValueT *b, bool *r) {
-  *r = syclcompat::unordered_compare(*a, *b, std::equal_to<>());
-}
-
-template <typename ValueT>
-void unordered_compare_not_equal_kernel(ValueT *a, ValueT *b, bool *r) {
-  *r = syclcompat::unordered_compare(*a, *b, std::not_equal_to<>());
-}
-
-template <typename ValueT, typename ValueU = ValueT>
-void test_unordered_compare() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const ValueT op1 = static_cast<ValueT>(1.0);
-  ValueT op2 = sycl::nan(static_cast<unsigned int>(0));
-
-  // Unordered comparison checks if either operand is NaN, or the binaryop holds
-  // true
-  //  1.0 == 1.0 -> True
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<unordered_compare_equal_kernel<ValueT>>(op1, op1,
-                                                                    true);
-  //  NaN == 1.0 -> True
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<unordered_compare_equal_kernel<ValueT>>(op2, op1,
-                                                                    true);
-  //  1.0 == NaN -> True
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<unordered_compare_equal_kernel<ValueT>>(op1, op2,
-                                                                    true);
-  //  NaN == NaN -> True
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<unordered_compare_equal_kernel<ValueT>>(op2, op2,
-                                                                    true);
-  //  1.0 != 1.0 -> False
-  BinaryOpTestLauncher<ValueT, ValueT, bool>(grid, threads)
-      .template launch_test<unordered_compare_not_equal_kernel<ValueT>>(
-          op1, op1, false);
-  // No need to check again if either operand is NaN
-}
-
-template <typename Container>
-void unordered_compare_equal_vec_kernel(Container *a, Container *b,
-                                        Container *r) {
-  *r = syclcompat::unordered_compare(*a, *b, std::equal_to<>());
-}
-
-template <typename Container>
-void unordered_compare_not_equal_vec_kernel(Container *a, Container *b,
-                                            Container *r) {
-  *r = syclcompat::unordered_compare(*a, *b, std::not_equal_to<>());
-}
-
-template <template <typename T, int Dim> typename ContainerT,
-typename ValueT> void test_unordered_compare_vec() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = ContainerT<ValueT, 2>;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const Container op1 = {static_cast<ValueT>(1.0),
-                             static_cast<ValueT>(2.0)};
-  Container op2 = {static_cast<ValueT>(1.0),
-                   sycl::nan(static_cast<unsigned int>(0))};
-
-  // bool2 does not exist, 1.0 and 0.0 floats are used for true
-  // and false instead.
-  //  1.0 == 1.0, 2.0 == NaN -> {true, true}
-  const Container res1 = {1.0, 1.0};
-  BinaryOpTestLauncher<Container, Container>(grid, threads)
-      .template launch_test<unordered_compare_equal_vec_kernel<Container>>(
-          op1, op2, res1);
-  //  1.0 != 1.0, 2.0 != NaN -> {false, true}
-  const Container res2 = {0.0, 1.0};
-  BinaryOpTestLauncher<Container, Container>(grid, threads)
-      .template launch_test<unordered_compare_not_equal_vec_kernel<Container>>(
-          op1, op2, res2);
-}
-
-template <typename Container>
-void compare_both_kernel(Container *a, Container *b, bool *r) {
-  *r = syclcompat::compare_both(*a, *b, std::equal_to<>());
-}
-
-template <template <typename T, int Dim> typename ContainerT,
-typename ValueT> void test_compare_both() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = ContainerT<ValueT, 2>;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const Container op1 = {static_cast<ValueT>(1.0),
-                             static_cast<ValueT>(2.0)};
-  Container op2 = {static_cast<ValueT>(1.0),
-                   sycl::nan(static_cast<unsigned int>(0))};
-
-  //  1.0 == 1.0, 2.0 == NaN -> {true, false} -> false
-  BinaryOpTestLauncher<Container, Container, bool>(grid, threads)
-      .template launch_test<compare_both_kernel<Container>>(op1, op2, false);
-
-  //  1.0 == 1.0, 2.0 == 2.0 -> {true, true} -> true
-  BinaryOpTestLauncher<Container, Container, bool>(grid, threads)
-      .template launch_test<compare_both_kernel<Container>>(op1, op1, true);
-
-  //  1.0 == 1.0, NaN == NaN -> {true, false} -> false
-  BinaryOpTestLauncher<Container, Container, bool>(grid, threads)
-      .template launch_test<compare_both_kernel<Container>>(op2, op2, false);
-}
-
-template <typename Container>
-void unordered_compare_both_kernel(Container *a, Container *b, bool *r) {
-  *r = syclcompat::unordered_compare_both(*a, *b, std::equal_to<>());
-}
-
-template <template <typename T, int Dim> typename ContainerT,
-typename ValueT> void test_unordered_compare_both() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = ContainerT<ValueT, 2>;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const Container op1 = {static_cast<ValueT>(1.0),
-                             static_cast<ValueT>(2.0)};
-  Container op2 = {static_cast<ValueT>(1.0),
-                   sycl::nan(static_cast<unsigned int>(0))};
-
-  //  1.0 == 1.0, 2.0 == NaN -> {true, true} -> true
-  BinaryOpTestLauncher<Container, Container, bool>(grid, threads)
-      .template launch_test<unordered_compare_both_kernel<Container>>(op1, op2,
-                                                                      true);
-  //  1.0 == 1.0, 2.0 == 2.0 -> {true, true} -> true
-  BinaryOpTestLauncher<Container, Container, bool>(grid, threads)
-      .template launch_test<unordered_compare_both_kernel<Container>>(op1, op1,
-                                                                      true);
-  //  1.0 == 1.0, NaN == NaN -> {true, true} -> true
-  BinaryOpTestLauncher<Container, Container, bool>(grid, threads)
-      .template launch_test<unordered_compare_both_kernel<Container>>(op2, op2,
-                                                                      true);
-}
-
-template <typename Container>
-void compare_mask_kernel(Container *a, Container *b, unsigned *r) {
-  *r = syclcompat::compare_mask(*a, *b, std::equal_to<>());
-}
-
-template <template <typename T, int Dim> typename ContainerT,
-typename ValueT> void test_compare_mask() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = ContainerT<ValueT, 2>;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const Container op1 = {static_cast<ValueT>(1.0),
-                             static_cast<ValueT>(2.0)};
-  const Container op2 = {static_cast<ValueT>(2.0),
-                             static_cast<ValueT>(1.0)};
-  const Container op3 = {static_cast<ValueT>(1.0),
-                             static_cast<ValueT>(3.0)};
-  const Container op4 = {static_cast<ValueT>(3.0),
-                             static_cast<ValueT>(2.0)};
-  Container op5 = {sycl::nan(static_cast<unsigned int>(0)),
-                   sycl::nan(static_cast<unsigned int>(0))};
-
-  //  1.0 == 1.0, 2.0 == 2.0 -> 0xffffffff
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<compare_mask_kernel<Container>>(op1, op1,
-                                                            0xffffffff);
-
-  //  1.0 == 2.0, 2.0 == 1.0 -> 0x00000000
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<compare_mask_kernel<Container>>(op1, op2,
-                                                            0x00000000);
-
-  //  1.0 == 1.0, 2.0 == 3.0 -> 0xffff0000
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<compare_mask_kernel<Container>>(op1, op3,
-                                                            0x0000ffff);
-
-  //  1.0 == 3.0, 2.0 == 2.0 -> 0x0000ffff
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<compare_mask_kernel<Container>>(op1, op4,
-                                                            0xffff0000);
-
-  //  1.0 == NaN, 2.0 == NaN -> 0x00000000
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<compare_mask_kernel<Container>>(op1, op5,
-                                                            0x00000000);
-}
-
-template <typename Container>
-void unordered_compare_mask_kernel(Container *a, Container *b, unsigned *r) {
-  *r = syclcompat::unordered_compare_mask(*a, *b, std::equal_to<>());
-}
-
-template <template <typename T, int Dim> typename ContainerT,
-typename ValueT> void test_unordered_compare_mask() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = ContainerT<ValueT, 2>;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const Container op1 = {static_cast<ValueT>(1.0),
-                             static_cast<ValueT>(2.0)};
-  const Container op2 = {static_cast<ValueT>(2.0),
-                             static_cast<ValueT>(1.0)};
-  const Container op3 = {static_cast<ValueT>(1.0),
-                             static_cast<ValueT>(3.0)};
-  const Container op4 = {static_cast<ValueT>(3.0),
-                             static_cast<ValueT>(2.0)};
-  Container op5 = {sycl::nan(static_cast<unsigned int>(0)),
-                   sycl::nan(static_cast<unsigned int>(0))};
-
-  //  1.0 == 1.0, 2.0 == 2.0 -> 0xffffffff
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<unordered_compare_mask_kernel<Container>>(
-          op1, op1, 0xffffffff);
-
-  //  1.0 == 2.0, 2.0 == 1.0 -> 0x00000000
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<unordered_compare_mask_kernel<Container>>(
-          op1, op2, 0x00000000);
-
-  //  1.0 == 1.0, 2.0 == 3.0 -> 0xffff0000
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<unordered_compare_mask_kernel<Container>>(
-          op1, op3, 0x0000ffff);
-
-  //  1.0 == 3.0, 2.0 == 2.0 -> 0x0000ffff
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<unordered_compare_mask_kernel<Container>>(
-          op1, op4, 0xffff0000);
-
-  //  1.0 == NaN, 2.0 == NaN -> 0xffffffff
-  BinaryOpTestLauncher<Container, Container, unsigned>(grid, threads)
-      .template launch_test<unordered_compare_mask_kernel<Container>>(
-          op1, op5, 0xffffffff);
-}
-
-int main() {
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_compare);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_unordered_compare);
-  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_compare_vec);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_compare_vec);
-  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_unordered_compare_vec);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_unordered_compare_vec);
-  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_compare_both);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_compare_both);
-  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_unordered_compare_both);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_unordered_compare_both);
-  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_compare_mask);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_compare_mask);
-  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_unordered_compare_mask);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_unordered_compare_mask);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_complex.cpp b/sycl/test-e2e/syclcompat/math/math_complex.cpp
deleted file mode 100644
index 27e2bf8af8f71..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_complex.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_complex.cpp
- *
- *  Description:
- *    Complex operations tests
- **************************************************************************/
-
-// The original source was under the license below:
-//===-------------- UtilComplex.cpp --------------------*- C++ -*----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===---------------------------------------------------------------===//
-
-// REQUIRES: aspect-fp64
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <complex>
-#include <iostream>
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-#include "../common.hpp"
-
-template <typename T> bool check(T x, float *e) {
-  float precision = ERROR_TOLERANCE;
-  if ((x.x() - e[0] < precision) && (x.x() - e[0] > -precision) &&
-      (x.y() - e[1] < precision) && (x.y() - e[1] > -precision)) {
-    return true;
-  }
-  return false;
-}
-
-template <typename T> bool check(sycl::marray<T, 2> x, float *e) {
-  float precision = ERROR_TOLERANCE;
-  if ((x[0] - e[0] < precision) && (x[0] - e[0] > -precision) &&
-      (x[1] - e[1] < precision) && (x[1] - e[1] > -precision)) {
-    return true;
-  }
-  return false;
-}
-
-template <> bool check<float>(float x, float *e) {
-  float precision = ERROR_TOLERANCE;
-  if ((x - e[0] < precision) && (x - e[0] > -precision)) {
-    return true;
-  }
-  return false;
-}
-
-template <> bool check<double>(double x, float *e) {
-  float precision = ERROR_TOLERANCE;
-  if ((x - e[0] < precision) && (x - e[0] > -precision)) {
-    return true;
-  }
-  return false;
-}
-
-// Class to launch a kernel and run a lambda on output data
-template <auto F> class ComplexLauncher {
-protected:
-  int *result_;
-  int cpu_result_{0};
-  int h_result_;
-
-public:
-  ComplexLauncher() {
-    result_ = (int *)syclcompat::malloc(sizeof(int));
-    syclcompat::memset(result_, 0, sizeof(int));
-  };
-  ~ComplexLauncher() { syclcompat::free(result_); }
-  void launch() {
-    F(&cpu_result_);                      // Run on host
-    syclcompat::launch<F>(1, 1, result_); // Run on device
-    syclcompat::wait();
-    syclcompat::memcpy<int>(&h_result_, result_, 1);
-    assert(h_result_ == 1);
-    assert(cpu_result_ == 1);
-  }
-};
-
-void kernel_abs(int *result) {
-
-  sycl::float2 f1, f2;
-  sycl::double2 d1, d2;
-
-  f1 = sycl::float2(1.8, -2.7);
-  d1 = sycl::double2(5.4, -6.3);
-
-  bool r = true;
-  float expect[2] = {8.297590, 3.244996};
-
-  auto a1 = syclcompat::cabs(d1);
-  r = r && check(a1, expect);
-
-  auto a2 = syclcompat::cabs(f1);
-  r = r && check(a2, expect + 1);
-
-  *result = r;
-}
-
-void kernel_conj(int *result) {
-
-  sycl::float2 f1, f2;
-  sycl::double2 d1, d2;
-
-  f1 = sycl::float2(1.8, -2.7);
-  f2 = sycl::float2(-3.6, 4.5);
-  d1 = sycl::double2(5.4, -6.3);
-  d2 = sycl::double2(-7.2, 8.1);
-
-  bool r = true;
-  float expect[4] = {5.400000, 6.300000, 1.800000, 2.700000};
-
-  auto a1 = syclcompat::conj(d1);
-  r = r && check(a1, expect);
-
-  auto a2 = syclcompat::conj(f1);
-  r = r && check(a2, expect + 2);
-
-  *result = r;
-}
-
-void kernel_div(int *result) {
-
-  sycl::float2 f1, f2;
-  sycl::double2 d1, d2;
-
-  f1 = sycl::float2(1.8, -2.7);
-  f2 = sycl::float2(-3.6, 4.5);
-  d1 = sycl::double2(5.4, -6.3);
-  d2 = sycl::double2(-7.2, 8.1);
-
-  bool r = true;
-  float expect[4] = {-0.765517, 0.013793, -0.560976, 0.048780};
-
-  auto a1 = syclcompat::cdiv(d1, d2);
-  r = r && check(a1, expect);
-
-  auto a2 = syclcompat::cdiv(f1, f2);
-  r = r && check(a2, expect + 2);
-
-  *result = r;
-}
-
-void kernel_mul(int *result) {
-
-  sycl::float2 f1, f2;
-  sycl::double2 d1, d2;
-
-  f1 = sycl::float2(1.8, -2.7);
-  f2 = sycl::float2(-3.6, 4.5);
-  d1 = sycl::double2(5.4, -6.3);
-  d2 = sycl::double2(-7.2, 8.1);
-
-  bool r = true;
-  float expect[4] = {12.150000, 89.100000, 5.670001, 17.820000};
-
-  auto a1 = syclcompat::cmul(d1, d2);
-  r = r && check(a1, expect);
-
-  auto a2 = syclcompat::cmul(f1, f2);
-  r = r && check(a2, expect + 2);
-
-  *result = r;
-}
-
-void kernel_mul_add(int *result) {
-  sycl::double2 d1, d2, d3;
-  sycl::float2 f1, f2, f3;
-  sycl::marray<double, 2> m_d1, m_d2, m_d3;
-  sycl::marray<float, 2> m_f1, m_f2, m_f3;
-
-  d1 = sycl::double2(5.4, -6.3);
-  d2 = sycl::double2(-7.2, 8.1);
-  d3 = sycl::double2(1.0, -1.0);
-
-  f1 = sycl::float2(1.8, -2.7);
-  f2 = sycl::float2(-3.6, 4.5);
-  f3 = sycl::float2(1.0, -1.0);
-
-  bool r = true;
-  float expect[4] = {13.150000, 88.100000, 6.670001, 16.820000};
-
-  auto a1 = syclcompat::cmul_add(d1, d2, d3);
-  r = r && check(a1, expect);
-
-  auto a2 = syclcompat::cmul_add(f1, f2, f3);
-  r = r && check(a2, expect + 2);
-
-  m_d1 = sycl::marray<double, 2>(5.4, -6.3);
-  m_d2 = sycl::marray<double, 2>(-7.2, 8.1);
-  m_d3 = sycl::marray<double, 2>(1.0, -1.0);
-
-  m_f1 = sycl::marray<float, 2>(1.8, -2.7);
-  m_f2 = sycl::marray<float, 2>(-3.6, 4.5);
-  m_f3 = sycl::marray<float, 2>(1.0, -1.0);
-
-  auto a3 = syclcompat::cmul_add(m_d1, m_d2, m_d3);
-  r = r && check(a3, expect);
-
-  auto a4 = syclcompat::cmul_add(m_f1, m_f2, m_f3);
-  r = r && check(a4, expect + 2);
-
-  *result = r;
-}
-
-void test_abs() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  ComplexLauncher<kernel_abs>().launch();
-}
-void test_mul() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  ComplexLauncher<kernel_mul>().launch();
-}
-void test_div() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  ComplexLauncher<kernel_div>().launch();
-}
-void test_conj() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  ComplexLauncher<kernel_conj>().launch();
-}
-
-void test_mul_add() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  ComplexLauncher<kernel_mul_add>().launch();
-}
-
-int main() {
-  test_abs();
-  test_mul();
-  test_div();
-  test_conj();
-  test_mul_add();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_complex_datatype.cpp b/sycl/test-e2e/syclcompat/math/math_complex_datatype.cpp
deleted file mode 100644
index 9efacbcd9591f..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_complex_datatype.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_complex_datatype.cpp
- *
- *  Description:
- *    Complex operations tests
- **************************************************************************/
-
-// The original source was under the license below:
-//===-------------- UtilComplex.cpp --------------------*- C++ -*----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===---------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <iostream>
-#include <syclcompat/math.hpp>
-#include <syclcompat/util.hpp>
-
-void test_datatype() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  if constexpr (!std::is_same<syclcompat::detail::DataType<float>::T2,
-                              float>::value)
-    assert(false); // FAIL
-#ifdef SYCL_EXT_ONEAPI_COMPLEX
-  if constexpr (!std::is_same<
-                    syclcompat::detail::DataType<sycl::float2>::T2,
-                    sycl::ext::oneapi::experimental::complex<float>>::value)
-    assert(false); // FAIL
-#endif
-}
-
-int main() {
-  test_datatype();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp b/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp
deleted file mode 100644
index 6a850a887eb18..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_emu_simd_from_syclomatic.cpp
+++ /dev/null
@@ -1,3380 +0,0 @@
-//===---- math_emu_simd_from_syclomatic.cpp ---------- *- C++ -* ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// This file is modified from the code migrated by SYCLomatic.
-
-// REQUIRES: aspect-fp16
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/math.hpp>
-#include <syclcompat/memory.hpp>
-
-using namespace std;
-
-typedef pair<unsigned int, unsigned int> Uint_pair;
-
-void checkResult(const string &FuncName, const vector<unsigned int> &Inputs,
-                 const unsigned int &Expect, const unsigned int &DeviceResult) {
-  cout << FuncName << "(" << Inputs[0];
-  for (size_t i = 1; i < Inputs.size(); ++i) {
-    cout << ", " << Inputs[i];
-  }
-  cout << ") = " << DeviceResult << " (expect " << Expect << ")" << endl;
-  assert(DeviceResult == Expect);
-}
-
-void vabs2(unsigned int *const DeviceResult, unsigned int Input1) {
-  *DeviceResult =
-      syclcompat::vectorized_unary<sycl::short2>(Input1, syclcompat::abs());
-}
-
-void testVabs2Cases(const vector<pair<unsigned int, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_ct1 = TestCase.first;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vabs2(DeviceResult, TestCase_first_ct1);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vabs2", {TestCase.first}, TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vabs4(unsigned int *const DeviceResult, unsigned int Input1) {
-  *DeviceResult =
-      syclcompat::vectorized_unary<sycl::char4>(Input1, syclcompat::abs());
-}
-
-void testVabs4Cases(const vector<pair<unsigned int, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_ct1 = TestCase.first;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vabs4(DeviceResult, TestCase_first_ct1);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vabs4", {TestCase.first}, TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vabsdiffs2(unsigned int *const DeviceResult, unsigned int Input1,
-                unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, syclcompat::abs_diff());
-}
-
-void testVabsdiffs2Cases(
-    const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vabsdiffs2(DeviceResult, TestCase_first_first_ct1,
-                         TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vabsdiffs2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vabsdiffs4(unsigned int *const DeviceResult, unsigned int Input1,
-                unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, syclcompat::abs_diff());
-}
-
-void testVabsdiffs4Cases(
-    const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vabsdiffs4(DeviceResult, TestCase_first_first_ct1,
-                         TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vabsdiffs4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vabsdiffu2(unsigned int *const DeviceResult, unsigned int Input1,
-                unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, syclcompat::abs_diff());
-}
-
-void testVabsdiffu2Cases(
-    const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vabsdiffu2(DeviceResult, TestCase_first_first_ct1,
-                         TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vabsdiffu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vabsdiffu4(unsigned int *const DeviceResult, unsigned int Input1,
-                unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, syclcompat::abs_diff());
-}
-
-void testVabsdiffu4Cases(
-    const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vabsdiffu4(DeviceResult, TestCase_first_first_ct1,
-                         TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vabsdiffu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vabsss2(unsigned int *const DeviceResult, unsigned int Input1) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, 0, syclcompat::abs_diff());
-}
-
-void testVabsss2Cases(
-    const vector<pair<unsigned int, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_ct1 = TestCase.first;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vabsss2(DeviceResult, TestCase_first_ct1);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vabsss2", {TestCase.first}, TestCase.second,
-                  *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vabsss4(unsigned int *const DeviceResult, unsigned int Input1) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, 0, syclcompat::abs_diff());
-}
-
-void testVabsss4Cases(
-    const vector<pair<unsigned int, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_ct1 = TestCase.first;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vabsss4(DeviceResult, TestCase_first_ct1);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vabsss4", {TestCase.first}, TestCase.second,
-                  *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vadd2(unsigned int *const DeviceResult, unsigned int Input1,
-           unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(Input1, Input2,
-                                                               std::plus<>());
-}
-
-void testVadd2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vadd2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vadd2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vadd4(unsigned int *const DeviceResult, unsigned int Input1,
-           unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(Input1, Input2,
-                                                              std::plus<>());
-}
-
-void testVadd4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vadd4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vadd4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vaddss2(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, syclcompat::add_sat());
-}
-
-void testVaddss2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vaddss2(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vaddss2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vaddss4(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, syclcompat::add_sat());
-}
-
-void testVaddss4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vaddss4(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vaddss4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vaddus2(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, syclcompat::add_sat());
-}
-
-void testVaddus2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vaddus2(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vaddus2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vaddus4(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, syclcompat::add_sat());
-}
-
-void testVaddus4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vaddus4(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vaddus4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vavgs2(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, syclcompat::rhadd());
-}
-
-void testVavgs2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vavgs2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vavgs2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vavgs4(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, syclcompat::rhadd());
-}
-
-void testVavgs4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vavgs4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vavgs4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vavgu2(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, syclcompat::rhadd());
-}
-
-void testVavgu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vavgu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vavgu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vavgu4(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, syclcompat::rhadd());
-}
-
-void testVavgu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vavgu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vavgu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpeq2(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::equal_to<>());
-}
-
-void testVcmpeq2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpeq2(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpeq2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpeq4(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::equal_to<>());
-}
-
-void testVcmpeq4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpeq4(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpeq4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpges2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, std::greater_equal<>());
-}
-
-void testVcmpges2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpges2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpges2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpges4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, std::greater_equal<>());
-}
-
-void testVcmpges4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpges4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpges4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpgeu2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::greater_equal<>());
-}
-
-void testVcmpgeu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpgeu2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpgeu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpgeu4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::greater_equal<>());
-}
-
-void testVcmpgeu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpgeu4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpgeu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpgts2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(Input1, Input2,
-                                                              std::greater<>());
-}
-
-void testVcmpgts2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpgts2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpgts2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpgts4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(Input1, Input2,
-                                                             std::greater<>());
-}
-
-void testVcmpgts4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpgts4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpgts4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpgtu2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::greater<>());
-}
-
-void testVcmpgtu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpgtu2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpgtu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpgtu4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(Input1, Input2,
-                                                              std::greater<>());
-}
-
-void testVcmpgtu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpgtu4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpgtu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmples2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, std::less_equal<>());
-}
-
-void testVcmples2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmples2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmples2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmples4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, std::less_equal<>());
-}
-
-void testVcmples4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmples4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmples4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpleu2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::less_equal<>());
-}
-
-void testVcmpleu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpleu2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpleu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpleu4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::less_equal<>());
-}
-
-void testVcmpleu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpleu4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpleu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmplts2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(Input1, Input2,
-                                                              std::less<>());
-}
-
-void testVcmplts2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmplts2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmplts2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmplts4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult =
-      syclcompat::vectorized_binary<sycl::char4>(Input1, Input2, std::less<>());
-}
-
-void testVcmplts4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmplts4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmplts4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpltu2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(Input1, Input2,
-                                                               std::less<>());
-}
-
-void testVcmpltu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpltu2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpltu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpltu4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(Input1, Input2,
-                                                              std::less<>());
-}
-
-void testVcmpltu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpltu4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpltu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpne2(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::not_equal_to<>());
-}
-
-void testVcmpne2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpne2(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpne2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vcmpne4(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::not_equal_to<>());
-}
-
-void testVcmpne4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vcmpne4(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vcmpne4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vhaddu2(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, syclcompat::hadd());
-}
-
-void testVhaddu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vhaddu2(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vhaddu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vhaddu4(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, syclcompat::hadd());
-}
-
-void testVhaddu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vhaddu4(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vhaddu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vmaxs2(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, syclcompat::maximum());
-}
-
-void testVmaxs2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vmaxs2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vmaxs2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vmaxs4(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, syclcompat::maximum());
-}
-
-void testVmaxs4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vmaxs4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vmaxs4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vmaxu2(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, syclcompat::maximum());
-}
-
-void testVmaxu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vmaxu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vmaxu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vmaxu4(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, syclcompat::maximum());
-}
-
-void testVmaxu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vmaxu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vmaxu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vmins2(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, syclcompat::minimum());
-}
-
-void testVmins2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vmins2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vmins2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vmins4(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, syclcompat::minimum());
-}
-
-void testVmins4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vmins4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vmins4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vminu2(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, syclcompat::minimum());
-}
-
-void testVminu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vminu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vminu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vminu4(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, syclcompat::minimum());
-}
-
-void testVminu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vminu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vminu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vneg2(unsigned int *const DeviceResult, unsigned int Input1) {
-  *DeviceResult =
-      syclcompat::vectorized_unary<sycl::short2>(Input1, std::negate<>());
-}
-
-void testVneg2Cases(const vector<pair<unsigned int, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_ct1 = TestCase.first;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vneg2(DeviceResult, TestCase_first_ct1);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vneg2", {TestCase.first}, TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vneg4(unsigned int *const DeviceResult, unsigned int Input1) {
-  *DeviceResult =
-      syclcompat::vectorized_unary<sycl::char4>(Input1, std::negate<>());
-}
-
-void testVneg4Cases(const vector<pair<unsigned int, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_ct1 = TestCase.first;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vneg4(DeviceResult, TestCase_first_ct1);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vneg4", {TestCase.first}, TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vnegss2(unsigned int *const DeviceResult, unsigned int Input1) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      0, Input1, syclcompat::sub_sat());
-}
-
-void testVnegss2Cases(
-    const vector<pair<unsigned int, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_ct1 = TestCase.first;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vnegss2(DeviceResult, TestCase_first_ct1);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vnegss2", {TestCase.first}, TestCase.second,
-                  *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vnegss4(unsigned int *const DeviceResult, unsigned int Input1) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      0, Input1, syclcompat::sub_sat());
-}
-
-void testVnegss4Cases(
-    const vector<pair<unsigned int, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_ct1 = TestCase.first;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vnegss4(DeviceResult, TestCase_first_ct1);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vnegss4", {TestCase.first}, TestCase.second,
-                  *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsads2(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult =
-      syclcompat::vectorized_sum_abs_diff<sycl::short2>(Input1, Input2);
-}
-
-void testVsads2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsads2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsads2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsads4(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult =
-      syclcompat::vectorized_sum_abs_diff<sycl::char4>(Input1, Input2);
-}
-
-void testVsads4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsads4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsads4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsadu2(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult =
-      syclcompat::vectorized_sum_abs_diff<sycl::ushort2>(Input1, Input2);
-}
-
-void testVsadu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsadu2(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsadu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsadu4(unsigned int *const DeviceResult, unsigned int Input1,
-            unsigned int Input2) {
-  *DeviceResult =
-      syclcompat::vectorized_sum_abs_diff<sycl::uchar4>(Input1, Input2);
-}
-
-void testVsadu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsadu4(DeviceResult, TestCase_first_first_ct1,
-                     TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsadu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vseteq2(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::equal_to<unsigned short>());
-}
-
-void testVseteq2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vseteq2(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vseteq2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vseteq4(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::equal_to<unsigned char>());
-}
-
-void testVseteq4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vseteq4(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vseteq4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetges2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, std::greater_equal<short>());
-}
-
-void testVsetges2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetges2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetges2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetges4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, std::greater_equal<char>());
-}
-
-void testVsetges4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetges4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetges4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetgeu2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::greater_equal<unsigned short>());
-}
-
-void testVsetgeu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetgeu2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetgeu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetgeu4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::greater_equal<unsigned char>());
-}
-
-void testVsetgeu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetgeu4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetgeu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetgts2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, std::greater<short>());
-}
-
-void testVsetgts2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetgts2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetgts2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetgts4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, std::greater<char>());
-}
-
-void testVsetgts4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetgts4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetgts4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetgtu2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::greater<unsigned short>());
-}
-
-void testVsetgtu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetgtu2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetgtu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetgtu4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::greater<unsigned char>());
-}
-
-void testVsetgtu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetgtu4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetgtu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetles2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, std::less_equal<short>());
-}
-
-void testVsetles2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetles2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetles2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetles4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, std::less_equal<char>());
-}
-
-void testVsetles4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetles4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetles4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetleu2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::less_equal<unsigned short>());
-}
-
-void testVsetleu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetleu2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetleu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetleu4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::less_equal<unsigned char>());
-}
-
-void testVsetleu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetleu4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetleu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetlts2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, std::less<short>());
-}
-
-void testVsetlts2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetlts2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetlts2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetlts4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(Input1, Input2,
-                                                             std::less<char>());
-}
-
-void testVsetlts4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetlts4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetlts4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetltu2(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::less<unsigned short>());
-}
-
-void testVsetltu2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetltu2(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetltu2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetltu4(unsigned int *const DeviceResult, unsigned int Input1,
-              unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::less<unsigned char>());
-}
-
-void testVsetltu4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetltu4(DeviceResult, TestCase_first_first_ct1,
-                       TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetltu4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetne2(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, std::not_equal_to<unsigned short>());
-}
-
-void testVsetne2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetne2(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetne2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsetne4(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, std::not_equal_to<unsigned char>());
-}
-
-void testVsetne4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsetne4(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsetne4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsub2(unsigned int *const DeviceResult, unsigned int Input1,
-           unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(Input1, Input2,
-                                                               std::minus<>());
-}
-
-void testVsub2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsub2(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsub2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsub4(unsigned int *const DeviceResult, unsigned int Input1,
-           unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(Input1, Input2,
-                                                              std::minus<>());
-}
-
-void testVsub4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsub4(DeviceResult, TestCase_first_first_ct1,
-                    TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsub4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsubss2(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::short2>(
-      Input1, Input2, syclcompat::sub_sat());
-}
-
-void testVsubss2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsubss2(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsubss2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsubss4(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::char4>(
-      Input1, Input2, syclcompat::sub_sat());
-}
-
-void testVsubss4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsubss4(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsubss4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsubus2(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::ushort2>(
-      Input1, Input2, syclcompat::sub_sat());
-}
-
-void testVsubus2Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsubus2(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsubus2", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-void vsubus4(unsigned int *const DeviceResult, unsigned int Input1,
-             unsigned int Input2) {
-  *DeviceResult = syclcompat::vectorized_binary<sycl::uchar4>(
-      Input1, Input2, syclcompat::sub_sat());
-}
-
-void testVsubus4Cases(const vector<pair<Uint_pair, unsigned int>> &TestCases) {
-  sycl::queue q_ct1 = syclcompat::get_default_queue();
-  unsigned int *DeviceResult;
-  DeviceResult =
-      (unsigned int *)sycl::malloc_shared(sizeof(*DeviceResult), q_ct1);
-  try {
-    for (const auto &TestCase : TestCases) {
-      q_ct1.submit([&](sycl::handler &cgh) {
-        auto TestCase_first_first_ct1 = TestCase.first.first;
-        auto TestCase_first_second_ct2 = TestCase.first.second;
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-            [=](sycl::nd_item<3> item_ct1) {
-              vsubus4(DeviceResult, TestCase_first_first_ct1,
-                      TestCase_first_second_ct2);
-            });
-      });
-      q_ct1.wait_and_throw();
-      checkResult("__vsubus4", {TestCase.first.first, TestCase.first.second},
-                  TestCase.second, *DeviceResult);
-    }
-  } catch (...) {
-    // Intentionally left empty to make sure allocated memory can be freed.
-  }
-  sycl::free(DeviceResult, q_ct1);
-}
-
-int main() {
-  testVabs2Cases({
-      {214321, 214321},
-      {3, 3},
-      {2147483647, 2147418113}, // 7FFF,FFFF-->7FFF,0001
-      {0, 0},
-      {4294967295, 65537}, // FFFF,FFFF-->0001,0001
-  });
-  testVabs4Cases({
-      {214321, 214321},
-      {3, 3},
-      {2147483647, 2130772225}, // 7F,FF,FF,FF-->7F,01,01,01
-      {0, 0},
-      {4294967295, 16843009}, // FF,FF,FF,FF-->01,01,01,01
-  });
-  testVabsdiffs2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 2147239218},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVabsdiffs4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 2130986546},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVabsdiffu2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 2147269326},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVabsdiffu4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 2147269326},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVabsss2Cases({
-      {214321, 214321},
-      {3, 3},
-      {2147483647, 2147418113},
-      {0, 0},
-      {4294967295, 65537},
-  });
-  testVabsss4Cases({
-      {214321, 214321},
-      {3, 3},
-      {2147483647, 2130772225},
-      {0, 0},
-      {4294967295, 16843009},
-  });
-  testVadd2Cases({
-      {{4, 3}, 7},
-      {{214321, 2147483647}, 2147632432},
-      {{4294967295, 2147483647}, 2147418110},
-      {{4294967295, 4294967295}, 4294901758},
-      {{3, 4}, 7},
-  });
-  testVadd4Cases({
-      {{4, 3}, 7},
-      {{214321, 2147483647}, 2130854960},
-      {{4294967295, 2147483647}, 2130640638},
-      {{4294967295, 4294967295}, 4278124286},
-      {{3, 4}, 7},
-  });
-  testVaddss2Cases({
-      {{4, 3}, 7},
-      {{214321, 2147483647}, 2147435824}, // 3,4531+7FFF,FFFF-->7FFF,4530
-      {{4294967295, 2147483647}, 2147418110},
-      {{4294967295, 4294967295}, 4294901758},
-      {{3, 4}, 7},
-  });
-  testVaddss4Cases({
-      {{4, 3}, 7},
-      {{214321, 2147483647}, 2130854960},
-      {{4294967295, 2147483647}, 2130640638},
-      {{4294967295, 4294967295}, 4278124286},
-      {{3, 4}, 7},
-  });
-  testVaddus2Cases({
-      {{4, 3}, 7},
-      {{214321, 2147483647}, 2147680255},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 7},
-  });
-  testVaddus4Cases({
-      {{4, 3}, 7},
-      {{214321, 2147483647}, 2147483647},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 7},
-  });
-  testVavgs2Cases({
-      {{4, 3}, 4},
-      {{214321, 2147483647}, 1073816216},
-      {{4294967295, 2147483647}, 1073741823},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4},
-  });
-  testVavgs4Cases({
-      {{4, 3}, 4},
-      {{214321, 2147483647}, 1073816088},
-      {{4294967295, 2147483647}, 1073741823},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4},
-  });
-  testVavgu2Cases({
-      {{4, 3}, 4},
-      {{214321, 2147483647}, 1073848984},
-      {{4294967295, 2147483647}, 3221225471},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4},
-  });
-  testVavgu4Cases({
-      {{4, 3}, 4},
-      {{214321, 2147483647}, 1082237592},
-      {{4294967295, 2147483647}, 3221225471},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4},
-  });
-  testVcmpeq2Cases({
-      {{4, 3}, 4294901760},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 65535},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294901760},
-  });
-  testVcmpeq4Cases({
-      {{4, 3}, 4294967040},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 16777215},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294967040},
-  });
-  testVcmpges2Cases({
-      {{4, 3}, 4294967295},
-      {{214321, 2147483647}, 65535},
-      {{4294967295, 2147483647}, 65535},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294901760},
-  });
-  testVcmpges4Cases({
-      {{4, 3}, 4294967295},
-      {{214321, 2147483647}, 16777215},
-      {{4294967295, 2147483647}, 16777215},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294967040},
-  });
-  testVcmpgeu2Cases({
-      {{4, 3}, 4294967295},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294901760},
-  });
-  testVcmpgeu4Cases({
-      {{4, 3}, 4294967295},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294967040},
-  });
-  testVcmpgts2Cases({
-      {{4, 3}, 65535},
-      {{214321, 2147483647}, 65535},
-      {{4294967295, 2147483647}, 0},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  testVcmpgts4Cases({
-      {{4, 3}, 255},
-      {{214321, 2147483647}, 16777215},
-      {{4294967295, 2147483647}, 0},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  testVcmpgtu2Cases({
-      {{4, 3}, 65535},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 4294901760},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  testVcmpgtu4Cases({
-      {{4, 3}, 255},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 4278190080},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  testVcmples2Cases({
-      {{4, 3}, 4294901760},
-      {{214321, 2147483647}, 4294901760},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294967295},
-  });
-  testVcmples4Cases({
-      {{4, 3}, 4294967040},
-      {{214321, 2147483647}, 4278190080},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294967295},
-  });
-  testVcmpleu2Cases({
-      {{4, 3}, 4294901760},
-      {{214321, 2147483647}, 4294967295},
-      {{4294967295, 2147483647}, 65535},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294967295},
-  });
-  testVcmpleu4Cases({
-      {{4, 3}, 4294967040},
-      {{214321, 2147483647}, 4294967295},
-      {{4294967295, 2147483647}, 16777215},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4294967295},
-  });
-  testVcmplts2Cases({
-      {{4, 3}, 0},
-      {{214321, 2147483647}, 4294901760},
-      {{4294967295, 2147483647}, 4294901760},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 65535},
-  });
-  testVcmplts4Cases({
-      {{4, 3}, 0},
-      {{214321, 2147483647}, 4278190080},
-      {{4294967295, 2147483647}, 4278190080},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 255},
-  });
-  testVcmpltu2Cases({
-      {{4, 3}, 0},
-      {{214321, 2147483647}, 4294967295},
-      {{4294967295, 2147483647}, 0},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 65535},
-  });
-  testVcmpltu4Cases({
-      {{4, 3}, 0},
-      {{214321, 2147483647}, 4294967295},
-      {{4294967295, 2147483647}, 0},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 255},
-  });
-  testVcmpne2Cases({
-      {{4, 3}, 65535},
-      {{214321, 2147483647}, 4294967295},
-      {{4294967295, 2147483647}, 4294901760},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 65535},
-  });
-  testVcmpne4Cases({
-      {{4, 3}, 255},
-      {{214321, 2147483647}, 4294967295},
-      {{4294967295, 2147483647}, 4278190080},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 255},
-  });
-  testVhaddu2Cases({
-      {{4, 3}, 3},
-      {{214321, 2147483647}, 1073848984},
-      {{4294967295, 2147483647}, 3221225471},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 3},
-  });
-  testVhaddu4Cases({
-      {{4, 3}, 3},
-      {{214321, 2147483647}, 1065460376},
-      {{4294967295, 2147483647}, 3221225471},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 3},
-  });
-  testVmaxs2Cases({
-      {{4, 3}, 4},
-      {{214321, 2147483647}, 2147435825},
-      {{4294967295, 2147483647}, 2147483647},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4},
-  });
-  testVmaxs4Cases({
-      {{4, 3}, 4},
-      {{214321, 2147483647}, 2130920753},
-      {{4294967295, 2147483647}, 2147483647},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4},
-  });
-  testVmaxu2Cases({
-      {{4, 3}, 4},
-      {{214321, 2147483647}, 2147483647},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4},
-  });
-  testVmaxu4Cases({
-      {{4, 3}, 4},
-      {{214321, 2147483647}, 2147483647},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 4},
-  });
-  testVmins2Cases({
-      {{4, 3}, 3},
-      {{214321, 2147483647}, 262143},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 3},
-  });
-  testVmins4Cases({
-      {{4, 3}, 3},
-      {{214321, 2147483647}, 16777215},
-      {{4294967295, 2147483647}, 4294967295},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 3},
-  });
-  testVminu2Cases({
-      {{4, 3}, 3},
-      {{214321, 2147483647}, 214321},
-      {{4294967295, 2147483647}, 2147483647},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 3},
-  });
-  testVminu4Cases({
-      {{4, 3}, 3},
-      {{214321, 2147483647}, 214321},
-      {{4294967295, 2147483647}, 2147483647},
-      {{4294967295, 4294967295}, 4294967295},
-      {{3, 4}, 3},
-  });
-  testVneg2Cases({
-      {214321, 4294818511},
-      {3, 65533},
-      {2147483647, 2147549185},
-      {0, 0},
-      {4294967295, 65537},
-  });
-  testVneg4Cases({
-      {214321, 16628687},
-      {3, 253},
-      {2147483647, 2164326657},
-      {0, 0},
-      {4294967295, 16843009},
-  });
-  testVnegss2Cases({
-      {214321, 4294818511},
-      {3, 65533},
-      {2147483647, 2147549185},
-      {0, 0},
-      {4294967295, 65537},
-  });
-  testVnegss4Cases({
-      {214321, 16628687},
-      {3, 253},
-      {2147483647, 2164326657},
-      {0, 0},
-      {4294967295, 16843009},
-  });
-  testVsads2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 50478},
-      {{4294967295, 2147483647}, 32768},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVsads4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 251},
-      {{4294967295, 2147483647}, 128},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVsadu2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 80586},
-      {{4294967295, 2147483647}, 32768},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVsadu4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 771},
-      {{4294967295, 2147483647}, 128},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVseteq2Cases({
-      {{4, 3}, 65536},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 1},
-      {{4294967295, 4294967295}, 65537},
-      {{3, 4}, 65536},
-  });
-  testVseteq4Cases({
-      {{4, 3}, 16843008},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 65793},
-      {{4294967295, 4294967295}, 16843009},
-      {{3, 4}, 16843008},
-  });
-  testVsetges2Cases({
-      {{4, 3}, 65537},
-      {{214321, 2147483647}, 1},
-      {{4294967295, 2147483647}, 1},
-      {{4294967295, 4294967295}, 65537},
-      {{3, 4}, 65536},
-  });
-  testVsetges4Cases({
-      {{4, 3}, 16843009},
-      {{214321, 2147483647}, 65793},
-      {{4294967295, 2147483647}, 65793},
-      {{4294967295, 4294967295}, 16843009},
-      {{3, 4}, 16843008},
-  });
-  testVsetgeu2Cases({
-      {{4, 3}, 65537},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 65537},
-      {{4294967295, 4294967295}, 65537},
-      {{3, 4}, 65536},
-  });
-  testVsetgeu4Cases({
-      {{4, 3}, 16843009},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 16843009},
-      {{4294967295, 4294967295}, 16843009},
-      {{3, 4}, 16843008},
-  });
-  testVsetgts2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 1},
-      {{4294967295, 2147483647}, 0},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  testVsetgts4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 65793},
-      {{4294967295, 2147483647}, 0},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  testVsetgtu2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 65536},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  testVsetgtu4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 16777216},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  testVsetles2Cases({
-      {{4, 3}, 65536},
-      {{214321, 2147483647}, 65536},
-      {{4294967295, 2147483647}, 65537},
-      {{4294967295, 4294967295}, 65537},
-      {{3, 4}, 65537},
-  });
-  testVsetles4Cases({
-      {{4, 3}, 16843008},
-      {{214321, 2147483647}, 16777216},
-      {{4294967295, 2147483647}, 16843009},
-      {{4294967295, 4294967295}, 16843009},
-      {{3, 4}, 16843009},
-  });
-  testVsetleu2Cases({
-      {{4, 3}, 65536},
-      {{214321, 2147483647}, 65537},
-      {{4294967295, 2147483647}, 1},
-      {{4294967295, 4294967295}, 65537},
-      {{3, 4}, 65537},
-  });
-  testVsetleu4Cases({
-      {{4, 3}, 16843008},
-      {{214321, 2147483647}, 16843009},
-      {{4294967295, 2147483647}, 65793},
-      {{4294967295, 4294967295}, 16843009},
-      {{3, 4}, 16843009},
-  });
-  testVsetlts2Cases({
-      {{4, 3}, 0},
-      {{214321, 2147483647}, 65536},
-      {{4294967295, 2147483647}, 65536},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVsetlts4Cases({
-      {{4, 3}, 0},
-      {{214321, 2147483647}, 16777216},
-      {{4294967295, 2147483647}, 16777216},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVsetltu2Cases({
-      {{4, 3}, 0},
-      {{214321, 2147483647}, 65537},
-      {{4294967295, 2147483647}, 0},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVsetltu4Cases({
-      {{4, 3}, 0},
-      {{214321, 2147483647}, 16843009},
-      {{4294967295, 2147483647}, 0},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVsetne2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 65537},
-      {{4294967295, 2147483647}, 65536},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVsetne4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 16843009},
-      {{4294967295, 2147483647}, 16777216},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 1},
-  });
-  testVsub2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 2147763506},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 65535},
-  });
-  testVsub4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 2164540978},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 255},
-  });
-  testVsubss2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 2147763506},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 65535},
-  });
-  testVsubss4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 2164540978},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 255},
-  });
-  testVsubus2Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  testVsubus4Cases({
-      {{4, 3}, 1},
-      {{214321, 2147483647}, 0},
-      {{4294967295, 2147483647}, 2147483648},
-      {{4294967295, 4294967295}, 0},
-      {{3, 4}, 0},
-  });
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_extend.cpp b/sycl/test-e2e/syclcompat/math/math_extend.cpp
deleted file mode 100644
index 1482a33782a75..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_extend.cpp
+++ /dev/null
@@ -1,248 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_extend.cpp
- *
- *  Description:
- *    math extend helpers tests
- **************************************************************************/
-
-// ===----------- math_extend_func.cpp ---------- -*- C++ -* --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <cmath>
-#include <cstdint>
-#include <limits>
-#include <stdio.h>
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/math.hpp>
-#include <syclcompat/memory.hpp>
-
-#define CHECK(S, REF)                                                          \
-  {                                                                            \
-    ++test_id;                                                                 \
-    auto ret = S;                                                              \
-    if (ret != REF) {                                                          \
-      errc = test_id;                                                          \
-    }                                                                          \
-  }
-
-const auto INT32MAX = std::numeric_limits<int32_t>::max();
-const auto INT32MIN = std::numeric_limits<int32_t>::min();
-const auto UINT32MAX = std::numeric_limits<uint32_t>::max();
-const auto UINT32MIN = std::numeric_limits<uint32_t>::min();
-const int b = 4, c = 5, d = 6;
-
-int vadd() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_add<int32_t>(3, 4), 7);
-  CHECK(syclcompat::extend_add<uint32_t>(b, c), 9);
-  CHECK(syclcompat::extend_add_sat<int32_t>(b, INT32MAX), INT32MAX);
-  CHECK(syclcompat::extend_add_sat<uint32_t>(UINT32MAX, INT32MAX), UINT32MAX);
-  CHECK(syclcompat::extend_add_sat<int32_t>(b, -20, d, sycl::plus<>()), -10);
-  CHECK(syclcompat::extend_add_sat<int32_t>(b, c, -20, sycl::minimum<>()), -20);
-  CHECK(syclcompat::extend_add_sat<int32_t>(b, (-33), 9, sycl::maximum<>()), 9);
-
-  return errc;
-}
-
-int vsub() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_sub<int32_t>(3, 4), -1);
-  CHECK(syclcompat::extend_sub<uint32_t>(c, b), 1);
-  CHECK(syclcompat::extend_sub_sat<int32_t>(10, INT32MIN), INT32MAX);
-  CHECK(syclcompat::extend_sub_sat<uint32_t>(UINT32MIN, 1), UINT32MIN);
-  CHECK(syclcompat::extend_sub_sat<int32_t>(b, -20, d, sycl::plus<>()), 30);
-  CHECK(syclcompat::extend_sub_sat<int32_t>(b, c, -20, sycl::minimum<>()), -20);
-  CHECK(syclcompat::extend_sub_sat<int32_t>(b, (-33), 9, sycl::maximum<>()),
-        37);
-
-  return errc;
-}
-
-int vabsdiff() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_absdiff<int32_t>(3, 4), 1);
-  CHECK(syclcompat::extend_absdiff<uint32_t>(c, b), 1);
-  CHECK(syclcompat::extend_absdiff_sat<int32_t>(10, INT32MIN), INT32MAX);
-  CHECK(syclcompat::extend_absdiff_sat<uint32_t>(UINT32MIN, 1), 1);
-  CHECK(syclcompat::extend_absdiff_sat<int32_t>(b, -20, d, sycl::plus<>()), 30);
-  CHECK(syclcompat::extend_absdiff_sat<int32_t>(b, c, -20, sycl::minimum<>()),
-        -20);
-  CHECK(syclcompat::extend_absdiff_sat<int32_t>(b, (-33), 9, sycl::maximum<>()),
-        37);
-
-  return errc;
-}
-
-int vmin() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_min<int32_t>(3, 4), 3);
-  CHECK(syclcompat::extend_min<uint32_t>(c, b), 4);
-  CHECK(syclcompat::extend_min_sat<int32_t>(UINT32MAX, 1), 1);
-  CHECK(syclcompat::extend_min_sat<uint32_t>(10, (-1)), 0);
-  CHECK(syclcompat::extend_min_sat<int32_t>(b, -20, d, sycl::plus<>()), -14);
-  CHECK(syclcompat::extend_min_sat<int32_t>(b, c, -20, sycl::minimum<>()), -20);
-  CHECK(syclcompat::extend_min_sat<int32_t>(b, (-33), 9, sycl::maximum<>()), 9);
-
-  return errc;
-}
-
-int vmax() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_max<int32_t>(3, 4), 4);
-  CHECK(syclcompat::extend_max<uint32_t>(c, b), 5);
-  CHECK(syclcompat::extend_max_sat<int32_t>(UINT32MAX, 1), INT32MAX);
-  CHECK(syclcompat::extend_max_sat<uint32_t>(UINT32MAX, 1), UINT32MAX);
-  CHECK(syclcompat::extend_max_sat<int32_t>(b, -20, d, sycl::plus<>()), 10);
-  CHECK(syclcompat::extend_max_sat<int32_t>(b, c, -20, sycl::minimum<>()), -20);
-  CHECK(syclcompat::extend_max_sat<int32_t>(b, (-33), 9, sycl::maximum<>()), 9);
-
-  return errc;
-}
-
-template <typename Tp> struct scale {
-  Tp operator()(Tp val, Tp scaler) { return val * scaler; }
-};
-
-template <typename Tp> struct noop {
-  Tp operator()(Tp val, Tp /*scaler*/) { return val; }
-};
-
-int shl_clamp() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_shl_clamp<int32_t>(3, 4), 48);
-  CHECK(syclcompat::extend_shl_clamp<int32_t>(6, 33), 0);
-  CHECK(syclcompat::extend_shl_clamp<int32_t>(3, 4, 4, scale<int32_t>()), 192);
-  CHECK(syclcompat::extend_shl_clamp<int32_t>(3, 4, 4, noop<int32_t>()), 48);
-  CHECK(syclcompat::extend_shl_sat_clamp<int8_t>(9, 5), 127);
-  CHECK(syclcompat::extend_shl_sat_clamp<int8_t>(-9, 5), -128);
-  CHECK(syclcompat::extend_shl_sat_clamp<int8_t>(9, 5, -1, scale<int8_t>()),
-        -127);
-  CHECK(syclcompat::extend_shl_sat_clamp<int8_t>(9, 5, -1, noop<int8_t>()),
-        127);
-
-  return errc;
-}
-
-int shl_wrap() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_shl_wrap<int32_t>(3, 4), 48);
-  CHECK(syclcompat::extend_shl_wrap<int32_t>(6, 32), 6);
-  CHECK(syclcompat::extend_shl_wrap<int32_t>(6, 33), 12);
-  CHECK(syclcompat::extend_shl_wrap<int32_t>(6, 64), 6);
-  CHECK(syclcompat::extend_shl_wrap<int32_t>(3, 4, 4, scale<int32_t>()), 192);
-  CHECK(syclcompat::extend_shl_wrap<int32_t>(6, 32, 4, noop<int32_t>()), 6);
-  CHECK(syclcompat::extend_shl_sat_wrap<int8_t>(9, 5), 127);
-  CHECK(syclcompat::extend_shl_sat_wrap<int8_t>(-9, 5), -128);
-  CHECK(syclcompat::extend_shl_sat_wrap<int8_t>(9, 5, -1, scale<int8_t>()),
-        -127);
-  CHECK(syclcompat::extend_shl_sat_wrap<int8_t>(9, 5, -1, noop<int8_t>()), 127);
-
-  return errc;
-}
-
-int shr_clamp() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_shr_clamp<int32_t>(128, 5), 4);
-  CHECK(syclcompat::extend_shr_clamp<int32_t>(INT32MAX, 33), 0);
-  CHECK(syclcompat::extend_shr_clamp<int32_t>(128, 5, 4, scale<int32_t>()), 16);
-  CHECK(syclcompat::extend_shr_clamp<int32_t>(128, 5, 4, noop<int32_t>()), 4);
-  CHECK(syclcompat::extend_shr_sat_clamp<int8_t>(512, 1), 127);
-  CHECK(syclcompat::extend_shr_sat_clamp<int8_t>(-512, 1), -128);
-  CHECK(syclcompat::extend_shr_sat_clamp<int8_t>(512, 1, -1, scale<int8_t>()),
-        -127);
-  CHECK(syclcompat::extend_shr_sat_clamp<int8_t>(512, 1, -1, noop<int8_t>()),
-        127);
-
-  return errc;
-}
-
-int shr_wrap() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_shr_wrap<int32_t>(128, 5), 4);
-  CHECK(syclcompat::extend_shr_wrap<int32_t>(128, 32), 128);
-  CHECK(syclcompat::extend_shr_wrap<int32_t>(128, 33), 64);
-  CHECK(syclcompat::extend_shr_wrap<int32_t>(128, 64), 128);
-  CHECK(syclcompat::extend_shr_wrap<int32_t>(128, 5, 4, scale<int32_t>()), 16);
-  CHECK(syclcompat::extend_shr_wrap<int32_t>(128, 5, 4, noop<int32_t>()), 4);
-  CHECK(syclcompat::extend_shr_sat_wrap<int8_t>(512, 1), 127);
-  CHECK(syclcompat::extend_shr_sat_wrap<int8_t>(-512, 1), -128);
-  CHECK(syclcompat::extend_shr_sat_wrap<int8_t>(512, 1, -1, scale<int8_t>()),
-        -127);
-  CHECK(syclcompat::extend_shr_sat_wrap<int8_t>(512, 1, -1, noop<int8_t>()),
-        127);
-
-  return errc;
-}
-
-template <auto F> void test_fn(sycl::queue q, int *ec) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  q.submit([&](sycl::handler &cgh) {
-    cgh.single_task([=]() {
-      auto res = F();
-      if(res != 0) *ec = res;
-    });
-  });
-  int ec_h{};
-  syclcompat::memcpy<int>(&ec_h, ec, 1, q);
-  q.wait_and_throw();
-
-  if (ec_h != 0) {
-    std::cout << "Test " << ec_h << " failed." << std::endl;
-    syclcompat::free(ec, q);
-    assert(false);
-  }
-}
-
-int main() {
-  sycl::queue q = syclcompat::get_default_queue();
-  int *ec = syclcompat::malloc<int>(1, q);
-  syclcompat::fill<int>(ec, 0, 1, q);
-
-  test_fn<vadd>(q, ec);
-  test_fn<vsub>(q, ec);
-  test_fn<vabsdiff>(q, ec);
-  test_fn<vmin>(q, ec);
-  test_fn<vmax>(q, ec);
-  test_fn<shl_clamp>(q, ec);
-  test_fn<shl_wrap>(q, ec);
-  test_fn<shr_clamp>(q, ec);
-  test_fn<shr_wrap>(q, ec);
-
-  syclcompat::free(ec, q);
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_extend_v_2.cpp b/sycl/test-e2e/syclcompat/math/math_extend_v_2.cpp
deleted file mode 100644
index 1668064bcf719..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_extend_v_2.cpp
+++ /dev/null
@@ -1,383 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_extend_v_2.cpp
- *
- *  Description:
- *    math extend 2-vectorized helpers tests
- **************************************************************************/
-
-// ===------------- math_extend_vfunc_2.cpp ----------------*- C++ -*-----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <cmath>
-#include <cstdint>
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/math.hpp>
-#include <syclcompat/memory.hpp>
-
-#define CHECK(S, REF)                                                          \
-  {                                                                            \
-    ++test_id;                                                                 \
-    auto ret = S;                                                              \
-    if (ret != REF) {                                                          \
-      errc = test_id;                                                          \
-    }                                                                          \
-  }
-
-int vadd2() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vadd2<int32_t>(0x0001FFFF, 0x00010005, 0),
-        0x00020004);
-  CHECK(syclcompat::extend_vadd2<int32_t>(0x7FFF7FFF, 0x00010001, 0),
-        0x80008000);
-  CHECK(syclcompat::extend_vadd2_sat<int32_t>(0x7FFF7FFF, 0x00010001, 0),
-        0x7FFF7FFF);
-
-  CHECK(syclcompat::extend_vadd2<uint32_t>(0x00010002, 0x00020003, 0),
-        0x00030005);
-  CHECK(syclcompat::extend_vadd2<uint32_t>(0xFFFEFFFF, 0x00030003, 0),
-        0x00010002);
-  CHECK(syclcompat::extend_vadd2_sat<uint32_t>((uint32_t)0xFFFEFFFF,
-                                               (uint32_t)0x00030003, 0),
-        0xFFFFFFFF);
-  return errc;
-}
-
-int vsub2() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vsub2<int32_t>(0x0001FFFF, 0xFFFF0001, 0),
-        0x0002FFFE);
-  // Testing API & Saturated API with mixed types
-  CHECK(syclcompat::extend_vsub2<int32_t>((int32_t)0x7FFFFFFD,
-                                          (int32_t)0xFFFA7FFF, 0),
-        0x80057FFE);
-  CHECK(syclcompat::extend_vsub2<int32_t>((uint32_t)0x7FFFFFFD,
-                                          (uint32_t)0xFFFA7FFF, 0),
-        0x80057FFE);
-  CHECK(syclcompat::extend_vsub2<int32_t>((uint32_t)0x7FFFFFFD,
-                                          (int32_t)0xFFFA7FFF, 0),
-        0x80057FFE);
-  CHECK(syclcompat::extend_vsub2<int32_t>((int32_t)0x7FFFFFFD,
-                                          (uint32_t)0xFFFA7FFF, 0),
-        0x80057FFE);
-  CHECK(syclcompat::extend_vsub2_sat<int32_t>((int32_t)0x7FFFFFFD,
-                                              (int32_t)0xFFFA7FFF, 0),
-        0x7FFF8000);
-  CHECK(syclcompat::extend_vsub2_sat<int32_t>((uint32_t)0x7FFFFFFD,
-                                              (uint32_t)0xFFFA7FFF, 0),
-        0x80057FFE);
-  CHECK(syclcompat::extend_vsub2_sat<int32_t>((int32_t)0x7FFFFFFD,
-                                              (uint32_t)0xFFFA7FFF, 0),
-        0x80058000);
-  CHECK(syclcompat::extend_vsub2_sat<int32_t>((uint32_t)0x7FFFFFFD,
-                                              (int32_t)0xFFFA7FFF, 0),
-        0x7FFF7FFE);
-
-  CHECK(syclcompat::extend_vsub2<uint32_t>(0x0002000B, 0x0001000A, 0),
-        0x00010001);
-  CHECK(syclcompat::extend_vsub2<uint32_t>((uint32_t)0x00010001,
-                                           (uint32_t)0x0002FFFF, 0),
-        0xFFFF0002);
-  CHECK(syclcompat::extend_vsub2<uint32_t>((int32_t)0x00010001,
-                                           (int32_t)0x0002FFFF, 0),
-        0xFFFF0002);
-  CHECK(syclcompat::extend_vsub2_sat<uint32_t>((uint32_t)0x00010001,
-                                               (uint32_t)0x0002FFFF, 0),
-        0x00000000);
-  CHECK(syclcompat::extend_vsub2_sat<uint32_t>((int32_t)0x00010001,
-                                               (int32_t)0x0002FFFF, 0),
-        0x00000002);
-
-  return errc;
-}
-
-int vadd2_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vadd2_add<int32_t>(0x00010002, 0x00030004, 1),
-        0x0000000B);
-  CHECK(syclcompat::extend_vadd2_add<int32_t>(0x0001FFFF, 0x0002FFFE, -1),
-        0xFFFFFFFF);
-  CHECK(syclcompat::extend_vadd2_add<int32_t>(0x00017FFF, 0x00017FFF, 1),
-        0x00010001);
-
-  CHECK(syclcompat::extend_vadd2_add<uint32_t>(0x00010002, 0x00030004, 1),
-        0x0000000B);
-  CHECK(syclcompat::extend_vadd2_add<uint32_t>((uint32_t)0x0001FFFF,
-                                               (uint32_t)0x0002FFFF, 1),
-        0x00020002);
-  CHECK(syclcompat::extend_vadd2_add<uint32_t>(0x0001FFFF, 0x0002FFFF, 1),
-        0x00000002);
-
-  return errc;
-}
-
-int vsub2_add() {
-  int errc{};
-  int test_id{};
-  // Testing API with mixed types
-  CHECK(syclcompat::extend_vsub2_add<int32_t>((int32_t)0x0001FFFF,
-                                              (int32_t)0xFFFF0001, 1),
-        1);
-  CHECK(syclcompat::extend_vsub2_add<int32_t>((uint32_t)0x7FFFFFFD,
-                                              (uint32_t)0xFFFA7FFF, -1),
-        0x00000002);
-  CHECK(syclcompat::extend_vsub2_add<int32_t>((int32_t)0x7FFFFFFD,
-                                              (int32_t)0xFFFA7FFF, -1),
-        0x00000002);
-  CHECK(syclcompat::extend_vsub2_add<int32_t>((int32_t)0x7FFFFFFD,
-                                              (uint32_t)0xFFFA7FFF, -1),
-        0xFFFF0002);
-  CHECK(syclcompat::extend_vsub2_add<int32_t>((uint32_t)0x7FFFFFFD,
-                                              (int32_t)0xFFFA7FFF, -1),
-        0x00010002);
-
-  CHECK(syclcompat::extend_vsub2_add<uint32_t>(0x0002000B, 0x0001000A, 1),
-        0x00000003);
-  CHECK(syclcompat::extend_vsub2_add<uint32_t>(0x00010001, 0x0002FFFF, 3),
-        0x00000004);
-
-  return errc;
-}
-
-int vabsdiff2() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vabsdiff2<int32_t>((int32_t)0xFFFF0001,
-                                              (int32_t)0x0003FFFF, 0),
-        0x00040002);
-  CHECK(syclcompat::extend_vabsdiff2<int32_t>((int32_t)0x80000002,
-                                              (int32_t)0x00010001, 0),
-        0x80010001);
-  CHECK(syclcompat::extend_vabsdiff2_sat<int32_t>((int32_t)0x80000002,
-                                                  (int32_t)0x00010001, 0),
-        0x7FFF0001);
-
-  CHECK(syclcompat::extend_vabsdiff2<uint32_t>(0x00010004, 0x00030002, 0),
-        0x00020002);
-  CHECK(syclcompat::extend_vabsdiff2<uint32_t>((uint32_t)0xFFFF0001,
-                                               (int32_t)0xFFFE0003, 0),
-        0x00010002);
-  CHECK(syclcompat::extend_vabsdiff2_sat<uint32_t>((uint32_t)0xFFFF0001,
-                                                   (int32_t)0xFFFE0003, 0),
-        0xFFFF0002);
-
-  return errc;
-}
-
-int vabsdiff2_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vabsdiff2_add<int32_t>((int32_t)0xFFFF0001,
-                                                  (int32_t)0x0003FFFF, -2),
-        0x00000004);
-
-  CHECK(syclcompat::extend_vabsdiff2_add<uint32_t>(0x000A000C, 0x000B000A, 1),
-        0x00000004);
-
-  return errc;
-}
-
-int vmin2() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vmin2<int32_t>((int32_t)0xFFFF0002, 0x00010001, 0),
-        (int32_t)0xFFFF0001);
-  CHECK(syclcompat::extend_vmin2_sat<int32_t>(0x0002FFF1, 0x0001FFF2, 0),
-        0x0001FFF1);
-
-  CHECK(syclcompat::extend_vmin2<uint32_t>(0x000A000D, 0x000B000C, 0),
-        0x000A000C);
-  CHECK(syclcompat::extend_vmin2_sat<uint32_t>(0x0002FFF1, 0x0001FFF2, 0),
-        0x00010000);
-
-  return errc;
-}
-
-int vmax2() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vmax2<int32_t>((int32_t)0xFFFF0002, 0x00010001, 0),
-        0x00010002);
-  CHECK(syclcompat::extend_vmax2_sat<int32_t>(0x80008000, 0x00018001, 0),
-        0x7FFF7FFF);
-
-  CHECK(syclcompat::extend_vmax2<uint32_t>(0x000A000D, 0x000B000C, 0),
-        0x000B000D);
-  CHECK(syclcompat::extend_vmax2_sat<uint32_t>(0x0002FFF1, 0x0001FFF2, 0),
-        0x00020000);
-
-  return errc;
-}
-
-int vmin2_vmax2_add() {
-  int errc{};
-  int test_id{};
-  CHECK(
-      syclcompat::extend_vmin2_add<int32_t>((int32_t)0xFFFF0002, 0x00010001, 2),
-      0x00000002);
-  CHECK(syclcompat::extend_vmin2_add<uint32_t>(0x000A000D, 0x000B000C, 2),
-        0x00000018);
-
-  CHECK(syclcompat::extend_vmax2_add<int32_t>((int32_t)0xFFFF0002, 0x00010001,
-                                              -2),
-        0x00000001);
-  CHECK(syclcompat::extend_vmax2_add<uint32_t>(0x000A000D, 0x000B000C, 2),
-        0x0000001A);
-
-  return errc;
-}
-
-int vavrg2() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vavrg2<int32_t>((int32_t)0xFFFFFFF6, 0x0005FFFA, 0),
-        0x0002FFF8);
-  CHECK(syclcompat::extend_vavrg2_sat<int32_t>((int32_t)0xFFFFFFF6, 0x0005FFFA,
-                                               0),
-        0x0002FFF8);
-
-  CHECK(syclcompat::extend_vavrg2<uint32_t>(0x00010006, 0x00030001, 0),
-        0x00020004);
-  CHECK(syclcompat::extend_vavrg2_sat<uint32_t>(0x00010006, 0x00030001, 0),
-        0x00020004);
-
-  return errc;
-}
-
-int vavrg2_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vavrg2_add<int32_t>((int32_t)0xFFFFFFF6, 0x0005FFFA,
-                                               -2),
-        0xFFFFFFF8);
-
-  CHECK(syclcompat::extend_vavrg2_add<uint32_t>(0x00010006, 0x00030002, 2),
-        0x00000008);
-
-  return errc;
-}
-
-int vcompare2() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vcompare2(0x0002FFFF, 0x0001FFFF, std::greater<>()),
-        (unsigned)0x00010000);
-  CHECK(syclcompat::extend_vcompare2((uint32_t)0x0002FFFF, (int32_t)0x0001FFFF,
-                                     std::greater<>()),
-        (unsigned)0x00010001);
-  CHECK(syclcompat::extend_vcompare2((int32_t)0x0002FFFF, (uint32_t)0x0001FFFF,
-                                     std::greater<>()),
-        (unsigned)0x00010000);
-
-  CHECK(syclcompat::extend_vcompare2(0x0002FFFF, 0x0001FFFF, std::less<>()),
-        (unsigned)0x00000000);
-  CHECK(syclcompat::extend_vcompare2(0x0002FFFF, 0x0002FFFF,
-                                     std::greater_equal<>()),
-        (unsigned)0x00010001);
-  CHECK(
-      syclcompat::extend_vcompare2(0x0002FFFF, 0x0001FFFF, std::less_equal<>()),
-      (unsigned)0x00000001);
-  CHECK(syclcompat::extend_vcompare2(0xFFFE0002, 0xFFFF0002, std::equal_to<>()),
-        (unsigned)0x00000001);
-  CHECK(syclcompat::extend_vcompare2(0xFFFE0002, 0xFFFF0002,
-                                     std::not_equal_to<>()),
-        (unsigned)0x00010000);
-
-  return errc;
-}
-
-int vcompare2_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vcompare2_add(0x0002FFFF, 0x0001FFFF, 1,
-                                         std::greater<>()),
-        (unsigned)0x00000002);
-  CHECK(syclcompat::extend_vcompare2_add(0x0002FFFF, 0x0001FFFF, 2,
-                                         std::less<>()),
-        (unsigned)0x00000002);
-  CHECK(syclcompat::extend_vcompare2_add(0x0002FFFF, 0x0002FFFF, 1,
-                                         std::greater_equal<>()),
-        (unsigned)0x00000003);
-  CHECK(syclcompat::extend_vcompare2_add(0x0002FFFF, 0x0001FFFF, 2,
-                                         std::less_equal<>()),
-        (unsigned)0x00000003);
-  CHECK(syclcompat::extend_vcompare2_add(0xFFFE0002, 0xFFFF0002, 0xFFFF,
-                                         std::equal_to<>()),
-        (unsigned)0x00010000);
-  CHECK(syclcompat::extend_vcompare2_add(0xFFFE0002, 0xFFFF0002, 0xFF,
-                                         std::not_equal_to<>()),
-        (unsigned)0x00000100);
-
-  return errc;
-}
-
-
-template <auto F> void test_fn(sycl::queue q, int *ec) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  q.submit([&](sycl::handler &cgh) {
-    cgh.single_task([=]() {
-      auto res = F();
-      if(res != 0) *ec = res;
-    });
-  });
-  int ec_h{};
-  syclcompat::memcpy<int>(&ec_h, ec, 1, q);
-  q.wait_and_throw();
-
-  if (ec_h != 0) {
-    std::cout << "Test " << ec_h << " failed." << std::endl;
-    syclcompat::free(ec, q);
-    assert(false);
-  }
-}
-
-int main() {
-  sycl::queue q = syclcompat::get_default_queue();
-  int *ec = syclcompat::malloc<int>(1, q);
-  syclcompat::fill<int>(ec, 0, 1, q);
-
-  test_fn<vadd2>(q, ec);
-  test_fn<vsub2>(q, ec);
-  test_fn<vadd2_add>(q, ec);
-  test_fn<vsub2_add>(q, ec);
-  test_fn<vabsdiff2>(q, ec);
-  test_fn<vabsdiff2_add>(q, ec);
-  test_fn<vmin2>(q, ec);
-  test_fn<vmax2>(q, ec);
-  test_fn<vmin2_vmax2_add>(q, ec);
-  test_fn<vavrg2>(q, ec);
-  test_fn<vavrg2_add>(q, ec);
-  test_fn<vcompare2>(q, ec);
-  test_fn<vcompare2_add>(q, ec);
-
-  syclcompat::free(ec, q);
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_extend_v_4.cpp b/sycl/test-e2e/syclcompat/math/math_extend_v_4.cpp
deleted file mode 100644
index e9ffaf92825f8..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_extend_v_4.cpp
+++ /dev/null
@@ -1,398 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_extend_v_4.cpp
- *
- *  Description:
- *    math extend 4-vectorized helpers tests
- **************************************************************************/
-
-// ===------------- math_extend_vfunc_4.cpp ----------------*- C++ -*-----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <cmath>
-#include <cstdint>
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/math.hpp>
-#include <syclcompat/memory.hpp>
-
-#define CHECK(S, REF)                                                          \
-  {                                                                            \
-    ++test_id;                                                                 \
-    auto ret = S;                                                              \
-    if (ret != REF) {                                                          \
-      errc = test_id;                                                          \
-    }                                                                          \
-  }
-
-int vadd4() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vadd4<int32_t>(0x0102FFFE, 0x01FF02FF, 0),
-        0x020101FD);
-  CHECK(syclcompat::extend_vadd4<int32_t>((int32_t)0x7E81FEFF,
-                                          (int32_t)0x02FD03FF, 0),
-        0x807E01FE);
-  CHECK(syclcompat::extend_vadd4<int32_t>((uint32_t)0x7E81FEFF,
-                                          (uint32_t)0x02FD03FF, 0),
-        0x807E01FE);
-  CHECK(syclcompat::extend_vadd4<int32_t>((uint32_t)0x7E81FEFF,
-                                          (int32_t)0x02FD03FF, 0),
-        0x807E01FE);
-  CHECK(syclcompat::extend_vadd4<int32_t>((int32_t)0x7E81FEFF,
-                                          (uint32_t)0x02FD03FF, 0),
-        0x807E01FE);
-  CHECK(syclcompat::extend_vadd4_sat<int32_t>((int32_t)0x7E81FEFF,
-                                              (int32_t)0x02FD03FF, 0),
-        0x7F8001FE);
-  CHECK(syclcompat::extend_vadd4_sat<int32_t>((uint32_t)0x7E81FEFF,
-                                              (uint32_t)0x02FD03FF, 0),
-        0x7F7F7F7F);
-  CHECK(syclcompat::extend_vadd4_sat<int32_t>((uint32_t)0x7E81FEFF,
-                                              (int32_t)0x02FD03FF, 0),
-        0x7F7E7F7F);
-  CHECK(syclcompat::extend_vadd4_sat<int32_t>((int32_t)0x7E81FEFF,
-                                              (uint32_t)0x02FD03FF, 0),
-        0x7F7E017F);
-
-  CHECK(syclcompat::extend_vadd4<uint32_t>(0x01020304, 0x0A0B0C0D, 0),
-        0x0B0D0F11);
-  CHECK(syclcompat::extend_vadd4<uint32_t>((uint32_t)0x000100FF,
-                                           (uint32_t)0x00FE0001, 0),
-        0x00FF0000);
-  CHECK(syclcompat::extend_vadd4_sat<uint32_t>((uint32_t)0x000100FF,
-                                               (uint32_t)0x00FE0001, 0),
-        0x00FF00FF);
-
-  return errc;
-}
-
-int vadd4_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vadd4_add<int32_t>(0x0102FFFE, 0x01FF02FF, 1),
-        0x00000002);
-  CHECK(syclcompat::extend_vadd4_add<int32_t>((int32_t)0x7E81FEFF,
-                                              (int32_t)0x02FD03FF, -1),
-        0xFFFFFFFC);
-  CHECK(syclcompat::extend_vadd4_add<int32_t>((uint32_t)0x7E81FEFF,
-                                              (uint32_t)0x02FD03FF, -1),
-        0x000004FC);
-  CHECK(syclcompat::extend_vadd4_add<int32_t>((uint32_t)0x7E81FEFF,
-                                              (int32_t)0x02FD03FF, -1),
-        0x000002FC);
-  CHECK(syclcompat::extend_vadd4_add<int32_t>((int32_t)0x7E81FEFF,
-                                              (uint32_t)0x02FD03FF, -1),
-        0x000001FC);
-
-  CHECK(syclcompat::extend_vadd4_add<uint32_t>(0x01020304, 0x01000100, 1),
-        0x0000000D);
-  CHECK(syclcompat::extend_vadd4_add<uint32_t>((uint32_t)0x000100FF,
-                                               (uint32_t)0x00FE0001, 1),
-        0x0000000200);
-
-  return errc;
-}
-
-int vsub4() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vsub4<int32_t>((int32_t)0x0102FFFF,
-                                          (int32_t)0x020101FE, 0),
-        0xFF01FE01);
-  CHECK(syclcompat::extend_vsub4<int32_t>((int32_t)0x01807F10, 0x0102FE10, 0),
-        0x007E8100);
-  CHECK(
-      syclcompat::extend_vsub4_sat<int32_t>((int32_t)0x01807F10, 0x0102FE10, 0),
-      0x00807F00);
-
-  CHECK(syclcompat::extend_vsub4<uint32_t>(0x02020C0B, 0x02010A0A, 0),
-        0x00010201);
-  CHECK(syclcompat::extend_vsub4<uint32_t>(0x01020304, 0x02040608, 0),
-        0xFFFEFDFC);
-  CHECK(syclcompat::extend_vsub4_sat<uint32_t>(0x01020304, 0x02040608, 0),
-        0x00000000);
-
-  return errc;
-}
-
-int vsub4_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vsub4_add<int32_t>((int32_t)0x0102FFFF,
-                                              (int32_t)0x020101FE, -1),
-        0xFFFFFFFE);
-  CHECK(
-      syclcompat::extend_vsub4_add<int32_t>((int32_t)0x01807F10, 0x0102FE10, 2),
-      0x00000001);
-
-  CHECK(syclcompat::extend_vsub4_add<uint32_t>(0x02020C0B, 0x02010A0A, 2),
-        0x00000006);
-  CHECK(syclcompat::extend_vsub4_add<uint32_t>(0x01020304, 0x02040608, 1),
-        0xFFFFFFF7);
-
-  CHECK(syclcompat::extend_vsub4_add<uint32_t>((uint32_t)0x01020304,
-                                               (uint32_t)0x02040608, 1),
-        0xFFFFFFF7);
-
-  return errc;
-}
-
-int vabsdiff4() {
-  int errc{};
-  int test_id{};
-  CHECK(
-      syclcompat::extend_vabsdiff4<int32_t>((int32_t)0xFF01FF02, 0x01FF02FF, 0),
-      0x02020303);
-  CHECK(syclcompat::extend_vabsdiff4<int32_t>((int32_t)0x8002007F,
-                                              (int32_t)0x01010080, 0),
-        0x810100FF);
-  CHECK(syclcompat::extend_vabsdiff4_sat<int32_t>((int32_t)0x8002007F,
-                                                  (int32_t)0x01010080, 0),
-        0x7F01007F);
-
-  CHECK(syclcompat::extend_vabsdiff4<uint32_t>(0x01020304, 0x04030201, 0),
-        0x03010103);
-  CHECK(syclcompat::extend_vabsdiff4<uint32_t>((uint32_t)0xFEFF0001,
-                                               (int32_t)0xF0FE0003, 0),
-        0x0E010002);
-  CHECK(syclcompat::extend_vabsdiff4_sat<uint32_t>((uint32_t)0xFEFF0001,
-                                                   (int32_t)0xF0FE0003, 0),
-        0xFFFF0002);
-
-  return errc;
-}
-
-int vabsdiff4_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vabsdiff4_add<int32_t>((int32_t)0xFF01FF02,
-                                                  0x01FF02FF, 1),
-        0x0000000B);
-  CHECK(syclcompat::extend_vabsdiff4_add<int32_t>((int32_t)0x8002007F,
-                                                  (int32_t)0x01010080, -1),
-        0x00000180);
-
-  CHECK(syclcompat::extend_vabsdiff4_add<uint32_t>(0x01020304, 0x04030201, 2),
-        0x0000000A);
-  CHECK(syclcompat::extend_vabsdiff4_add<uint32_t>((uint32_t)0xFEFF0001,
-                                                   (int32_t)0xF0FE0003, 1),
-        0x00000212);
-
-  return errc;
-}
-
-int vmin4() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vmin4<int32_t>((int32_t)0xFFFF0102,
-                                          (int32_t)0xFE010201, 0),
-        0xFEFF0101);
-
-  CHECK(syclcompat::extend_vmin4_sat<int32_t>(0x0102FF00, 0x0201FE00, 0),
-        0x0101FE00);
-
-  CHECK(syclcompat::extend_vmin4<uint32_t>(0x010A020D, 0x000B020C, 0),
-        0x000A020C);
-
-  CHECK(syclcompat::extend_vmin4_sat<uint32_t>(0x020201FF, 0x0201FFFE, 0),
-        0x02010000);
-
-  return errc;
-}
-
-int vmax4() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vmax4<int32_t>((int32_t)0xFFFF0102,
-                                          (int32_t)0xFE010201, 0),
-        0xFF010202);
-  CHECK(syclcompat::extend_vmax4_sat<int32_t>(0x0102FF00, 0x0201FE00, 0),
-        0x0202FF00);
-
-  CHECK(syclcompat::extend_vmax4<uint32_t>(0x010A020D, 0x000B020C, 0),
-        0x010B020D);
-  CHECK(syclcompat::extend_vmax4_sat<uint32_t>(0x020201FF, 0x0201FFFE, 0),
-        0x02020100);
-
-  return errc;
-}
-
-int vmin4_vmax4_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vmin4_add<int32_t>((int32_t)0xFFFF0102,
-                                              (int32_t)0xFE010201, -1),
-        0xFFFFFFFE);
-
-  CHECK(syclcompat::extend_vmin4_add<uint32_t>(0x010A020D, 0x000B020C, 1),
-        0x00000019);
-
-  CHECK(syclcompat::extend_vmax4_add<int32_t>((int32_t)0xFFFF0102,
-                                              (int32_t)0xFE010201, 2),
-        0x00000006);
-  CHECK(syclcompat::extend_vmax4_add<uint32_t>(0x010A020D, 0x000B020C, -1),
-        0x0000001A);
-
-  return errc;
-}
-
-int vavrg4() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vavrg4<int32_t>((int32_t)0xFF01FF01, 0x0505FF00, 0),
-        0x0203FF01);
-  CHECK(syclcompat::extend_vavrg4_sat<int32_t>((int32_t)0xFF01FF01, 0x0505FF00,
-                                               0),
-        0x0203FF01);
-
-  CHECK(syclcompat::extend_vavrg4<uint32_t>(0x00010106, (int32_t)0xFC050101, 0),
-        (int32_t)0xFE030104);
-  CHECK(syclcompat::extend_vavrg4_sat<uint32_t>(0x00010106, (int32_t)0xFC050101,
-                                                0),
-        (int32_t)0x00030104);
-
-  return errc;
-}
-
-int vavrg4_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vavrg4_add<int32_t>((int32_t)0xFF01FF01, 0x0505FF00,
-                                               1),
-        0x00000006);
-  CHECK(syclcompat::extend_vavrg4_add<int32_t>((int32_t)0xFF01FF01, 0x0505FF00,
-                                               -6),
-        0xFFFFFFFF);
-
-  CHECK(syclcompat::extend_vavrg4_add<uint32_t>(0x00010106, (int32_t)0xFC050101,
-                                                1),
-        (int32_t)0x00000007);
-
-  CHECK(syclcompat::extend_vavrg4_add<uint32_t>(0x00010106, (int32_t)0xFC050101,
-                                                -1),
-        (int32_t)0x00000005);
-
-  return errc;
-}
-
-int vcompare4() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vcompare4(0x0102FEFF, 0x01FFFFFE, std::greater<>()),
-        (unsigned)0x00010001);
-  CHECK(syclcompat::extend_vcompare4((uint32_t)0x0102FEFF, (int32_t)0x01FFFFFE,
-                                     std::greater<>()),
-        (unsigned)0x00010101);
-  CHECK(syclcompat::extend_vcompare4((int32_t)0x0102FEFF, (uint32_t)0x01FFFFFE,
-                                     std::greater<>()),
-        (unsigned)0x00000000);
-
-  CHECK(syclcompat::extend_vcompare4(0x0102FEFF, 0x01FFFFFE, std::less<>()),
-        (unsigned)0x00000100);
-  CHECK(syclcompat::extend_vcompare4(0x0102FEFF, 0x01FFFFFE,
-                                     std::greater_equal<>()),
-        (unsigned)0x01010001);
-  CHECK(
-      syclcompat::extend_vcompare4(0x0102FEFF, 0x01FFFFFE, std::less_equal<>()),
-      (unsigned)0x01000100);
-  CHECK(syclcompat::extend_vcompare4(0xFFFE0102, 0xFFFF0202, std::equal_to<>()),
-        (unsigned)0x01000001);
-  CHECK(syclcompat::extend_vcompare4(0xFFFE0102, 0xFFFF0202,
-                                     std::not_equal_to<>()),
-        (unsigned)0x00010100);
-
-  return errc;
-}
-
-int vcompare4_add() {
-  int errc{};
-  int test_id{};
-  CHECK(syclcompat::extend_vcompare4_add(0x0102FEFF, 0x01FFFFFE, 1,
-                                         std::greater<>()),
-        (unsigned)0x00000003);
-  CHECK(syclcompat::extend_vcompare4_add(0x0102FEFF, 0x01FFFFFE, 1,
-                                         std::less<>()),
-        (unsigned)0x00000002);
-  CHECK(syclcompat::extend_vcompare4_add(0x0102FEFF, 0x01FFFFFE, 2,
-                                         std::greater_equal<>()),
-        (unsigned)0x00000005);
-  CHECK(syclcompat::extend_vcompare4_add(0x0102FEFF, 0x01FFFFFE, 2,
-                                         std::less_equal<>()),
-        (unsigned)0x00000004);
-  CHECK(syclcompat::extend_vcompare4_add(0xFFFE0102, 0xFFFF0202, 0xFF,
-                                         std::equal_to<>()),
-        (unsigned)0x00000101);
-  CHECK(syclcompat::extend_vcompare4_add(0xFFFE0102, 0xFFFF0202, 0xFFFF,
-                                         std::not_equal_to<>()),
-        (unsigned)0x00010001);
-
-  return errc;
-}
-
-template <auto F> void test_fn(sycl::queue q, int *ec) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  q.submit([&](sycl::handler &cgh) {
-    cgh.single_task([=]() {
-      auto res = F();
-      if(res != 0) *ec = res;
-    });
-  });
-  q.wait_and_throw();
-
-  int ec_h{};
-  syclcompat::memcpy<int>(&ec_h, ec, 1, q);
-  if (ec_h != 0) {
-    std::cout << "Test " << ec_h << " failed." << std::endl;
-    syclcompat::free(ec, q);
-    assert(false);
-  }
-}
-
-
-int main() {
-  sycl::queue q = syclcompat::get_default_queue();
-  int *ec = syclcompat::malloc<int>(1, q);
-  syclcompat::fill<int>(ec, 0, 1, q);
-
-  test_fn<vadd4>(q, ec);
-  test_fn<vsub4>(q, ec);
-  test_fn<vadd4_add>(q, ec);
-  test_fn<vsub4_add>(q, ec);
-  test_fn<vabsdiff4>(q, ec);
-  test_fn<vabsdiff4_add>(q, ec);
-  test_fn<vmin4>(q, ec);
-  test_fn<vmax4>(q, ec);
-  test_fn<vmin4_vmax4_add>(q, ec);
-  test_fn<vavrg4>(q, ec);
-  test_fn<vavrg4_add>(q, ec);
-  test_fn<vcompare4>(q, ec);
-  test_fn<vcompare4_add>(q, ec);
-
-  syclcompat::free(ec, q);
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_fixt.hpp b/sycl/test-e2e/syclcompat/math/math_fixt.hpp
deleted file mode 100644
index 4647142da6c61..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_fixt.hpp
+++ /dev/null
@@ -1,298 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  math_fixt.hpp
- *
- *  Description:
- *     Fixtures and helpers for to tests the math functionalities
- **************************************************************************/
-
-#pragma once
-
-#include <type_traits>
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-#include "../common.hpp"
-
-template <typename Container, typename ValueT, typename = void>
-static constexpr bool contained_is_same_v = false;
-
-template <typename Container, typename ValueT>
-static constexpr bool contained_is_same_v<
-    Container, ValueT, std::void_t<typename Container::value_type>> =
-    std::is_same_v<typename Container::value_type, ValueT>;
-
-template <typename Container, typename = void>
-static constexpr bool contained_is_integral_v = false;
-
-template <typename Container>
-static constexpr bool contained_is_integral_v<
-    Container, std::void_t<typename Container::value_type>> =
-    std::is_integral_v<typename Container::value_type>;
-
-template <typename Container, typename = void>
-static constexpr bool contained_is_floating_point_v = false;
-
-template <typename Container>
-static constexpr bool contained_is_floating_point_v<
-    Container, std::void_t<typename Container::value_type>> =
-    syclcompat::is_floating_point_v<typename Container::value_type>;
-
-template <typename... Ts> struct container_common_type;
-
-template <template <typename, int> typename Container, typename T, typename U,
-          int Size>
-struct container_common_type<Container<T, Size>, Container<U, Size>> {
-  using type = Container<std::common_type_t<T, U>, Size>;
-};
-
-template <typename T, typename U> struct container_common_type<T, U> {
-  using type = std::common_type_t<T, U>;
-};
-
-template <typename T, typename U>
-using container_common_type_t = typename container_common_type<T, U>::type;
-
-template <typename ...ValueT> struct should_skip {
-  bool operator()(const sycl::device &dev) const {
-    if constexpr ((std::is_same_v<ValueT, double> || ...) ||
-                  (contained_is_same_v<ValueT, double> || ...)) {
-      if (!dev.has(sycl::aspect::fp64)) {
-        std::cout << "  sycl::aspect::fp64 not supported by the SYCL device."
-                  << std::endl;
-        return true;
-      }
-    }
-    if constexpr ((std::is_same_v<ValueT, sycl::half> || ...) ||
-                  (contained_is_same_v<ValueT, sycl::half> || ...)) {
-      if (!dev.has(sycl::aspect::fp16)) {
-        std::cout << "  sycl::aspect::fp16 not supported by the SYCL device."
-                  << std::endl;
-        return true;
-      }
-    }
-    return false;
-  }
-};
-
-#define CHECK(ResultT, RESULT, EXPECTED)                                       \
-  if constexpr (std::is_integral_v<ResultT>) {                                 \
-    assert(RESULT == EXPECTED);                                                \
-  } else if constexpr (contained_is_integral_v<ResultT>) {                     \
-    for (size_t i = 0; i < RESULT.size(); i++)                                 \
-      assert(RESULT[i] == EXPECTED[i]);                                        \
-  } else if constexpr (syclcompat::is_floating_point_v<ResultT>) {             \
-    if (syclcompat::detail::isnan(RESULT))                                     \
-      assert(syclcompat::detail::isnan(EXPECTED));                             \
-    else                                                                       \
-      assert(fabs(RESULT - EXPECTED) < ERROR_TOLERANCE);                       \
-  } else if constexpr (contained_is_floating_point_v<ResultT>) {               \
-    for (size_t i = 0; i < RESULT.size(); i++) {                               \
-      if (syclcompat::detail::isnan(RESULT[i])) {                              \
-        assert(syclcompat::detail::isnan(EXPECTED[i]));                        \
-      } else {                                                                 \
-        assert(fabs(RESULT[i] - EXPECTED[i]) < ERROR_TOLERANCE);               \
-      }                                                                        \
-    }                                                                          \
-  } else {                                                                     \
-    static_assert(0, "math_fixt.hpp should not have arrived here.");           \
-  }
-
-class OpTestLauncher {
-protected:
-  syclcompat::dim3 grid_;
-  syclcompat::dim3 threads_;
-  size_t data_size_;
-  bool skip_;
-
-public:
-  OpTestLauncher(const syclcompat::dim3 &grid, const syclcompat::dim3 &threads,
-                 const size_t data_size, const bool skip)
-      : grid_{grid}, threads_{threads}, data_size_{data_size}, skip_{skip} {}
-};
-
-// Templated ResultT to support both arithmetic and boolean operators
-template <typename ValueT, typename ValueU,
-          typename ResultT = container_common_type_t<ValueT, ValueU>>
-class BinaryOpTestLauncher : OpTestLauncher {
-protected:
-  ValueT *op1_;
-  ValueU *op2_;
-  ResultT res_h_, *res_;
-  bool *res_hi_;
-  bool *res_lo_;
-
-public:
-  BinaryOpTestLauncher(const syclcompat::dim3 &grid,
-                       const syclcompat::dim3 &threads,
-                       const size_t data_size = 1)
-      : OpTestLauncher{grid, threads, data_size,
-                       should_skip<ValueT, ValueU, ResultT>()(
-                           syclcompat::get_current_device())} {
-    if (skip_)
-      return;
-    op1_ = syclcompat::malloc<ValueT>(data_size);
-    op2_ = syclcompat::malloc<ValueU>(data_size);
-    res_ = syclcompat::malloc<ResultT>(data_size);
-    res_hi_ = syclcompat::malloc<bool>(1);
-    res_lo_ = syclcompat::malloc<bool>(1);
-  };
-
-  virtual ~BinaryOpTestLauncher() {
-    if (skip_)
-      return;
-    syclcompat::free(op1_);
-    syclcompat::free(op2_);
-    syclcompat::free(res_);
-    syclcompat::free(res_hi_);
-    syclcompat::free(res_lo_);
-  }
-
-  template <auto Kernel>
-  void launch_test(ValueT op1, ValueU op2, ResultT expected) {
-    if (skip_)
-      return;
-    syclcompat::memcpy<ValueT>(op1_, &op1, data_size_);
-    syclcompat::memcpy<ValueU>(op2_, &op2, data_size_);
-    syclcompat::launch<Kernel>(grid_, threads_, op1_, op2_, res_);
-    syclcompat::wait();
-    syclcompat::memcpy<ResultT>(&res_h_, res_, data_size_);
-
-    CHECK(ResultT, res_h_, expected);
-  };
-  template <auto Kernel>
-  void launch_test(ValueT op1, ValueU op2, ResultT expected, bool need_relu) {
-    if (skip_)
-      return;
-    syclcompat::memcpy<ValueT>(op1_, &op1, data_size_);
-    syclcompat::memcpy<ValueU>(op2_, &op2, data_size_);
-    syclcompat::launch<Kernel>(grid_, threads_, op1_, op2_, res_, need_relu);
-    syclcompat::wait();
-    syclcompat::memcpy<ResultT>(&res_h_, res_, data_size_);
-
-    CHECK(ResultT, res_h_, expected);
-  };
-  template <auto Kernel>
-  void launch_test(ValueT op1, ValueU op2, ResultT expected, bool expected_hi,
-                   bool expected_lo) {
-    if (skip_)
-      return;
-    syclcompat::memcpy<ValueT>(op1_, &op1, data_size_);
-    syclcompat::memcpy<ValueU>(op2_, &op2, data_size_);
-    syclcompat::launch<Kernel>(grid_, threads_, op1_, op2_, res_, res_hi_,
-                               res_lo_);
-    syclcompat::wait();
-    syclcompat::memcpy<ResultT>(&res_h_, res_, data_size_);
-    bool res_hi_h_, res_lo_h_;
-    syclcompat::memcpy<bool>(&res_hi_h_, res_hi_, 1);
-    syclcompat::memcpy<bool>(&res_lo_h_, res_lo_, 1);
-
-    CHECK(ResultT, res_h_, expected);
-    assert(res_hi_h_ == expected_hi);
-    assert(res_lo_h_ == expected_lo);
-  };
-};
-
-template <typename ValueT, typename ResultT = ValueT>
-class UnaryOpTestLauncher : OpTestLauncher {
-protected:
-  ValueT *op_;
-  ResultT res_h_, *res_;
-
-public:
-  UnaryOpTestLauncher(const syclcompat::dim3 &grid,
-                      const syclcompat::dim3 &threads,
-                      const size_t data_size = 1)
-      : OpTestLauncher{
-            grid, threads, data_size,
-            should_skip<ValueT, ResultT>()(syclcompat::get_current_device())} {
-    if (skip_)
-      return;
-    op_ = syclcompat::malloc<ValueT>(data_size);
-    res_ = syclcompat::malloc<ResultT>(data_size);
-  };
-
-  virtual ~UnaryOpTestLauncher() {
-    if (skip_)
-      return;
-    syclcompat::free(op_);
-    syclcompat::free(res_);
-  }
-
-  template <auto Kernel> void launch_test(ValueT op, ResultT expected) {
-    if (skip_)
-      return;
-    syclcompat::memcpy<ValueT>(op_, &op, data_size_);
-    syclcompat::launch<Kernel>(grid_, threads_, op_, res_);
-    syclcompat::wait();
-    syclcompat::memcpy<ResultT>(&res_h_, res_, data_size_);
-
-    CHECK(ResultT, res_h_, expected);
-  }
-};
-
-// Templated ResultT to support both arithmetic and boolean operators
-template <typename ValueT, typename ValueU, typename ValueV,
-          typename ResultT = std::common_type_t<ValueT, ValueU, ValueV>>
-class TernaryOpTestLauncher : OpTestLauncher {
-protected:
-  ValueT *op1_;
-  ValueU *op2_;
-  ValueV *op3_;
-  ResultT res_h_, *res_;
-
-public:
-  TernaryOpTestLauncher(const syclcompat::dim3 &grid,
-                        const syclcompat::dim3 &threads,
-                        const size_t data_size = 1)
-      : OpTestLauncher{grid, threads, data_size,
-                       should_skip<ValueT, ValueU, ValueV, ResultT>()(
-                           syclcompat::get_current_device())} {
-    if (skip_)
-      return;
-    op1_ = syclcompat::malloc<ValueT>(data_size);
-    op2_ = syclcompat::malloc<ValueU>(data_size);
-    op3_ = syclcompat::malloc<ValueV>(data_size);
-    res_ = syclcompat::malloc<ResultT>(data_size);
-  };
-
-  virtual ~TernaryOpTestLauncher() {
-    if (skip_)
-      return;
-    syclcompat::free(op1_);
-    syclcompat::free(op2_);
-    syclcompat::free(op3_);
-    syclcompat::free(res_);
-  }
-
-  template <auto Kernel>
-  void launch_test(ValueT op1, ValueU op2, ValueV op3, ResultT expected,
-                   bool need_relu = false) {
-    if (skip_)
-      return;
-    syclcompat::memcpy<ValueT>(op1_, &op1, data_size_);
-    syclcompat::memcpy<ValueU>(op2_, &op2, data_size_);
-    syclcompat::memcpy<ValueV>(op3_, &op3, data_size_);
-    syclcompat::launch<Kernel>(grid_, threads_, op1_, op2_, op3_, res_,
-                               need_relu);
-    syclcompat::wait();
-    syclcompat::memcpy<ResultT>(&res_h_, res_, data_size_);
-
-    CHECK(ResultT, res_h_, expected);
-  };
-};
diff --git a/sycl/test-e2e/syclcompat/math/math_funnelshift.cpp b/sycl/test-e2e/syclcompat/math/math_funnelshift.cpp
deleted file mode 100644
index d93ee115bae75..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_funnelshift.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_funnelshift.cpp
- *
- *  Description:
- *    math funnel helpers tests
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/math.hpp>
-#include <syclcompat/memory.hpp>
-
-void testFunnelShiftKernel(int *const TestResults) {
-  TestResults[0] = (syclcompat::funnelshift_l(0xAA000000, 0xBB, 8) == 0xBBAA);
-  TestResults[1] =
-      (syclcompat::funnelshift_lc(0xAA000000, 0xBB, 16) == 0xBBAA00);
-  TestResults[2] = (syclcompat::funnelshift_r(0xAA00, 0xBB, 8) == 0xBB0000AA);
-  TestResults[3] = (syclcompat::funnelshift_rc(0xAA0000, 0xBB, 16) == 0xBB00AA);
-}
-
-int main() {
-  constexpr int nTests = 4;
-
-  sycl::queue q = syclcompat::get_default_queue();
-  int *testResults = syclcompat::malloc<int>(nTests, q);
-  int *testResultsHost = syclcompat::malloc_host<int>(nTests, q);
-  syclcompat::fill<int>(testResults, 0, nTests, q);
-
-  q.submit([&](sycl::handler &cgh) {
-     cgh.parallel_for(
-         1, [=](sycl::item<1> it) { testFunnelShiftKernel(testResults); });
-   }).wait_and_throw();
-
-  syclcompat::memcpy<int>(testResultsHost, testResults, nTests, q);
-
-  for (int i = 0; i < nTests; i++) {
-    if (testResultsHost[i] == 0) {
-      std::cerr << "funnelshift test " << i << " failed" << std::endl;
-      return 1;
-    }
-  }
-  syclcompat::free(testResults, q);
-  syclcompat::free(testResultsHost, q);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_length_test.cpp b/sycl/test-e2e/syclcompat/math/math_length_test.cpp
deleted file mode 100644
index 24effadfa5398..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_length_test.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_length_test.cpp
- *
- *  Description:
- *    vector length tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilFastLengthTest.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <numeric>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat.hpp>
-
-#define MAX_LEN 5
-
-void compute_fast_length(float *d_A, size_t n, float *ans) {
-  *ans = syclcompat::fast_length(d_A, n);
-}
-
-void compute_length(float *d_A, size_t n, float *ans) {
-  *ans = syclcompat::length(d_A, n);
-}
-
-class LengthLauncher {
-protected:
-  float *data_;
-  float *result_;
-  float host_result_{0.0};
-
-public:
-  LengthLauncher() {
-    data_ = (float *)syclcompat::malloc(MAX_LEN * sizeof(float));
-    result_ = (float *)syclcompat::malloc(sizeof(float));
-  };
-  ~LengthLauncher() {
-    syclcompat::free(data_);
-    syclcompat::free(result_);
-  }
-
-  void check_result(std::vector<float> result) {
-    float sum =
-        std::inner_product(result.begin(), result.end(), result.begin(), 0.0f);
-    float diff = fabs(sqrtf(sum)) - host_result_;
-    assert(diff <= 1.e-5);
-  }
-
-  template <auto F> void launch(std::vector<float> vec) {
-    size_t n = vec.size();
-    syclcompat::memcpy(data_, vec.data(), sizeof(float) * n);
-    auto data = data_;
-    auto result = result_;
-    syclcompat::get_default_queue().single_task(
-        [data, result, n]() { F(data, n, result); });
-    syclcompat::memcpy(&host_result_, result_, sizeof(float));
-    check_result(vec);
-  }
-};
-
-void test_fast_length() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  auto launcher = LengthLauncher();
-  launcher.launch<compute_fast_length>(std::vector<float>{0.8970062715});
-  launcher.launch<compute_fast_length>(
-      std::vector<float>{0.8335529744, 0.7346600673});
-  launcher.launch<compute_fast_length>(
-      std::vector<float>{0.1658983906, 0.590226484, 0.4891553616});
-  launcher.launch<compute_fast_length>(std::vector<float>{
-      0.6041178723, 0.7760620605, 0.2944284976, 0.6851913766});
-  launcher.launch<compute_fast_length>(std::vector<float>{
-      0.6041178723, 0.7760620605, 0.2944284976, 0.6851913766, 0.6851913766});
-}
-
-void test_length() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  auto launcher = LengthLauncher();
-  launcher.launch<compute_length>(std::vector<float>{0.8970062715});
-  launcher.launch<compute_length>(
-      std::vector<float>{0.8335529744, 0.7346600673});
-  launcher.launch<compute_length>(
-      std::vector<float>{0.1658983906, 0.590226484, 0.4891553616});
-  launcher.launch<compute_length>(std::vector<float>{
-      0.6041178723, 0.7760620605, 0.2944284976, 0.6851913766});
-  launcher.launch<compute_length>(std::vector<float>{
-      0.6041178723, 0.7760620605, 0.2944284976, 0.6851913766, 0.6851913766});
-}
-
-int main() {
-  test_fast_length();
-  test_length();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
deleted file mode 100644
index 81590c7710aa3..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ /dev/null
@@ -1,393 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_ops.cpp
- *
- *  Description:
- *    tests for non-vectorized math helper functions
- **************************************************************************/
-
-// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
-
-// RUN: %{build} %{mathflags} -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: target-native_cpu
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20142
-
-#include <syclcompat/dims.hpp>
-#include <syclcompat/math.hpp>
-
-#include "../common.hpp"
-#include "math_fixt.hpp"
-
-template <typename ValueT, typename ValueU>
-inline void max_kernel(ValueT *a, ValueU *b,
-                       std::common_type_t<ValueT, ValueU> *r) {
-  *r = syclcompat::max<ValueT, ValueU>(*a, *b);
-}
-
-template <typename ValueT, typename ValueU = ValueT>
-void test_syclcompat_max() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const ValueT op1 = static_cast<ValueT>(5);
-  const ValueU op2 = static_cast<ValueU>(10);
-  const std::common_type_t<ValueT, ValueU> res = static_cast<ValueU>(10);
-
-  BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
-      .template launch_test<max_kernel<ValueT, ValueU>>(op1, op2, res);
-}
-
-template <typename ValueT, typename ValueU>
-inline void min_kernel(ValueT *a, ValueU *b,
-                       std::common_type_t<ValueT, ValueU> *r) {
-  *r = syclcompat::min<ValueT,ValueU>(*a, *b);
-}
-
-template <typename ValueT, typename ValueU = ValueT>
-void test_syclcompat_min() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const ValueT op1 = static_cast<ValueT>(5);
-  const ValueU op2 = static_cast<ValueU>(10);
-  const std::common_type_t<ValueT, ValueU> res =
-      static_cast<std::common_type_t<ValueT, ValueU>>(5);
-
-  BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
-      .template launch_test<min_kernel<ValueT, ValueU>>(op1, op2, res);
-}
-
-template <typename ValueT, typename ValueU>
-inline void fmin_nan_kernel(ValueT *a, ValueU *b,
-                            container_common_type_t<ValueT, ValueU> *r) {
-  *r = syclcompat::fmin_nan(*a, *b);
-}
-
-template <typename ValueT, typename ValueU = ValueT>
-void test_syclcompat_fmin_nan() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  using ValueTU = std::common_type_t<ValueT, ValueU>;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const ValueT op1 = static_cast<ValueT>(5);
-  const ValueU op2 = static_cast<ValueU>(10);
-  ValueU op3 = sycl::nan(static_cast<unsigned int>(0));
-
-  const ValueTU res =
-      static_cast<ValueTU>(5);
-
-  BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
-      .template launch_test<fmin_nan_kernel<ValueT, ValueU>>(op1, op2, res);
-
-  BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
-      .template launch_test<fmin_nan_kernel<ValueT, ValueU>>(op1, op3, op3);
-}
-
-template <template <typename T, int Dim> typename ContainerT, typename ValueT, typename ValueU = ValueT>
-void test_container_syclcompat_fmin_nan(){
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  using ValueTU = std::common_type_t<ValueT, ValueU>;
-  using ContT = ContainerT<ValueT, 2>;
-  using ContU = ContainerT<ValueU, 2>;
-  using ContTU = ContainerT<ValueTU, 2>;
-
-  const ContT op4 = {static_cast<ValueT>(5), static_cast<ValueT>(10)};
-  const ContU op5 = {static_cast<ValueU>(10), static_cast<ValueU>(5)};
-  const ContU op6 = {sycl::nan(static_cast<unsigned int>(0)), sycl::nan(static_cast<unsigned int>(0))};
-  const ContTU op6_res = {sycl::nan(static_cast<unsigned int>(0)), sycl::nan(static_cast<unsigned int>(0))};
-
-  const ContTU res2{static_cast<ValueTU>(5), static_cast<ValueTU>(5)};
-
-  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
-      .template launch_test<fmin_nan_kernel<ContT, ContU>>(op4, op5, res2);
-
-  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
-      .template launch_test<fmin_nan_kernel<ContT, ContU>>(op4, op6, op6_res);
-}
-
-template <typename ValueT, typename ValueU>
-inline void fmax_nan_kernel(ValueT *a, ValueU *b,
-                            container_common_type_t<ValueT, ValueU> *r) {
-  *r = syclcompat::fmax_nan(*a, *b);
-}
-
-template <typename ValueT, typename ValueU = ValueT>
-void test_syclcompat_fmax_nan() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  using ValueTU = std::common_type_t<ValueT, ValueU>;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  const ValueT op1 = static_cast<ValueT>(5);
-  const ValueU op2 = static_cast<ValueU>(10);
-  ValueU op3 = sycl::nan(static_cast<unsigned int>(0));
-
-  const ValueTU res =
-      static_cast<ValueTU>(10);
-
-  BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
-      .template launch_test<fmax_nan_kernel<ValueT, ValueU>>(op1, op2, res);
-
-  BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
-      .template launch_test<fmax_nan_kernel<ValueT, ValueU>>(op1, op3, op3);
-}
-
-template <template <typename T, int Dim> typename ContainerT, typename ValueT, typename ValueU = ValueT>
-void test_container_syclcompat_fmax_nan(){
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  using ValueTU = std::common_type_t<ValueT, ValueU>;
-  using ContT = ContainerT<ValueT, 2>;
-  using ContU = ContainerT<ValueU, 2>;
-  using ContTU = ContainerT<ValueTU, 2>;
-
-  const ContT op4 = {static_cast<ValueT>(5), static_cast<ValueT>(10)};
-  const ContU op5 = {static_cast<ValueU>(10), static_cast<ValueU>(5)};
-  const ContU op6 = {sycl::nan(static_cast<unsigned int>(0)), sycl::nan(static_cast<unsigned int>(0))};
-  const ContTU op6_res = {sycl::nan(static_cast<unsigned int>(0)), sycl::nan(static_cast<unsigned int>(0))};
-
-  const ContTU res2{static_cast<ValueTU>(10), static_cast<ValueTU>(10)};
-
-  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
-      .template launch_test<fmax_nan_kernel<ContT, ContU>>(op4, op5, res2);
-
-  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
-      .template launch_test<fmax_nan_kernel<ContT, ContU>>(op4, op6, op6_res);
-}
-
-template <typename ValueT, typename ValueU>
-inline void pow_kernel(ValueT *a, ValueU *b, ValueT *r) {
-  *r = syclcompat::pow(*a, *b);
-}
-
-template <typename ValueT, typename ValueU = ValueT>
-void test_syclcompat_pow() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  // FIXME: non-floating point values default to double, requires fp64. Change
-  // when problem is solver at the header.
-  if constexpr (!std::is_floating_point_v<ValueT>) {
-    if (!syclcompat::get_current_device().has(sycl::aspect::fp64)) {
-      std::cout << "  sycl::aspect::fp64 not supported by the SYCL device."
-                << std::endl;
-      return;
-    }
-  }
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  // 3 ** 3 = 27
-  const ValueT op1 = static_cast<ValueT>(3);
-  const ValueU op2 = static_cast<ValueU>(3);
-  const ValueT res = static_cast<ValueT>(27);
-
-  BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
-      .template launch_test<pow_kernel<ValueT, ValueU>>(op1, op2, res);
-}
-
-template <typename ValueT> inline void relu_kernel(ValueT *a, ValueT *r) {
-  *r = syclcompat::relu(*a);
-}
-
-template <typename ValueT> void test_syclcompat_relu() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  // relu(3) = 3, relu(-value) = 0
-  const ValueT op1 = static_cast<ValueT>(3);
-  const ValueT res1 = static_cast<ValueT>(3);
-  UnaryOpTestLauncher<ValueT>(grid, threads)
-      .template launch_test<relu_kernel<ValueT>>(op1, res1);
-
-  const ValueT op2 = std::is_signed_v<ValueT> ? static_cast<ValueT>(-3)
-                                              : static_cast<ValueT>(2);
-  const ValueT res2 = std::is_signed_v<ValueT> ? static_cast<ValueT>(0)
-                                               : static_cast<ValueT>(2);
-  UnaryOpTestLauncher<ValueT>(grid, threads)
-      .template launch_test<relu_kernel<ValueT>>(op2, res2);
-
-  using ValueU = sycl::vec<ValueT, 2>;
-  const ValueU op3{op1, op2};
-  const ValueU res3{res1, res2};
-  UnaryOpTestLauncher<ValueU>(grid, threads)
-      .template launch_test<relu_kernel<ValueU>>(op3, res3);
-
-  using ValueV = sycl::marray<ValueT, 2>;
-  const ValueV op4{op1, op2};
-  const ValueV res4{res1, res2};
-  UnaryOpTestLauncher<ValueV>(grid, threads)
-      .template launch_test<relu_kernel<ValueV>>(op4, res4);
-}
-
-template <typename ValueT> inline void cbrt_kernel(ValueT *a, ValueT *r) {
-  *r = syclcompat::cbrt(*a);
-}
-
-template <typename ValueT> void test_syclcompat_cbrt() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  const ValueT op1 = static_cast<ValueT>(1);
-  const ValueT res1 = static_cast<ValueT>(1);
-  UnaryOpTestLauncher<ValueT>(grid, threads)
-      .template launch_test<cbrt_kernel<ValueT>>(op1, res1);
-
-  const ValueT op2 = static_cast<ValueT>(64);
-  const ValueT res2 = static_cast<ValueT>(4);
-  UnaryOpTestLauncher<ValueT>(grid, threads)
-      .template launch_test<cbrt_kernel<ValueT>>(op2, res2);
-}
-
-template <typename T>
-void isnan_kernel(T *a, T *r) {
-  *r = syclcompat::isnan(*a);
-}
-
-template <template <typename, int> typename ContainerT, typename ValueT>
-void test_isnan() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  using ContT = ContainerT<ValueT, 2>;
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  ContT op1 = {sycl::nan(static_cast<unsigned int>(0)), 1.0f};
-  // bool2 does not exist,1.0 and 0.0 floats are used for true
-  // and false instead.
-  ContT expect = {1.0, 0.0};
-
-  UnaryOpTestLauncher<ContT>(grid, threads)
-      .template launch_test<isnan_kernel<ContT>>(op1, expect);
-}
-
-// Hardcoded limits to avoid a "TernaryOpTestLauncher"
-static constexpr int MIN_CLAMP = 5;
-static constexpr int MAX_CLAMP = 10;
-
-template <typename ValueT> void clamp_kernel(ValueT *a, ValueT *r) {
-  *r = syclcompat::clamp(*a, static_cast<ValueT>(MIN_CLAMP),
-                         static_cast<ValueT>(MAX_CLAMP));
-}
-
-template <typename ValueT> void test_clamp() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  ValueT op1 = static_cast<ValueT>(7);
-  ValueT expect1 = static_cast<ValueT>(7);
-
-  UnaryOpTestLauncher<ValueT>(grid, threads)
-      .template launch_test<clamp_kernel<ValueT>>(op1, expect1);
-
-  ValueT op2 = static_cast<ValueT>(MAX_CLAMP + 1);
-  ValueT expect2 = static_cast<ValueT>(MAX_CLAMP);
-  UnaryOpTestLauncher<ValueT>(grid, threads)
-      .template launch_test<clamp_kernel<ValueT>>(op2, expect2);
-
-  ValueT op3 = static_cast<ValueT>(MIN_CLAMP - 1);
-  ValueT expect3 = static_cast<ValueT>(MIN_CLAMP);
-  UnaryOpTestLauncher<ValueT>(grid, threads)
-      .template launch_test<clamp_kernel<ValueT>>(op3, expect3);
-}
-
-template <template <typename T, int Dim> typename ContainerT, typename ValueT> void test_container_clamp() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-  ValueT op1 = static_cast<ValueT>(7);
-  ValueT expect1 = static_cast<ValueT>(7);
-
-  ValueT op2 = static_cast<ValueT>(MAX_CLAMP + 1);
-  ValueT expect2 = static_cast<ValueT>(MAX_CLAMP);
-
-  using ContT = ContainerT<ValueT, 2>;
-  const ContT op4{op1, op2};
-  const ContT expect4{expect1, expect2};
-  UnaryOpTestLauncher<ContT>(grid, threads)
-      .template launch_test<clamp_kernel<ContT>>(op4, expect4);
-}
-
-int main() {
-  INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_max);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_min);
-
-  // Basic testing of deduction to avoid combinatorial explosion
-  test_syclcompat_max<double, float>();
-  test_syclcompat_max<long, int>();
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-  test_syclcompat_max<sycl::ext::oneapi::bfloat16, float>();
-#endif
-
-  test_syclcompat_min<double, float>();
-  test_syclcompat_min<long, int>();
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-  test_syclcompat_min<sycl::ext::oneapi::bfloat16, float>();
-#endif
-
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_fmin_nan);
-  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_container_syclcompat_fmin_nan);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_container_syclcompat_fmin_nan);
-  test_syclcompat_fmin_nan<double, float>();
-  test_container_syclcompat_fmin_nan<sycl::vec, float, double>();
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-  test_container_syclcompat_fmin_nan<sycl::vec, sycl::ext::oneapi::bfloat16, double>();
-#endif
-
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_fmax_nan);
-  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_container_syclcompat_fmax_nan);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_container_syclcompat_fmax_nan);
-  test_syclcompat_fmax_nan<double, float>();
-  test_container_syclcompat_fmax_nan<sycl::vec, float, double>();
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-  test_container_syclcompat_fmax_nan<sycl::vec, sycl::ext::oneapi::bfloat16, double>();
-#endif
-
-  INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_pow);
-  test_syclcompat_pow<float, int>();
-  test_syclcompat_pow<double, int>();
-
-  INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_relu);
-  INSTANTIATE_ALL_TYPES(fp_type_list_no_bfloat16, test_syclcompat_cbrt);
-
-  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_isnan);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_isnan);
-
-  INSTANTIATE_ALL_TYPES(value_type_list, test_clamp);
-  INSTANTIATE_ALL_CONTAINER_TYPES(vec_type_list, sycl::vec, test_container_clamp);
-  //INSTANTIATE_ALL_CONTAINER_TYPES(marray_type_list, sycl::marray, test_container_clamp);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_vectorized.cpp b/sycl/test-e2e/syclcompat/math/math_vectorized.cpp
deleted file mode 100644
index 630d4b9c9f154..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_vectorized.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_vectorized.cpp
- *
- *  Description:
- *    math helpers for vectorized operations and fp16 operations
- **************************************************************************/
-
-// REQUIRES: aspect-fp16
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <syclcompat/math.hpp>
-
-#include "../common.hpp"
-#include "math_fixt.hpp"
-
-template <typename BinaryOp, typename ValueT>
-void vectorized_binary_kernel(unsigned *a, unsigned *b, unsigned *r,
-                              bool need_relu) {
-  *r = syclcompat::vectorized_binary<ValueT>(*a, *b, BinaryOp(), need_relu);
-}
-
-template <typename BinaryOp, typename ValueT>
-void test_vectorized_binary(unsigned op1, unsigned op2, unsigned expected,
-                            bool need_relu = false) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  BinaryOpTestLauncher<unsigned, unsigned, unsigned>(grid, threads)
-      .template launch_test<vectorized_binary_kernel<BinaryOp, ValueT>>(
-          op1, op2, expected, need_relu);
-}
-
-template <typename BinaryOp, typename ValueT>
-void test_vectorized_binary_logical(unsigned op1, unsigned op2,
-                                    unsigned expected) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  BinaryOpTestLauncher<unsigned, unsigned, unsigned>(grid, threads)
-      .template launch_test<vectorized_binary_kernel<BinaryOp, ValueT>>(
-          op1, op2, expected, false);
-}
-
-template <typename UnaryOp, typename ValueT>
-void vectorized_unary_kernel(unsigned *a, unsigned *r) {
-  *r = syclcompat::vectorized_unary<ValueT>(*a, UnaryOp());
-}
-
-template <typename UnaryOp, typename ValueT>
-void test_vectorized_unary(unsigned op1, unsigned expected) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  UnaryOpTestLauncher<unsigned, unsigned>(grid, threads)
-      .template launch_test<vectorized_unary_kernel<UnaryOp, ValueT>>(op1,
-                                                                      expected);
-}
-
-template <typename ValueT>
-void vectorized_sum_abs_diff_kernel(unsigned *a, unsigned *b, unsigned *r) {
-  *r = syclcompat::vectorized_sum_abs_diff<ValueT>(*a, *b);
-}
-
-template <typename ValueT>
-void test_vectorized_sum_abs_diff(unsigned op1, unsigned op2,
-                                  unsigned expected) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  BinaryOpTestLauncher<unsigned, unsigned, unsigned>(grid, threads)
-      .template launch_test<vectorized_sum_abs_diff_kernel<ValueT>>(op1, op2,
-                                                                    expected);
-}
-
-template <typename BinaryOp1, typename BinaryOp2, typename ValueT>
-void vectorized_ternary_kernel(unsigned *a, unsigned *b, unsigned *c,
-                               unsigned *r, bool need_relu) {
-  *r = syclcompat::vectorized_ternary<ValueT>(*a, *b, *c, BinaryOp1(),
-                                              BinaryOp2(), need_relu);
-}
-
-template <typename BinaryOp1, typename BinaryOp2, typename ValueT>
-void test_vectorized_ternary(unsigned op1, unsigned op2, unsigned op3,
-                             unsigned expected, bool need_relu = false) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  TernaryOpTestLauncher<unsigned, unsigned, unsigned>(grid, threads)
-      .template launch_test<
-          vectorized_ternary_kernel<BinaryOp1, BinaryOp2, ValueT>>(
-          op1, op2, op3, expected, need_relu);
-}
-
-template <typename BinaryOp, typename ValueT>
-void vectorized_binary_with_pred_kernel(unsigned *a, unsigned *b, unsigned *r,
-                                        bool *pred_hi, bool *pred_lo) {
-  *r = syclcompat::vectorized_binary_with_pred<ValueT>(*a, *b, BinaryOp(),
-                                                       pred_hi, pred_lo);
-}
-
-template <typename BinaryOp, typename ValueT>
-void test_vectorized_binary_with_pred(unsigned op1, unsigned op2,
-                                      unsigned expected, bool pred_hi,
-                                      bool pred_lo) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{1};
-
-  BinaryOpTestLauncher<unsigned, unsigned, unsigned>(grid, threads)
-      .template launch_test<
-          vectorized_binary_with_pred_kernel<BinaryOp, ValueT>>(
-          op1, op2, expected, pred_hi, pred_lo);
-}
-
-int main() {
-  test_vectorized_binary<syclcompat::abs_diff, sycl::short2>(
-      0x00010002, 0x00040002, 0x00030000);
-  test_vectorized_binary<syclcompat::add_sat, sycl::short2>(
-      0x00020002, 0xFFFDFFFF, 0xFFFF0001);
-  test_vectorized_binary<syclcompat::rhadd, sycl::short2>(
-      0x00010008, 0x00020001, 0x00020005);
-  test_vectorized_binary<syclcompat::hadd, sycl::short2>(0x00010003, 0x00020005,
-                                                         0x00010004);
-  test_vectorized_binary<syclcompat::maximum, sycl::short2>(
-      0x0FFF0000, 0x00000FFF, 0x0FFF0FFF);
-  test_vectorized_binary<syclcompat::minimum, sycl::short2>(
-      0x0FFF0000, 0x00000FFF, 0x00000000);
-  test_vectorized_binary<syclcompat::sub_sat, sycl::short2>(
-      0xFFFB0005, 0x00030008, 0xFFF8FFFD);
-  test_vectorized_binary<syclcompat::abs_diff, sycl::short2>(
-      0x00010002, 0x00040002, 0x00030000, true);
-  test_vectorized_binary<syclcompat::add_sat, sycl::short2>(
-      0x00020002, 0xFFFDFFFF, 0x00000001, true);
-  test_vectorized_binary<syclcompat::rhadd, sycl::short2>(
-      0x00010008, 0x00020001, 0x00020005, true);
-  test_vectorized_binary<syclcompat::hadd, sycl::short2>(0x00010003, 0x00020005,
-                                                         0x00010004, true);
-  test_vectorized_binary<syclcompat::maximum, sycl::short2>(
-      0x0FFF0000, 0x00000FFF, 0x0FFF0FFF, true);
-  test_vectorized_binary<syclcompat::minimum, sycl::short2>(
-      0x0FFF0000, 0x00000FFF, 0x00000000, true);
-  test_vectorized_binary<syclcompat::sub_sat, sycl::short2>(
-      0xFFFB0005, 0x00030008, 0x00000000, true);
-  test_vectorized_unary<syclcompat::abs, sycl::short2>(0xFFFBFFFD, 0x00050003);
-  test_vectorized_sum_abs_diff<sycl::ushort2>(0x00010002, 0x00040002,
-                                              0x00000003);
-  test_vectorized_ternary<std::plus<>, syclcompat::maximum, sycl::ushort2>(
-      0x00010002, 0x00040002, 0x00080004, 0x00080004);
-  test_vectorized_ternary<std::plus<>, syclcompat::maximum, sycl::ushort2>(
-      0x00010002, 0x00040002, 0x00080004, 0x00080004, true);
-  test_vectorized_ternary<std::plus<>, syclcompat::minimum, sycl::ushort2>(
-      0x00010002, 0x00040002, 0x00080004, 0x00050004);
-  test_vectorized_ternary<std::plus<>, syclcompat::minimum, sycl::ushort2>(
-      0x00010002, 0x00040002, 0x00080004, 0x00050004, true);
-  test_vectorized_ternary<syclcompat::maximum, syclcompat::maximum,
-                          sycl::ushort2>(0x00010002, 0x00040002, 0x00080004,
-                                         0x00080004);
-  test_vectorized_ternary<syclcompat::maximum, syclcompat::maximum,
-                          sycl::ushort2>(0x00010002, 0x00040002, 0x00080004,
-                                         0x00080004, true);
-  test_vectorized_ternary<syclcompat::minimum, syclcompat::minimum,
-                          sycl::ushort2>(0x00010002, 0x00040002, 0x00080004,
-                                         0x00010002);
-  test_vectorized_ternary<syclcompat::minimum, syclcompat::minimum,
-                          sycl::ushort2>(0x00010002, 0x00040002, 0x00080004,
-                                         0x00010002, true);
-  test_vectorized_ternary<std::plus<>, syclcompat::maximum, sycl::short2>(
-      0x80010002, 0x00040002, 0x00080004, 0x00080004);
-  test_vectorized_ternary<std::plus<>, syclcompat::maximum, sycl::short2>(
-      0x80010002, 0x00040002, 0x00080004, 0x00080004, true);
-  test_vectorized_ternary<std::plus<>, syclcompat::minimum, sycl::short2>(
-      0x80010002, 0x00040002, 0x00080004, 0x80050004);
-  test_vectorized_ternary<std::plus<>, syclcompat::minimum, sycl::short2>(
-      0x80010002, 0x00040002, 0x00080004, 0x00000004, true);
-  test_vectorized_ternary<syclcompat::maximum, syclcompat::maximum,
-                          sycl::short2>(0x80010002, 0x00040002, 0x00080004,
-                                        0x00080004);
-  test_vectorized_ternary<syclcompat::maximum, syclcompat::maximum,
-                          sycl::short2>(0x80010002, 0x00040002, 0x00080004,
-                                        0x00080004, true);
-  test_vectorized_ternary<syclcompat::minimum, syclcompat::minimum,
-                          sycl::short2>(0x80010002, 0x00040002, 0x00080004,
-                                        0x80010002);
-  test_vectorized_ternary<syclcompat::minimum, syclcompat::minimum,
-                          sycl::short2>(0x80010002, 0x00040002, 0x00080004,
-                                        0x00000002, true);
-  test_vectorized_binary_with_pred<syclcompat::maximum, sycl::short2>(
-      0x80010002, 0x00040002, 0x00040002, false, true);
-  test_vectorized_binary_with_pred<syclcompat::minimum, sycl::short2>(
-      0x80010002, 0x00040002, 0x80010002, true, true);
-  test_vectorized_binary_with_pred<syclcompat::maximum, sycl::ushort2>(
-      0x80010002, 0x00040002, 0x80010002, true, true);
-  test_vectorized_binary_with_pred<syclcompat::minimum, sycl::ushort2>(
-      0x80010002, 0x00040002, 0x00040002, false, true);
-
-  // Logical Binary Operators v2
-  test_vectorized_binary_logical<std::equal_to<>, sycl::short2>(
-      0xFFF00002, 0xFFF00001, 0xFFFF0000);
-  test_vectorized_binary_logical<std::equal_to<>, sycl::short2>(
-      0x0001F00F, 0x0003F00F, 0x0000FFFF);
-
-  test_vectorized_binary_logical<std::not_equal_to<>, sycl::short2>(
-      0xFFF00002, 0xFFF00001, 0x0000FFFF);
-  test_vectorized_binary_logical<std::not_equal_to<>, sycl::short2>(
-      0x0001F00F, 0x0003F00F, 0xFFFF0000);
-
-  test_vectorized_binary_logical<std::greater_equal<>, sycl::short2>(
-      0xFFF00002, 0xFFF00001, 0xFFFFFFFF);
-  test_vectorized_binary_logical<std::greater_equal<>, sycl::short2>(
-      0x0001F00F, 0x0003F001, 0x0000FFFF);
-
-  test_vectorized_binary_logical<std::greater<>, sycl::short2>(
-      0xFFF00002, 0xFFF00001, 0x0000FFFF);
-  test_vectorized_binary_logical<std::greater<>, sycl::short2>(
-      0x0003F00F, 0x0001F00F, 0xFFFF0000);
-
-  test_vectorized_binary_logical<std::less_equal<>, sycl::short2>(
-      0xFFF00001, 0xF0F00002, 0x0000FFFF);
-  test_vectorized_binary_logical<std::less_equal<>, sycl::short2>(
-      0x0001FF0F, 0x0003F00F, 0xFFFF0000);
-
-  test_vectorized_binary_logical<std::less<>, sycl::short2>(
-      0xFFF00001, 0xFFF00002, 0x0000FFFF);
-  test_vectorized_binary_logical<std::less<>, sycl::short2>(
-      0x0001F00F, 0x0003F00F, 0xFFFF0000);
-
-  // Logical Binary Operators v4
-  test_vectorized_binary_logical<std::equal_to<>, sycl::uchar4>(
-      0x0001F00F, 0x0003F00F, 0xFF00FFFF);
-  test_vectorized_binary_logical<std::equal_to<>, sycl::uchar4>(
-      0x0102F0F0, 0x0202F0FF, 0x00FFFF00);
-
-  test_vectorized_binary_logical<std::not_equal_to<>, sycl::uchar4>(
-      0x0001F00F, 0xFF01F10F, 0xFF00FF00);
-  test_vectorized_binary_logical<std::not_equal_to<>, sycl::uchar4>(
-      0x0201F0F0, 0x0202F0FF, 0x00FF00FF);
-
-  test_vectorized_binary_logical<std::greater_equal<>, sycl::uchar4>(
-      0xFFF00002, 0xFFF10101, 0xFF0000FF);
-  test_vectorized_binary_logical<std::greater_equal<>, sycl::uchar4>(
-      0x0001F1F0, 0x0103F001, 0x0000FFFF);
-
-  test_vectorized_binary_logical<std::greater<>, sycl::uchar4>(
-      0xFFF00002, 0xF0F00001, 0xFF0000FF);
-  test_vectorized_binary_logical<std::greater<>, sycl::uchar4>(
-      0x0103F0F1, 0x0102F0F0, 0x00FF00FF);
-
-  test_vectorized_binary_logical<std::less_equal<>, sycl::uchar4>(
-      0xFFF10001, 0xFFF00100, 0xFF00FF00);
-  test_vectorized_binary_logical<std::less_equal<>, sycl::uchar4>(
-      0x0101F1F0, 0x0003F0F1, 0x00FF00FF);
-
-  test_vectorized_binary_logical<std::less<>, sycl::uchar4>(
-      0xFFF10001, 0xFFF20100, 0x00FFFF00);
-  test_vectorized_binary_logical<std::less<>, sycl::uchar4>(
-      0x0101F1F0, 0x0102F1F1, 0x00FF00FF);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_vectorized_isgreater_test.cpp b/sycl/test-e2e/syclcompat/math/math_vectorized_isgreater_test.cpp
deleted file mode 100644
index 9b12f6574d394..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_vectorized_isgreater_test.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_vectorized_isgreater_test.cpp
- *
- *  Description:
- *    vectorized_isgreater tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====---- UtilVectorizedIsgreaterTest.cpp----------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
-
-// RUN: %{build} %{mathflags} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-void test_kernel_vect_is_greater_1(unsigned int vect_count,
-                                   unsigned int *input_1, unsigned int *input_2,
-                                   unsigned int *output,
-                                   sycl::nd_item<3> item_ct1) {
-
-  int index = item_ct1.get_local_range().get(2) * item_ct1.get_group(2) +
-              item_ct1.get_local_id(2);
-
-  if (index < vect_count) {
-    output[index] =
-        syclcompat::vectorized_isgreater<sycl::ushort2, unsigned int>(
-            input_1[index], input_2[index]);
-  }
-}
-
-void test_vec_gt_1() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-
-  const unsigned int num_data = 7;
-  unsigned int mem_size = sizeof(unsigned int) * num_data;
-
-  unsigned int *h_out_data = (unsigned int *)malloc(mem_size);
-  unsigned int *h_data = (unsigned int *)malloc(mem_size);
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_out_data[i] = 0;
-
-  unsigned int *d_out_data;
-  d_out_data = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  unsigned int *d_in_data_1;
-  d_in_data_1 = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  unsigned int *d_in_data_2;
-  d_in_data_2 = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_data[i] = i;
-  q_ct1->memcpy(d_in_data_1, h_data, mem_size).wait();
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_data[i] = num_data - 1 - i;
-  q_ct1->memcpy(d_in_data_2, h_data, mem_size).wait();
-
-  q_ct1->submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 3) * sycl::range<3>(1, 1, 3),
-                          sycl::range<3>(1, 1, 3)),
-        [=](sycl::nd_item<3> item_ct1) {
-          test_kernel_vect_is_greater_1(num_data, d_in_data_1, d_in_data_2,
-                                        d_out_data, item_ct1);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-
-  q_ct1->memcpy(h_out_data, d_out_data, mem_size).wait();
-
-  unsigned int ref_data[num_data] = {0, 0, 0, 0, 1, 1, 1};
-  for (unsigned int i = 0; i < num_data; i++) {
-    if (h_out_data[i] != ref_data[i]) {
-      printf("vec_max_test_1 failed!\n");
-      exit(-1);
-    }
-  }
-
-  free(h_out_data);
-  free(h_data);
-  sycl::free(d_out_data, *q_ct1);
-  sycl::free(d_in_data_1, *q_ct1);
-  sycl::free(d_in_data_2, *q_ct1);
-}
-
-void test_kernel_vect_is_greater_2(unsigned int vect_count,
-                                   unsigned int *input_1, unsigned int *input_2,
-                                   unsigned int *output,
-                                   sycl::nd_item<3> item_ct1) {
-
-  int index = item_ct1.get_local_range().get(2) * item_ct1.get_group(2) +
-              item_ct1.get_local_id(2);
-
-  if (index < vect_count) {
-    output[index] = syclcompat::vectorized_isgreater<sycl::half2, unsigned int>(
-        input_1[index], input_2[index]);
-  }
-}
-
-void test_vec_gt_2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-
-  // This test uses vector of sycl::half, which requires support for fp16 on
-  // the corresponding device.
-  if (!dev_ct1.has(sycl::aspect::fp16)) {
-    std::cout
-        << "Test case skipped as the device does not support aspect::fp16."
-        << std::endl;
-    return;
-  }
-
-  const unsigned int num_data = 7;
-  unsigned int mem_size = sizeof(unsigned int) * num_data;
-
-  unsigned int *h_out_data = (unsigned int *)malloc(mem_size);
-  unsigned int *h_data = (unsigned int *)malloc(mem_size);
-  unsigned int a_array[num_data] = {6 + 65535, 1 + 65535, 2 + 65535, 3,
-                                    4,         5,         6};
-  unsigned int b_array[num_data] = {0 + 65535, 5 + 65535, 4 + 65535, 3,
-                                    2,         1,         0};
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_out_data[i] = 0;
-
-  unsigned int *d_out_data;
-  d_out_data = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  unsigned int *d_in_data_1;
-  d_in_data_1 = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  unsigned int *d_in_data_2;
-  d_in_data_2 = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_data[i] = a_array[i];
-
-  q_ct1->memcpy(d_in_data_1, h_data, mem_size).wait();
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_data[i] = b_array[i];
-
-  q_ct1->memcpy(d_in_data_2, h_data, mem_size).wait();
-
-  q_ct1->submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 3) * sycl::range<3>(1, 1, 3),
-                          sycl::range<3>(1, 1, 3)),
-        [=](sycl::nd_item<3> item_ct1) {
-          test_kernel_vect_is_greater_2(num_data, d_in_data_1, d_in_data_2,
-                                        d_out_data, item_ct1);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-
-  q_ct1->memcpy(h_out_data, d_out_data, mem_size).wait();
-
-  unsigned int ref_data[num_data] = {0xffff0000, 0x0,    0x0,   0x0,
-                                     0xffff,     0xffff, 0xffff};
-  for (unsigned int i = 0; i < num_data; i++) {
-    if (h_out_data[i] != ref_data[i]) {
-      printf("vec_max_test_2 failed!\n");
-      exit(-1);
-    }
-  }
-
-  free(h_out_data);
-  free(h_data);
-  sycl::free(d_out_data, *q_ct1);
-  sycl::free(d_in_data_1, *q_ct1);
-  sycl::free(d_in_data_2, *q_ct1);
-}
-
-int main() {
-  test_vec_gt_1();
-  test_vec_gt_2();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_vectorized_max_test.cpp b/sycl/test-e2e/syclcompat/math/math_vectorized_max_test.cpp
deleted file mode 100644
index dc63139d18dfd..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_vectorized_max_test.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_vectorized_max_test.cpp
- *
- *  Description:
- *    vectorized_max tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilVectorizedMaxTest.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-void test_kernel_vect_max(unsigned int vect_count, unsigned int *input_1,
-                          unsigned int *input_2, unsigned int *output,
-                          sycl::nd_item<3> item_ct1) {
-
-  int index = item_ct1.get_local_range().get(2) * item_ct1.get_group(2) +
-              item_ct1.get_local_id(2);
-
-  if (index < vect_count) {
-    output[index] =
-        syclcompat::vectorized_max<sycl::char4>(input_1[index], input_2[index]);
-  }
-}
-
-void test_vec_max() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-
-  const unsigned int num_data = 7;
-  unsigned int mem_size = sizeof(unsigned int) * num_data;
-
-  unsigned int *h_out_data = (unsigned int *)malloc(mem_size);
-  unsigned int *h_data = (unsigned int *)malloc(mem_size);
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_out_data[i] = 0;
-
-  unsigned int *d_out_data;
-  d_out_data = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  unsigned int *d_in_data_1;
-  d_in_data_1 = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  unsigned int *d_in_data_2;
-  d_in_data_2 = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_data[i] = i;
-  q_ct1->memcpy(d_in_data_1, h_data, mem_size).wait();
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_data[i] = num_data - 1 - i;
-  q_ct1->memcpy(d_in_data_2, h_data, mem_size).wait();
-
-  q_ct1->submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 3) * sycl::range<3>(1, 1, 3),
-                          sycl::range<3>(1, 1, 3)),
-        [=](sycl::nd_item<3> item_ct1) {
-          test_kernel_vect_max(num_data, d_in_data_1, d_in_data_2, d_out_data,
-                               item_ct1);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-
-  q_ct1->memcpy(h_out_data, d_out_data, mem_size).wait();
-
-  unsigned int ref_data[num_data] = {6, 5, 4, 3, 4, 5, 6};
-  for (unsigned int i = 0; i < num_data; i++) {
-    if (h_out_data[i] != ref_data[i])
-      exit(-1);
-  }
-
-  free(h_out_data);
-  free(h_data);
-  sycl::free(d_out_data, *q_ct1);
-  sycl::free(d_in_data_1, *q_ct1);
-  sycl::free(d_in_data_2, *q_ct1);
-}
-
-int main() {
-  test_vec_max();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/math/math_vectorized_min_test.cpp b/sycl/test-e2e/syclcompat/math/math_vectorized_min_test.cpp
deleted file mode 100644
index a96bf59c35244..0000000000000
--- a/sycl/test-e2e/syclcompat/math/math_vectorized_min_test.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  math_vectorized_min_test.cpp
- *
- *  Description:
- *    vectorized_min tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilVectorizedMinTest.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-void test_kernel_vect_min(unsigned int vect_count, unsigned int *input_1,
-                          unsigned int *input_2, unsigned int *output,
-                          sycl::nd_item<3> item_ct1) {
-
-  int index = item_ct1.get_local_range().get(2) * item_ct1.get_group(2) +
-              item_ct1.get_local_id(2);
-
-  if (index < vect_count) {
-    output[index] =
-        syclcompat::vectorized_min<sycl::char4>(input_1[index], input_2[index]);
-  }
-}
-
-void test_vec_min() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-
-  const unsigned int num_data = 7;
-  unsigned int mem_size = sizeof(unsigned int) * num_data;
-
-  unsigned int *h_out_data = (unsigned int *)malloc(mem_size);
-  unsigned int *h_data = (unsigned int *)malloc(mem_size);
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_out_data[i] = 0;
-
-  unsigned int *d_out_data;
-  d_out_data = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  unsigned int *d_in_data_1;
-  d_in_data_1 = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  unsigned int *d_in_data_2;
-  d_in_data_2 = (unsigned int *)sycl::malloc_device(mem_size, *q_ct1);
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_data[i] = i;
-  q_ct1->memcpy(d_in_data_1, h_data, mem_size).wait();
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_data[i] = num_data - 1 - i;
-  q_ct1->memcpy(d_in_data_2, h_data, mem_size).wait();
-
-  q_ct1->submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 3) * sycl::range<3>(1, 1, 3),
-                          sycl::range<3>(1, 1, 3)),
-        [=](sycl::nd_item<3> item_ct1) {
-          test_kernel_vect_min(num_data, d_in_data_1, d_in_data_2, d_out_data,
-                               item_ct1);
-        });
-  });
-  dev_ct1.queues_wait_and_throw();
-
-  q_ct1->memcpy(h_out_data, d_out_data, mem_size).wait();
-
-  unsigned int ref_data[num_data] = {0, 1, 2, 3, 2, 1, 0};
-  for (unsigned int i = 0; i < num_data; i++) {
-    if (h_out_data[i] != ref_data[i])
-      exit(-1);
-  }
-
-  free(h_out_data);
-  free(h_data);
-  sycl::free(d_out_data, *q_ct1);
-  sycl::free(d_in_data_1, *q_ct1);
-  sycl::free(d_in_data_2, *q_ct1);
-}
-
-int main() {
-  test_vec_min();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/global_memory_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/global_memory_usmnone.cpp
deleted file mode 100644
index 0be9c220efde6..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/global_memory_usmnone.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-// ====------ global_memory.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-#define SYCLCOMPAT_USM_LEVEL_NONE
-#include <sycl/detail/core.hpp>
-#include <syclcompat/syclcompat.hpp>
-
-class TestStruct {
-public:
-  void test() {}
-  template<class T> void testTemplate() {}
-};
-
-template<class T>
-class TemplateStuct {
-public:
-  void test() {}
-  template<class Ty> void testTemplate() {}
-};
-
-syclcompat::global_memory<volatile int, 0> d1_a(0);
-syclcompat::global_memory<int, 1> d2_a(36);
-syclcompat::global_memory<TemplateStuct<int>, 0> d3_a;
-syclcompat::global_memory<TestStruct, 0> d4_a;
-syclcompat::constant_memory<int, 1> c1_a(16);
-syclcompat::constant_memory<int, 0> c2_a;
-syclcompat::constant_memory<TemplateStuct<int>, 0> c3_a;
-syclcompat::constant_memory<TestStruct, 0> c4_a;
-
-syclcompat::constant_memory<int, 2> c_2d_a(sycl::range<2>(5, 3),
-{{0, 10, 20},
-{30, 40, 50},
-{60, 70, 80},
-{90, 100, 110},
-{120, 130, 140}});
-syclcompat::constant_memory<int, 2> c_2d_b(sycl::range<2>(3, 5),
-{{0, 10, 20, 30, 40},
-{50, 60, 70, 80, 90},
-{100, 110, 120, 130, 140}});
-syclcompat::constant_memory<int, 2> c_2d_c(sycl::range<2>(3, 5),
-                                     {0, 10, 20, 30, 40,
-                                      50, 60, 70, 80, 90,
-                                      100, 110, 120, 130, 140});
-syclcompat::constant_memory<int, 3> c_3d(sycl::range<3>(2, 2, 4),
-                                   {0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
-                                    110, 120, 130, 140});
-syclcompat::constant_memory<int, 1> c_1d(sycl::range<1>(15),
-                                   {0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
-                                    110, 120, 130, 140});
-
-bool verify_init(int *data) {
-  for(auto i = 0; i < 15; ++i) {
-    if (data[i] != i * 10)
-      return false;
-  }
-  return true;
-}
-
-bool verify() {
-  const int size = 15;
-  auto size_bytes = 15 * sizeof(int);
-
-  int h_result[15];
-  syclcompat::memcpy(h_result, c_2d_a.get_ptr(), size_bytes);
-  if (!verify_init(h_result))
-    return false;
-  syclcompat::memcpy(h_result, c_2d_b.get_ptr(), size_bytes);
-  if (!verify_init(h_result))
-    return false;
-  syclcompat::memcpy(h_result, c_2d_c.get_ptr(), size_bytes);
-  if (!verify_init(h_result))
-    return false;
-  syclcompat::memcpy(h_result, c_3d.get_ptr(), size_bytes);
-  if (!verify_init(h_result))
-    return false;
-  syclcompat::memcpy(h_result, c_1d.get_ptr(), size_bytes);
-  if (!verify_init(h_result))
-    return false;
-  return true;
-}
-
-void test4(TemplateStuct<int> *d3, TestStruct *d4) {
-  d3->test();
-  d3->testTemplate<int>();
-  d4->test();
-  d4->testTemplate<int>();
-}
-
-void test3(TemplateStuct<int> c3, TestStruct c4) {
-  c3.test();
-  c3.testTemplate<int>();
-  c4.test();
-  c4.testTemplate<int>();
-}
-
-void test2(volatile int &a) {
-  a = 3;
-}
-
-void test1(volatile int *acc_d1, int *acc_d2, int const *c1, int c2) {
-  unsigned d_a = 1;
-  *acc_d1 = 0;
-  *acc_d2 = d_a;
-  unsigned d_c = (unsigned)(*acc_d1);
-  unsigned *d_d = (unsigned *)acc_d2;
-  unsigned *d_e = (unsigned *)(acc_d2 + 5);
-  int *d_f = acc_d2 - 6;
-  test2(*acc_d1);
-}
-
-int main() {
-  d1_a.init();
-  d2_a.init();
-  c1_a.init();
-  c2_a.init();
-  syclcompat::get_default_queue().submit(
-    [&](sycl::handler &cgh) {
-      auto d1_acc = d1_a.get_access(cgh);
-      auto d2_acc = d2_a.get_access(cgh);
-      auto c1_acc = c1_a.get_access(cgh);
-      auto c2_acc = c2_a.get_access(cgh);
-      cgh.parallel_for<syclcompat_kernel_name<class kernel_test1>>(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item) {
-            test1(d1_acc.get_multi_ptr<sycl::access::decorated::no>().get(),
-                  d2_acc.get_multi_ptr<sycl::access::decorated::no>().get(),
-                  c1_acc.get_multi_ptr<sycl::access::decorated::no>().get(),
-                  c2_acc);
-          });
-    });
-  c3_a.init();
-  c4_a.init();
-  syclcompat::get_default_queue().submit(
-    [&](sycl::handler &cgh) {
-      auto c3_acc = c3_a.get_access(cgh);
-      auto c4_acc = c4_a.get_access(cgh);
-      cgh.parallel_for<syclcompat_kernel_name<class kernel_test2>>(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-        [=] (sycl::nd_item<3> item) {
-          test3(c3_acc, c4_acc);
-        });
-    });
-
-  sycl::queue *q = syclcompat::get_current_device().create_queue();
-  d3_a.init(*q);
-  d4_a.init(*q);
-  q->submit(
-    [&](sycl::handler &cgh) {
-      auto d3_acc = d3_a.get_access(cgh);
-      auto d4_acc = d4_a.get_access(cgh);
-      cgh.parallel_for<syclcompat_kernel_name<class kernel_test3>>(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-          [=](sycl::nd_item<3> item) {
-            test4(d3_acc.get_multi_ptr<sycl::access::decorated::no>().get(),
-                  d4_acc.get_multi_ptr<sycl::access::decorated::no>().get());
-          });
-    });
-
-
-  if (verify()) {
-    printf("Init Constant Memory Success!\n");
-    return 0;
-  } else {
-    printf("Init Constant Memory Fail!\n");
-    return 1;
-  }
-}
diff --git a/sycl/test-e2e/syclcompat/memory/local_memory.cpp b/sycl/test-e2e/syclcompat/memory/local_memory.cpp
deleted file mode 100644
index b24e7197a3e9c..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/local_memory.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  local_memory.cpp
- *
- *  Description:
- *    launch<F> tests with static local memory
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <numeric>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/id_query.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-#include "memory_fixt.hpp"
-
-// 1D test
-// Write id to linear block, then reverse order
-template <int BLOCK_SIZE> void local_mem_1d(int *d_A) {
-  int *as = syclcompat::local_mem<int[BLOCK_SIZE]>();
-  int id = syclcompat::local_id::x();
-  as[id] = id;
-  syclcompat::wg_barrier();
-  int val = as[BLOCK_SIZE - id - 1];
-  d_A[syclcompat::global_id::x()] = val;
-}
-
-void test_local_1d() {
-  auto checker = [](std::vector<int> input) {
-    std::vector<int> expected(input.size());
-    std::iota(expected.rbegin(), expected.rend(), 0);
-    assert(std::equal(expected.begin(), expected.end(), input.begin()));
-  };
-  LocalMemTest<local_mem_1d<32>>(1, 32).launch_test(checker);
-}
-
-// 2D test
-// Write id to 2D block, then reverse order
-template <int BLOCK_SIZE> void local_mem_2d(int *d_A) {
-  auto as = syclcompat::local_mem<int[BLOCK_SIZE][BLOCK_SIZE]>();
-  int id_x = syclcompat::local_id::x();
-  int id_y = syclcompat::local_id::y();
-  as[id_y][id_x] = id_x * BLOCK_SIZE + id_y;
-  syclcompat::wg_barrier();
-  int val = as[BLOCK_SIZE - id_y - 1][BLOCK_SIZE - id_x - 1];
-  d_A[syclcompat::global_id::y() * BLOCK_SIZE + syclcompat::global_id::x()] =
-      val;
-}
-
-void test_local_2d() {
-  constexpr int TILE_SIZE = 16;
-  auto checker = [](std::vector<int> input) {
-    for (int y = 0; y < TILE_SIZE; ++y) {
-      for (int x = 0; x < TILE_SIZE; ++x) {
-        int linear_id = y * TILE_SIZE + x;
-        int expected = ((TILE_SIZE - x - 1) * TILE_SIZE) + (TILE_SIZE - y - 1);
-        assert(input[linear_id] == expected);
-      }
-    }
-  };
-  LocalMemTest<local_mem_2d<TILE_SIZE>>({1, 1}, {TILE_SIZE, TILE_SIZE})
-      .launch_test(checker);
-}
-
-// 3D test
-// Write id to 3D block, then reverse order
-template <int BLOCK_SIZE> void local_mem_3d(int *d_A) {
-  auto as = syclcompat::local_mem<int[BLOCK_SIZE][BLOCK_SIZE][BLOCK_SIZE]>();
-  int id_x = syclcompat::local_id::x();
-  int id_y = syclcompat::local_id::y();
-  int id_z = syclcompat::local_id::z();
-  as[id_z][id_y][id_x] =
-      (id_x * (BLOCK_SIZE * BLOCK_SIZE)) + (id_y * BLOCK_SIZE) + id_z;
-  syclcompat::wg_barrier();
-  int val =
-      as[BLOCK_SIZE - id_z - 1][BLOCK_SIZE - id_y - 1][BLOCK_SIZE - id_x - 1];
-  d_A[syclcompat::global_id::z() * BLOCK_SIZE * BLOCK_SIZE +
-      syclcompat::global_id::y() * BLOCK_SIZE + syclcompat::global_id::x()] =
-      val;
-}
-
-void test_local_3d() {
-  constexpr int TILE_SIZE = 4;
-  auto checker = [](std::vector<int> input) {
-    for (int z = 0; z < TILE_SIZE; ++z) {
-      for (int y = 0; y < TILE_SIZE; ++y) {
-        for (int x = 0; x < TILE_SIZE; ++x) {
-          int linear_id = z * TILE_SIZE * TILE_SIZE + y * TILE_SIZE + x;
-          int expected = ((TILE_SIZE - x - 1) * TILE_SIZE * TILE_SIZE) +
-                         ((TILE_SIZE - y - 1) * TILE_SIZE) +
-                         (TILE_SIZE - z - 1);
-          assert(input[linear_id] == expected);
-        }
-      }
-    }
-  };
-  LocalMemTest<local_mem_3d<TILE_SIZE>>({1, 1, 1},
-                                        {TILE_SIZE, TILE_SIZE, TILE_SIZE})
-      .launch_test(checker);
-}
-
-int main() {
-  test_local_1d();
-  test_local_2d();
-  test_local_3d();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp b/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp
deleted file mode 100644
index 9d22804309a2c..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// REQUIRES: target-nvidia
-// RUN:  %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
-// RUN:  %{run} %t.out
-#include <sycl/detail/core.hpp>
-#include <sycl/group_barrier.hpp>
-#include <syclcompat/memory.hpp>
-
-using namespace sycl;
-#define NUM_ELEMENTS 64
-
-template <class T> void test(queue stream) {
-  half *res = malloc_shared<half>(NUM_ELEMENTS, stream);
-
-  for (int i = 0; i < NUM_ELEMENTS; ++i) {
-    res[i] = 0.5;
-  }
-
-  sycl::nd_range<1> global_range{sycl::range{32}, sycl::range{32}};
-
-  stream
-      .submit([&](handler &h) {
-        h.parallel_for<T>(global_range, [=](nd_item<1> item) {
-          sycl::group work_group = item.get_group();
-          int id = item.get_global_linear_id();
-          half *data = syclcompat::local_mem<half[NUM_ELEMENTS]>();
-
-          data[id * 2] = id;
-          data[id * 2 + 1] = id + 0.5;
-
-          T addr =
-              syclcompat::ptr_to_int<T>(reinterpret_cast<char *>(data) + (id % 8) * 16);
-
-          uint32_t fragment;
-#if defined(__NVPTX__)
-          asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n"
-                       : "=r"(fragment)
-                       : "r"(addr));
-#endif
-          sycl::group_barrier(work_group);
-
-          half *data_ptr = reinterpret_cast<half *>(&fragment);
-          res[id * 2] = data_ptr[0];
-          res[id * 2 + 1] = data_ptr[1];
-        });
-      })
-      .wait();
-
-  for (int i = 0; i < NUM_ELEMENTS; i++) {
-    assert(res[i] == static_cast<half>(i / 2.0));
-  }
-
-  free(res, stream);
-};
-
-int main() {
-
-  queue stream{property::queue::in_order{}};
-  test<size_t>(stream);
-  test<uint32_t>(stream);
-
-  std::cout << "PASS" << std::endl;
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memcpy_3d.cpp b/sycl/test-e2e/syclcompat/memory/memcpy_3d.cpp
deleted file mode 100644
index 10db2b27560af..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memcpy_3d.cpp
+++ /dev/null
@@ -1,754 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memcpy_3d.cpp
- *
- *  Description:
- *    3D memory copy tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ memcpy_3d.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <malloc.h>
-#include <stdio.h>
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "memory_common.hpp"
-
-void test_memcpy3D_memset() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 6;
-  size_t height = 8;
-  size_t depth = 10;
-  float *h_data;
-  float *h_ref;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  h_ref =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  for (int i = 0; i < width * height * depth; i++)
-    h_ref[i] = (float)i;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  check(h_data, h_ref, width * height * depth);
-  // memset device data.
-  syclcompat::memset(d_data, 0x1, extent);
-
-  // copy back to host
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * depth * sizeof(float));
-  check(h_data, h_ref, width * height * depth);
-
-  syclcompat::free(h_data);
-  syclcompat::free(h_ref);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-
-void test_memcpy3D_memset_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  size_t width = 6;
-  size_t height = 8;
-  size_t depth = 10;
-  float *h_data;
-  float *h_ref;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data = (float *)syclcompat::malloc_host(
-      sizeof(float) * width * height * depth, q);
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)syclcompat::malloc_host(
-      sizeof(float) * width * height * depth, q);
-  for (int i = 0; i < width * height * depth; i++)
-    h_ref[i] = (float)i;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent, q);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1, q);
-
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1, q);
-
-  check(h_data, h_ref, width * height * depth);
-  // memset device data.
-  syclcompat::memset(d_data, 0x1, extent, q);
-
-  // copy back to host
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1, q);
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * depth * sizeof(float));
-  check(h_data, h_ref, width * height * depth);
-
-  syclcompat::free(h_data, q);
-  syclcompat::free(h_ref, q);
-  syclcompat::free(d_data.get_data_ptr(), q);
-}
-
-void test_memcpy3D_offset() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    5.000000        6.000000
-    9.000000        10.000000
-
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-  */
-  float Ref[12] = {5, 6, 9, 10, 21, 22, 25, 26, 37, 38, 41, 42};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 0}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-
-void test_memcpy3D_offset_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data = (float *)syclcompat::malloc_host(
-      sizeof(float) * width * height * depth, q);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    5.000000        6.000000
-    9.000000        10.000000
-
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-  */
-  float Ref[12] = {5, 6, 9, 10, 21, 22, 25, 26, 37, 38, 41, 42};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent, q);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1, q);
-
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 0}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1, q);
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data, q);
-  syclcompat::free(d_data.get_data_ptr(), q);
-}
-
-void test_memcpy3D_offsetZ() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-
-    53.000000       54.000000
-    57.000000       58.000000
-  */
-  float Ref[12] = {21, 22, 25, 26, 37, 38, 41, 42, 53, 54, 57, 58};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 1}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-
-void test_memcpy3D_offsetZ_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data = (float *)syclcompat::malloc_host(
-      sizeof(float) * width * height * depth, q);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-
-    53.000000       54.000000
-    57.000000       58.000000
-  */
-  float Ref[12] = {21, 22, 25, 26, 37, 38, 41, 42, 53, 54, 57, 58};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent, q);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1, q);
-
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 1}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1, q);
-
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data, q);
-  syclcompat::free(d_data.get_data_ptr(), q);
-}
-
-// short path1
-// test copy 3D data special case
-// for continuous plane, we can copy it as linear data
-void test_memcpy3D_plane() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-  */
-  float Ref[32] = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
-                   11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
-                   22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
-
-  size_t out_width = 4;
-  size_t out_height = 4;
-  size_t out_depth = 2;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  cpyParm_from_pos_ct1 = {0, 0, 0}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-
-// short path2
-// test copy 3D data special case
-// for continuous row, we can copy it as linear data
-void test_memcpy3D_row() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-  */
-  float Ref[16] = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
-
-  size_t out_width = 4;
-  size_t out_height = 2;
-  size_t out_depth = 2;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  cpyParm_from_pos_ct1 = {0, 0, 0}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-  syclcompat::memcpy(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                     cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                     cpyParm_size_ct1);
-
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-
-int main() {
-  test_memcpy3D_memset();
-  test_memcpy3D_memset_q();
-  test_memcpy3D_offset();
-  test_memcpy3D_offset_q();
-  test_memcpy3D_offsetZ();
-  test_memcpy3D_offsetZ_q();
-  test_memcpy3D_plane();
-  test_memcpy3D_row();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memcpy_3d2.cpp b/sycl/test-e2e/syclcompat/memory/memcpy_3d2.cpp
deleted file mode 100644
index 5b0691c35059b..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memcpy_3d2.cpp
+++ /dev/null
@@ -1,586 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memcpy_3d2.cpp
- *
- *  Description:
- *    3D memory copy tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ memcpy_3d2.cpp ------------------------------ -*- C++ -* ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// ===---------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <malloc.h>
-#include <stdio.h>
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "memory_common.hpp"
-
-void test_memcpy3D_async_pitchedAPI() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  size_t width = 6;
-  size_t height = 8;
-  size_t depth = 10;
-  float *h_data;
-  float *h_ref;
-  syclcompat::byte_t a = 'a';
-  assert(sizeof(syclcompat::byte_t) == 1);
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  h_ref =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  for (int i = 0; i < width * height * depth; i++)
-    h_ref[i] = (float)i;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-
-  if (cpyParm_from_data_ct1.get_data_ptr() != h_data ||
-      cpyParm_from_data_ct1.get_pitch() != sizeof(float) * width ||
-      cpyParm_from_data_ct1.get_x() != width ||
-      cpyParm_from_data_ct1.get_y() != height) {
-    assert(false);
-  }
-  cpyParm_from_data_ct1.set_data_ptr((void *)h_data);
-  cpyParm_from_data_ct1.set_pitch(sizeof(float) * width);
-  cpyParm_from_data_ct1.set_x(width);
-  cpyParm_from_data_ct1.set_y(height);
-
-  if (cpyParm_from_data_ct1.get_data_ptr() != h_data ||
-      cpyParm_from_data_ct1.get_pitch() != sizeof(float) * width ||
-      cpyParm_from_data_ct1.get_x() != width ||
-      cpyParm_from_data_ct1.get_y() != height) {
-    assert(false);
-  }
-
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1);
-  syclcompat::get_default_queue().wait_and_throw();
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1);
-  syclcompat::get_default_queue().wait_and_throw();
-  check(h_data, h_ref, width * height * depth);
-  // memset device data.
-  syclcompat::memset_async(d_data, 0x1, extent);
-  syclcompat::get_default_queue().wait_and_throw();
-  // copy back to host
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1);
-  syclcompat::get_default_queue().wait_and_throw();
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * depth * sizeof(float));
-  check(h_data, h_ref, width * height * depth);
-
-  syclcompat::free(h_data);
-  syclcompat::free(h_ref);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-void test_memcpy3D_async_pitchedAPI_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  size_t width = 6;
-  size_t height = 8;
-  size_t depth = 10;
-  float *h_data;
-  float *h_ref;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data = (float *)syclcompat::malloc_host(
-      sizeof(float) * width * height * depth, q);
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)syclcompat::malloc_host(
-      sizeof(float) * width * height * depth, q);
-  for (int i = 0; i < width * height * depth; i++)
-    h_ref[i] = (float)i;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent, q);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1, q);
-  q.wait_and_throw();
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1, q);
-
-  q.wait_and_throw();
-
-  check(h_data, h_ref, width * height * depth);
-  // memset device data.
-  syclcompat::memset_async(d_data, 0x1, extent, q);
-  q.wait_and_throw();
-  // copy back to host
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1, q);
-
-  q.wait_and_throw();
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * depth * sizeof(float));
-  check(h_data, h_ref, width * height * depth);
-
-  syclcompat::free(h_data, q);
-  syclcompat::free(h_ref, q);
-  syclcompat::free(d_data.get_data_ptr(), q);
-}
-
-void test_memcpy3D_async_offset() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    5.000000        6.000000
-    9.000000        10.000000
-
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-  */
-  float Ref[12] = {5, 6, 9, 10, 21, 22, 25, 26, 37, 38, 41, 42};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1);
-  syclcompat::get_default_queue().wait_and_throw();
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 0}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1);
-  syclcompat::get_default_queue().wait_and_throw();
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-
-void test_memcpy3D_async_offset_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data = (float *)syclcompat::malloc_host(
-      sizeof(float) * width * height * depth, q);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    5.000000        6.000000
-    9.000000        10.000000
-
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-  */
-  float Ref[12] = {5, 6, 9, 10, 21, 22, 25, 26, 37, 38, 41, 42};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent, q);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1, q);
-  q.wait_and_throw();
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 0}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1, q);
-  q.wait_and_throw();
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data, q);
-  syclcompat::free(d_data.get_data_ptr(), q);
-}
-
-void test_memcpy3D_async_offsetZ() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-
-    53.000000       54.000000
-    57.000000       58.000000
-  */
-  float Ref[12] = {21, 22, 25, 26, 37, 38, 41, 42, 53, 54, 57, 58};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1);
-  syclcompat::get_default_queue().wait_and_throw();
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 1}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1);
-  syclcompat::get_default_queue().wait_and_throw();
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-
-void test_memcpy3D_async_offsetZ_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data = (float *)syclcompat::malloc_host(
-      sizeof(float) * width * height * depth, q);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-
-    53.000000       54.000000
-    57.000000       58.000000
-  */
-  float Ref[12] = {21, 22, 25, 26, 37, 38, 41, 42, 53, 54, 57, 58};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent, q);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1, q);
-  q.wait_and_throw();
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 1}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-  syclcompat::memcpy_async(cpyParm_to_data_ct1, cpyParm_to_pos_ct1,
-                           cpyParm_from_data_ct1, cpyParm_from_pos_ct1,
-                           cpyParm_size_ct1, q);
-  q.wait_and_throw();
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data, q);
-  syclcompat::free(d_data.get_data_ptr(), q);
-}
-
-int main() {
-  test_memcpy3D_async_pitchedAPI();
-  test_memcpy3D_async_pitchedAPI_q();
-  test_memcpy3D_async_offset();
-  test_memcpy3D_async_offset_q();
-  test_memcpy3D_async_offsetZ();
-  test_memcpy3D_async_offsetZ_q();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_async.cpp b/sycl/test-e2e/syclcompat/memory/memory_async.cpp
deleted file mode 100644
index fecd54fe921a0..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_async.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_async.cpp
- *
- *  Description:
- *    Asynchronous memory operations event dependency tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ memory_async.cpp------------------- -*- C++ -* ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-// RUN: %{build} -DSYCLCOMPAT_USM_LEVEL_NONE -o %t.out
-// RUN: %{run} %t.out
-// Tests for the sycl::events returned from syclcompat::*Async API calls
-
-#include <stdio.h>
-#include <sycl/detail/core.hpp>
-#include <syclcompat/memory.hpp>
-#include "memory_fixt.hpp"
-
-// enqueue_free is just a host task, so we are really testing the event
-// dependency here
-void test_free_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  AsyncTest atest;
-
-  float *d_D = (float *)syclcompat::malloc(sizeof(float));
-  sycl::event kernel_ev = atest.launch_kernel();
-  sycl::event free_ev = syclcompat::enqueue_free({d_D}, {kernel_ev});
-
-  atest.check_events(kernel_ev, free_ev);
-}
-
-// The following tests are simply testing (as best possible) that
-// the sycl::event returned from *Async really corresponds to the task
-// We don't check that the memory operation does what it's supposed to,
-// this is tested elsewhere.
-void test_memcpy_async1() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  AsyncTest atest;
-
-  sycl::event memcpy_ev = syclcompat::memcpy_async(atest.d_A_, atest.d_C_,
-                                                   sizeof(float) * atest.size_);
-  sycl::event host_ev = atest.launch_host_task({memcpy_ev});
-
-  atest.check_events(memcpy_ev, host_ev);
-}
-
-void test_memcpy_async2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  AsyncTest atest;
-
-  sycl::event memcpy_ev =
-      syclcompat::memcpy_async(atest.d_A_, 32, atest.d_C_, 32, 32, 4);
-  sycl::event host_ev = atest.launch_host_task({memcpy_ev});
-
-  atest.check_events(memcpy_ev, host_ev);
-}
-
-void test_memcpy_async3() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  AsyncTest atest;
-
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 4;
-  assert(width * height * depth <= atest.size_);
-
-  syclcompat::pitched_data d_A_pitched{atest.d_A_, sizeof(float) * width, width,
-                                       height};
-  syclcompat::pitched_data d_B_pitched{atest.d_B_, sizeof(float) * width, width,
-                                       height};
-  sycl::id<3> pos_A(0, 0, 0);
-  sycl::id<3> pos_B(0, 0, 0);
-  sycl::event memcpy_ev = syclcompat::memcpy_async(
-      d_A_pitched, pos_A, d_B_pitched, pos_B, {2, 2, 2});
-  sycl::event host_ev = atest.launch_host_task({memcpy_ev});
-
-  atest.check_events(memcpy_ev, host_ev);
-}
-
-void test_memset_async1() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  AsyncTest atest;
-
-  sycl::event memset_ev =
-      syclcompat::memset_async(atest.d_C_, 1, sizeof(int) * atest.size_);
-  sycl::event host_ev = atest.launch_host_task({memset_ev});
-
-  atest.check_events(memset_ev, host_ev);
-}
-
-void test_memset_async2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  AsyncTest atest;
-
-  sycl::event memset_ev =
-      syclcompat::memset_async(atest.d_C_, 32, 1, sizeof(int) * 32, 4);
-  sycl::event host_ev = atest.launch_host_task({memset_ev});
-
-  atest.check_events(memset_ev, host_ev);
-}
-
-void test_memset_async3() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  AsyncTest atest;
-
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 4;
-  assert(width * height * depth <= atest.size_);
-
-  syclcompat::pitched_data d_A_pitched{atest.d_A_, sizeof(int) * width, width,
-                                       height};
-  sycl::event memset_ev =
-      syclcompat::memset_async(d_A_pitched, 1, {sizeof(int) * 2, 2, 2});
-  sycl::event host_ev = atest.launch_host_task({memset_ev});
-
-  atest.check_events(memset_ev, host_ev);
-}
-
-void test_fill_event() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  AsyncTest atest;
-
-  sycl::event fill_ev = syclcompat::fill_async(atest.d_A_, 1.0f, atest.size_);
-  sycl::event host_ev = atest.launch_host_task({fill_ev});
-
-  atest.check_events(fill_ev, host_ev);
-}
-
-void test_combine_events() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  AsyncTest atest;
-
-  std::vector<sycl::event> evs;
-  for (int i = 0; i < 5; i++)
-    evs.push_back(atest.launch_kernel());
-
-  sycl::event combined = syclcompat::detail::combine_events(evs, atest.q_);
-
-  using namespace sycl::info;
-
-  // Lambda returns true if all events 'complete'
-  auto all_done = [&](std::vector<sycl::event> evs) {
-    return std::all_of(evs.begin(), evs.end(), [](sycl::event ev) {
-      return ev.get_info<event::command_execution_status>() ==
-             event_command_status::complete;
-    });
-  };
-
-  event_command_status combined_status =
-      combined.get_info<event::command_execution_status>();
-  bool prerequisites_done = all_done(evs);
-
-  // Check combined event remains 'submitted' if not all prerequisites completed
-  if (!prerequisites_done)
-    assert(combined_status == event_command_status::submitted);
-
-  // Check all prerequisites completed once combined is completed
-  while (combined_status != event_command_status::running &&
-         combined_status != event_command_status::complete) {
-    combined_status = combined.get_info<event::command_execution_status>();
-  }
-  assert(all_done(evs));
-}
-
-int main() {
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-  std::cout << "Running SYCLCOMPAT_USM_LEVEL_NONE tests" << std::endl;
-#else
-  std::cout << "Running USM tests" << std::endl;
-#endif
-  test_free_async();
-
-  test_memcpy_async1();
-  test_memcpy_async2();
-  test_memcpy_async3();
-
-  test_memset_async1();
-  test_memset_async2();
-  test_memset_async3();
-
-  test_fill_event();
-  test_combine_events();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_common.hpp b/sycl/test-e2e/syclcompat/memory/memory_common.hpp
deleted file mode 100644
index a37ce29ba76f1..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_common.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_common.hpp
- *
- *  Description:
- *    Memory content helper for the Memory functionality tests
- **************************************************************************/
-
-#pragma once
-
-#include <cassert>
-#include <cmath>
-#include <tuple>
-
-#include <sycl/detail/core.hpp>
-
-inline void check(float *h_data, float *h_ref, size_t size) {
-  for (size_t i = 0; i < size; i++) {
-    float diff = fabs(h_data[i] - h_ref[i]);
-    assert(diff <= 1.e-6);
-  }
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp b/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp
deleted file mode 100644
index b410f2bf77b05..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_fixt.hpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_fixt.hpp
- *
- *  Description:
- *    Memory content fixtures for the Memory functionality tests
- **************************************************************************/
-
-#pragma once
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/dims.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-constexpr size_t WG_SIZE = 256;
-constexpr size_t NUM_WG = 32;
-
-// Fixture to set up & launch a kernel to depend on, or
-// a host_task which depends on something else
-class AsyncTest {
-public:
-  AsyncTest()
-      : q_{syclcompat::get_default_queue()}, grid_{NUM_WG}, thread_{WG_SIZE},
-        size_{WG_SIZE * NUM_WG} {
-    d_A_ = syclcompat::malloc<float>(size_, q_);
-    d_B_ = syclcompat::malloc<float>(size_, q_);
-    d_C_ = syclcompat::malloc<float>(size_, q_);
-  }
-
-  ~AsyncTest() {
-    syclcompat::free(d_A_, q_);
-    syclcompat::free(d_B_, q_);
-    syclcompat::free(d_C_, q_);
-  }
-  sycl::event launch_kernel() {
-    auto &dd_A = d_A_;
-    auto &dd_B = d_B_;
-    auto &dd_C = d_C_;
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-    syclcompat::buffer_t buffer_A = syclcompat::get_buffer(d_A_);
-    syclcompat::buffer_t buffer_B = syclcompat::get_buffer(d_B_);
-    syclcompat::buffer_t buffer_C = syclcompat::get_buffer(d_C_);
-#endif
-    return q_.submit([&](sycl::handler &cgh) {
-#ifdef SYCLCOMPAT_USM_LEVEL_NONE
-      auto A = buffer_A.get_access<sycl::access::mode::read_write>(cgh);
-      auto B = buffer_B.get_access<sycl::access::mode::read_write>(cgh);
-      auto C = buffer_C.get_access<sycl::access::mode::read_write>(cgh);
-#else
-      auto A = dd_A;
-      auto B = dd_B;
-      auto C = dd_C;
-#endif
-      cgh.parallel_for(size_, [=](sycl::id<1> id) {
-        A[id] = static_cast<float>(id) + 1.0f;
-        B[id] = static_cast<float>(id) + 1.0f;
-        C[id] = A[id] + B[id];
-      });
-    });
-  }
-
-  sycl::event launch_host_task(std::vector<sycl::event> dep_events) {
-    return q_.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      cgh.host_task([]() {});
-    });
-  }
-
-  // Check that a dependent event (e2) doesn't start until after the dependee
-  // (e1)
-  void check_events(sycl::event e1, sycl::event e2) {
-    using namespace sycl::info;
-
-    event_command_status e2_status =
-        e2.get_info<event::command_execution_status>();
-    event_command_status e1_status =
-        e1.get_info<event::command_execution_status>();
-
-    // Check event 2 hasn't started iff event 1 hasn't finished
-    if (e1_status != event_command_status::complete) {
-      assert(e2_status == event_command_status::submitted);
-    }
-
-    // Once event 2 is finished, check event 1 has finished
-    while (e2_status != event_command_status::complete) {
-      e2_status = e2.get_info<event::command_execution_status>();
-    }
-    assert(e1.get_info<event::command_execution_status>() ==
-           event_command_status::complete);
-  }
-
-  sycl::queue q_;
-  syclcompat::dim3 const grid_;
-  syclcompat::dim3 const thread_;
-  float *d_A_;
-  float *d_B_;
-  float *d_C_;
-  size_t size_;
-};
-
-template <typename T> bool should_skip(const sycl::device &dev) {
-  bool skip = false;
-  if (!dev.has(sycl::aspect::fp64) && std::is_same_v<T, double>) {
-    std::cout << "  sycl::aspect::fp64 not supported by the SYCL device."
-              << std::endl;
-    skip = true;
-  }
-  if (!dev.has(sycl::aspect::fp16) && std::is_same_v<T, sycl::half>) {
-    std::cout << "  sycl::aspect::fp16 not supported by the SYCL device."
-              << std::endl;
-    skip = true;
-  }
-  return skip;
-}
-
-// USM Tests Helpers
-// Fixture to set up & launch testing kernel
-template <typename T> struct USMTest {
-  USMTest()
-      : q_{syclcompat::get_default_queue()}, grid_{NUM_WG}, thread_{WG_SIZE},
-        size_{WG_SIZE * NUM_WG},
-        skip{should_skip<T>(syclcompat::get_current_device())} {}
-
-  void launch_kernel() {
-    auto &dd_A = data;
-    return q_
-        .submit([&](sycl::handler &cgh) {
-          cgh.parallel_for(
-              size_, [=](sycl::id<1> id) { dd_A[id] = static_cast<int>(id); });
-        })
-        .wait();
-  }
-
-  // Check result is identity vector
-  // Handles memcpy for USM device alloc
-  void check_result() {
-    sycl::usm::alloc ptr_type = sycl::get_pointer_type(data, q_.get_context());
-    assert(ptr_type != sycl::usm::alloc::unknown);
-
-    T *result;
-    if (ptr_type == sycl::usm::alloc::device) {
-      result = static_cast<T *>(std::malloc(sizeof(T) * size_));
-      syclcompat::memcpy(result, data, sizeof(T) * size_);
-    } else {
-      result = data;
-    }
-
-    for (size_t i = 0; i < size_; i++) {
-      assert(result[i] == static_cast<T>(i));
-    }
-
-    if (ptr_type == sycl::usm::alloc::device)
-      std::free(result);
-  }
-
-  sycl::queue q_;
-  syclcompat::dim3 const grid_;
-  syclcompat::dim3 const thread_;
-  T *data;
-  size_t size_;
-  bool skip;
-};
-
-template <auto F> class LocalMemTest {
-public:
-  LocalMemTest(syclcompat::dim3 grid, syclcompat::dim3 threads)
-      : grid_{grid}, threads_{threads}, size_{grid_.size() * threads_.size()},
-        host_data_(size_) {
-    data_ = (int *)syclcompat::malloc(size_ * sizeof(int));
-  };
-  ~LocalMemTest() { syclcompat::free(data_); };
-
-  template <typename Lambda, typename... Args>
-  void launch_test(Lambda checker, Args... args) {
-    syclcompat::launch<F>(grid_, threads_, data_, args...);
-    syclcompat::memcpy(host_data_.data(), data_, size_ * sizeof(int));
-    checker(host_data_);
-  }
-
-private:
-  syclcompat::dim3 grid_;
-  syclcompat::dim3 threads_;
-  size_t size_;
-  sycl::queue q_;
-  int *data_;
-  std::vector<int> host_data_;
-  using CheckLambda = std::function<void(std::vector<int>)>;
-};
diff --git a/sycl/test-e2e/syclcompat/memory/memory_image.cpp b/sycl/test-e2e/syclcompat/memory/memory_image.cpp
deleted file mode 100644
index 819a7d97d4ff6..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_image.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_image.cpp
- *
- *  Description:
- *    3D memory copy tests for new image/memcpy_parameter API
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// FIXME(@intel/syclcompat-lib-reviewers): These are some limited tests for the
-// new syclcompat::experimental::memcpy API. These aren't officially supported
-// at present, but we can test the pitched_data variants easily. Once this
-// moves out of experimental, let's test these APIs thoroughly
-
-#include <malloc.h>
-#include <stdio.h>
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "memory_common.hpp"
-
-void test_memcpy3D_parameter_offset() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    5.000000        6.000000
-    9.000000        10.000000
-
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-  */
-  float Ref[12] = {5, 6, 9, 10, 21, 22, 25, 26, 37, 38, 41, 42};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-
-  {
-    syclcompat::experimental::memcpy_parameter params{};
-    params.to.pitched = cpyParm_to_data_ct1;
-    params.to.pos = cpyParm_to_pos_ct1;
-    params.from.pitched = cpyParm_from_data_ct1;
-    params.from.pos = cpyParm_from_pos_ct1;
-    params.size = cpyParm_size_ct1;
-    syclcompat::experimental::memcpy(params);
-  }
-
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 0}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-
-  {
-    syclcompat::experimental::memcpy_parameter params{};
-    params.to.pitched = cpyParm_to_data_ct1;
-    params.to.pos = cpyParm_to_pos_ct1;
-    params.from.pitched = cpyParm_from_data_ct1;
-    params.from.pos = cpyParm_from_pos_ct1;
-    params.size = cpyParm_size_ct1;
-    syclcompat::experimental::memcpy(params);
-  }
-
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-
-void test_memcpy3D_async_parameter_offset() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  size_t width = 4;
-  size_t height = 4;
-  size_t depth = 5;
-  float *h_data;
-
-  syclcompat::pitched_data d_data;
-  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
-  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
-  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
-  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
-
-  h_data =
-      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
-  /*
-    0.000000        1.000000        2.000000        3.000000
-    4.000000        5.000000        6.000000        7.000000
-    8.000000        9.000000        10.000000       11.000000
-    12.000000       13.000000       14.000000       15.000000
-
-    16.000000       17.000000       18.000000       19.000000
-    20.000000       21.000000       22.000000       23.000000
-    24.000000       25.000000       26.000000       27.000000
-    28.000000       29.000000       30.000000       31.000000
-
-    32.000000       33.000000       34.000000       35.000000
-    36.000000       37.000000       38.000000       39.000000
-    40.000000       41.000000       42.000000       43.000000
-    44.000000       45.000000       46.000000       47.000000
-
-    48.000000       49.000000       50.000000       51.000000
-    52.000000       53.000000       54.000000       55.000000
-    56.000000       57.000000       58.000000       59.000000
-    60.000000       61.000000       62.000000       63.000000
-
-    64.000000       65.000000       66.000000       67.000000
-    68.000000       69.000000       70.000000       71.000000
-    72.000000       73.000000       74.000000       75.000000
-    76.000000       77.000000       78.000000       79.000000
-  */
-  for (int i = 0; i < width * height * depth; i++)
-    h_data[i] = (float)i;
-
-  /*
-    5.000000        6.000000
-    9.000000        10.000000
-
-    21.000000       22.000000
-    25.000000       26.000000
-
-    37.000000       38.000000
-    41.000000       42.000000
-  */
-  float Ref[12] = {5, 6, 9, 10, 21, 22, 25, 26, 37, 38, 41, 42};
-
-  size_t out_width = 2;
-  size_t out_height = 2;
-  size_t out_depth = 3;
-
-  // alloc memory.
-  extent = sycl::range<3>(sizeof(float) * width, height, depth);
-  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
-
-  // copy to Device.
-  cpyParm_from_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * width, width, height);
-  cpyParm_to_data_ct1 = d_data;
-  cpyParm_size_ct1 = extent;
-
-  {
-    syclcompat::experimental::memcpy_parameter params{};
-    params.to.pitched = cpyParm_to_data_ct1;
-    params.to.pos = cpyParm_to_pos_ct1;
-    params.from.pitched = cpyParm_from_data_ct1;
-    params.from.pos = cpyParm_from_pos_ct1;
-    params.size = cpyParm_size_ct1;
-    syclcompat::experimental::memcpy_async(params);
-  }
-  syclcompat::get_default_queue().wait_and_throw();
-  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 0}; // set offset on x/y/z.
-  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
-
-  for (int i = 0; i < out_width * out_height * out_depth; i++)
-    h_data[i] = -1;
-  // copy back to host.
-  cpyParm_from_data_ct1 = d_data;
-  cpyParm_to_data_ct1 = syclcompat::pitched_data(
-      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
-  {
-    syclcompat::experimental::memcpy_parameter params{};
-    params.to.pitched = cpyParm_to_data_ct1;
-    params.to.pos = cpyParm_to_pos_ct1;
-    params.from.pitched = cpyParm_from_data_ct1;
-    params.from.pos = cpyParm_from_pos_ct1;
-    params.size = cpyParm_size_ct1;
-    syclcompat::experimental::memcpy_async(params);
-  }
-  syclcompat::get_default_queue().wait_and_throw();
-  // Copy back to host data.
-  check(h_data, Ref, out_width * out_height * out_depth);
-  syclcompat::free(h_data);
-  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
-}
-
-int main() {
-  // Copied and modified from memcpy_3d.cpp test_memcpy3D_offset()
-  test_memcpy3D_parameter_offset();
-  test_memcpy3D_async_parameter_offset();
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_image_xfails.cpp b/sycl/test-e2e/syclcompat/memory/memory_image_xfails.cpp
deleted file mode 100644
index 330a8607cadb3..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_image_xfails.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_async.cpp
- *
- *  Description:
- *    Asynchronous memory operations event dependency tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ memory_async.cpp------------------- -*- C++ -* ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===---------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// Tests for the sycl::events returned from syclcompat::*Async API calls
-
-#include "sycl/exception.hpp"
-#include <stdexcept>
-#include <stdio.h>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-void test_memcpy_parameter_async(
-    syclcompat::experimental::memcpy_parameter param, bool xpass) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  try {
-    syclcompat::experimental::memcpy_async(param);
-    assert(xpass);
-  } catch (std::runtime_error &) {
-    assert(!xpass);
-  }
-}
-
-void test_memcpy_parameter(syclcompat::experimental::memcpy_parameter param,
-                           bool xpass) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  try {
-    syclcompat::experimental::memcpy(param);
-    assert(xpass);
-  } catch (std::runtime_error &) {
-    assert(!xpass);
-  }
-}
-
-// Check (most) memcpy_parameter APIs raise std::runtime_error.
-void test_memcpy_parameter_xfails() {
-
-  {
-    // Empty `memcpy_params` passes in no bindless_image
-    // or image pointers. This is the code path that ought to pass.
-    syclcompat::experimental::memcpy_parameter params;
-    test_memcpy_parameter(params, true);
-    test_memcpy_parameter_async(params, true);
-  }
-
-  {
-    // Mimick passing a bindless image for source
-    syclcompat::experimental::memcpy_parameter params;
-    params.from.image_bindless =
-        reinterpret_cast<syclcompat::experimental::image_mem_wrapper *>(1);
-    test_memcpy_parameter(params, false);
-    test_memcpy_parameter_async(params, false);
-  }
-
-  {
-    // Mimick passing a bindless image for dest
-    syclcompat::experimental::memcpy_parameter params;
-    params.to.image_bindless =
-        reinterpret_cast<syclcompat::experimental::image_mem_wrapper *>(1);
-    test_memcpy_parameter(params, false);
-    test_memcpy_parameter_async(params, false);
-  }
-
-  {
-    // Mimick passing a bindless image for source & dest
-    syclcompat::experimental::memcpy_parameter params;
-    params.from.image_bindless =
-        reinterpret_cast<syclcompat::experimental::image_mem_wrapper *>(1);
-    params.to.image_bindless =
-        reinterpret_cast<syclcompat::experimental::image_mem_wrapper *>(1);
-    test_memcpy_parameter(params, false);
-    test_memcpy_parameter_async(params, false);
-  }
-
-  {
-    // Mimick passing an image for source
-    syclcompat::experimental::memcpy_parameter params;
-    params.from.image =
-        reinterpret_cast<syclcompat::experimental::image_matrix *>(1);
-    test_memcpy_parameter(params, false);
-    test_memcpy_parameter_async(params, false);
-  }
-
-  {
-    // Mimick passing an image for dest
-    syclcompat::experimental::memcpy_parameter params;
-    params.to.image =
-        reinterpret_cast<syclcompat::experimental::image_matrix *>(1);
-    test_memcpy_parameter(params, false);
-    test_memcpy_parameter_async(params, false);
-  }
-
-  {
-    // Mimick passing an image for source & dest
-    syclcompat::experimental::memcpy_parameter params;
-    params.from.image =
-        reinterpret_cast<syclcompat::experimental::image_matrix *>(1);
-    params.to.image =
-        reinterpret_cast<syclcompat::experimental::image_matrix *>(1);
-    test_memcpy_parameter(params, false);
-    test_memcpy_parameter_async(params, false);
-  }
-}
-
-int main() {
-  test_memcpy_parameter_xfails();
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_is_device_ptr_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/memory_is_device_ptr_usmnone.cpp
deleted file mode 100644
index dbde31a15c63b..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_is_device_ptr_usmnone.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-// ====------ memory_is_device_ptr.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCLCOMPAT_USM_LEVEL_NONE
-#include <sycl/detail/core.hpp>
-#include <syclcompat/memory.hpp>
-
-int main() {
-  float* f = (float*)syclcompat::malloc(sizeof(float));
-  bool pass = false;
-
-  if (syclcompat::is_device_ptr(f)) {
-    pass = true;
-  }
-
-  syclcompat::free(f);
-
-  return (pass ? 0 : 1);
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_diff_queues.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_diff_queues.cpp
deleted file mode 100644
index 6850e79c2bb65..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_management_diff_queues.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_management_diff_queue.cpp
- *
- *  Description:
- *    memory operations tests for operations when changing the default queue
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "memory_common.hpp"
-#include "memory_fixt.hpp"
-
-void test_memcpy() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  sycl::queue q{{sycl::property::queue::in_order()}};
-
-  constexpr int ELEMENTS = 5000;
-  constexpr int N1 = 1000;
-  float *h_A = (float *)malloc(ELEMENTS * sizeof(float));
-  float *h_B = (float *)malloc(ELEMENTS * sizeof(float));
-  float *h_C = (float *)malloc(ELEMENTS * sizeof(float));
-
-  for (int i = 0; i < ELEMENTS; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A = nullptr;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = (float *)syclcompat::malloc(ELEMENTS * sizeof(float), q);
-  syclcompat::memcpy((void *)d_A, (void *)h_A, N1 * sizeof(float), q);
-
-  syclcompat::set_default_queue(q);
-  syclcompat::memcpy((void *)(d_A + N1), (void *)h_B,
-                     (ELEMENTS - N1) * sizeof(float));
-
-  syclcompat::memcpy((void *)h_C, (void *)d_A, ELEMENTS * sizeof(float));
-
-  // verify
-  for (int i = 0; i < N1; i++) {
-    assert(h_A[i] == h_C[i]);
-  }
-
-  for (int i = N1; i < ELEMENTS; i++) {
-    assert(h_B[i] == h_C[i]);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-  syclcompat::free((void *)d_A);
-}
-
-void test_memset() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  sycl::queue q{{sycl::property::queue::in_order()}};
-
-  constexpr int PORTION = 5;
-  constexpr int ELEMENTS = PORTION * 3;
-
-  int *h_A = (int *)malloc(ELEMENTS * sizeof(int));
-  for (int i = 0; i < ELEMENTS; i++) {
-    h_A[i] = 4;
-  }
-
-  int *d_A = nullptr;
-
-  d_A = (int *)syclcompat::malloc(ELEMENTS * sizeof(int));
-  // hostA -> deviceA
-  syclcompat::memcpy((void *)d_A, (void *)h_A, ELEMENTS * sizeof(int), q);
-
-  // set d_A[0,..., PORTION - 1] = 0
-  syclcompat::memset((void *)d_A, 0, PORTION * sizeof(int), q);
-
-  syclcompat::set_default_queue(q);
-  // set d_A[PORTION,..., 2 * PORTION - 1] = 0x01010101
-  syclcompat::memset((void *)(d_A + PORTION), 1, PORTION * sizeof(int));
-  // deviceA -> hostA
-  syclcompat::memcpy((void *)h_A, (void *)d_A, ELEMENTS * sizeof(int));
-
-  // check d_A[0,..., PORTION - 1] = 0
-  for (int i = 0; i < PORTION; i++) {
-    assert(h_A[i] == 0);
-  }
-
-  // check d_A[PORTION,..., 2 * PORTION - 1] = 0x01010101
-  for (int i = PORTION; i < (2 * PORTION - 1); i++) {
-    assert(h_A[i] == 0x01010101);
-  }
-
-  // check d_A[2 * PORTION,..., ELEMENTS] = 4
-  for (int i = 2 * PORTION; i < ELEMENTS; i++) {
-    assert(h_A[i] == 4);
-  }
-
-  free(h_A);
-  syclcompat::free((void *)d_A);
-}
-
-int main() {
-  test_memcpy();
-  test_memset();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp
deleted file mode 100644
index 514a07bd5dd6f..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_management_shared.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_management_shared.cpp
- *
- *  Description:
- *    memory operations tests with shared memory
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ memory_management_test2.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// REQUIRES: aspect-usm_shared_allocations
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "memory_common.hpp"
-
-constexpr size_t DataW = 100;
-constexpr size_t DataH = 100;
-
-void test_shared_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::shared_memory<float, 1> s_A(DataW);
-  syclcompat::shared_memory<float, 1> s_B(DataW);
-  syclcompat::shared_memory<float, 1> s_C(DataW);
-
-  s_A.init();
-  s_B.init();
-  s_C.init();
-
-  for (int i = 0; i < DataW; i++) {
-    s_A[i] = 1.0f;
-    s_B[i] = 2.0f;
-  }
-
-  {
-    syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
-      float *d_A = s_A.get_ptr();
-      float *d_B = s_B.get_ptr();
-      float *d_C = s_C.get_ptr();
-      cgh.parallel_for(sycl::range<1>(DataW), [=](sycl::id<1> id) {
-        int i = id[0];
-        float *A = d_A;
-        float *B = d_B;
-        float *C = d_C;
-        C[i] = A[i] + B[i];
-      });
-    });
-    syclcompat::get_default_queue().wait_and_throw();
-  }
-
-  // verify hostD
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      assert(fabs(s_C[i] - s_A[i] - s_B[i]) <= 1e-5);
-    }
-  }
-}
-
-int main() {
-  test_shared_memory();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp
deleted file mode 100644
index e6c55134cdea4..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp
+++ /dev/null
@@ -1,551 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_management_test1.cpp
- *
- *  Description:
- *    memory operations tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ memory_management_test1.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "memory_common.hpp"
-#include "memory_fixt.hpp"
-
-void test_memcpy() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr int Num = 5000;
-  constexpr int N1 = 1000;
-  float *h_A = (float *)malloc(Num * sizeof(float));
-  float *h_B = (float *)malloc(Num * sizeof(float));
-  float *h_C = (float *)malloc(Num * sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A = nullptr;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-  syclcompat::memcpy((void *)d_A, (void *)h_A, N1 * sizeof(float));
-  syclcompat::memcpy((void *)(d_A + N1), (void *)h_B,
-                     (Num - N1) * sizeof(float));
-  syclcompat::memcpy((void *)h_C, (void *)d_A, Num * sizeof(float));
-  syclcompat::free((void *)d_A);
-
-  syclcompat::free(0);
-  syclcompat::free(NULL);
-  syclcompat::free(nullptr);
-
-  // verify
-  for (int i = 0; i < N1; i++) {
-    assert(h_A[i] == h_C[i]);
-  }
-
-  for (int i = N1; i < Num; i++) {
-    assert(h_B[i] == h_C[i]);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_memcpy_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  constexpr int Num = 5000;
-  constexpr int N1 = 1000;
-  float *h_A = (float *)malloc(Num * sizeof(float));
-  float *h_B = (float *)malloc(Num * sizeof(float));
-  float *h_C = (float *)malloc(Num * sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A = nullptr;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  syclcompat::memcpy((void *)d_A, (void *)h_A, N1 * sizeof(float), q);
-  syclcompat::memcpy((void *)(d_A + N1), (void *)h_B,
-                     (Num - N1) * sizeof(float), q);
-  syclcompat::memcpy((void *)h_C, (void *)d_A, Num * sizeof(float), q);
-  syclcompat::free((void *)d_A, q);
-
-  syclcompat::free(0, q);
-  syclcompat::free(NULL, q);
-  syclcompat::free(nullptr, q);
-
-  // verify
-  for (int i = 0; i < N1; i++) {
-    assert(h_A[i] == h_C[i]);
-  }
-
-  for (int i = N1; i < Num; i++) {
-    assert(h_B[i] == h_C[i]);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-template <size_t memset_size_bits = 8> void test_memset_impl() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  // ValueT -> int for memset and memset_d32, short for memset_d16.
-  using ValueT = std::conditional_t<
-      memset_size_bits == 8 || memset_size_bits == 32, int,
-      std::conditional_t<memset_size_bits == 16, short, void>>;
-  static_assert(!std::is_void_v<ValueT>,
-                "memset tests only work for 8, 16 and 32 bits");
-
-  constexpr int Num = 10;
-  ValueT *h_A = (ValueT *)malloc(Num * sizeof(ValueT));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 4;
-  }
-
-  ValueT *d_A = (ValueT *)syclcompat::malloc(Num * sizeof(ValueT));
-  // hostA -> deviceA
-  syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(ValueT));
-
-  // set d_A[0,..., 6] = 0
-  if constexpr (memset_size_bits == 8)
-    syclcompat::memset((void *)d_A, 0, (Num - 3) * sizeof(ValueT));
-  else if constexpr (memset_size_bits == 16)
-    syclcompat::memset_d16((void *)d_A, 0, (Num - 3));
-  else if constexpr (memset_size_bits == 32)
-    syclcompat::memset_d32((void *)d_A, 0, (Num - 3));
-
-  // deviceA -> hostA
-  syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(ValueT));
-
-  syclcompat::free((void *)d_A);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    assert(h_A[i] == 0);
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    assert(h_A[i] == 4);
-  }
-
-  free(h_A);
-}
-
-template <size_t memset_size_bits = 8> void test_memset_q_impl() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  // ValueT -> int for memset and memset_d32, short for memset_d16.
-  using ValueT = std::conditional_t<
-      memset_size_bits == 8 || memset_size_bits == 32, int,
-      std::conditional_t<memset_size_bits == 16, short, void>>;
-  static_assert(!std::is_void_v<ValueT>,
-                "memset tests only work for 8, 16 and 32 bits");
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  constexpr int Num = 10;
-  ValueT *h_A = (ValueT *)malloc(Num * sizeof(ValueT));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 4;
-  }
-
-  ValueT *d_A = (ValueT *)syclcompat::malloc(Num * sizeof(ValueT), q);
-  // hostA -> deviceA
-  syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(ValueT), q);
-
-  // set d_A[0,..., 6] = 0
-  if constexpr (memset_size_bits == 8)
-    syclcompat::memset((void *)d_A, 0, (Num - 3) * sizeof(ValueT), q);
-  else if constexpr (memset_size_bits == 16)
-    syclcompat::memset_d16((void *)d_A, 0, (Num - 3), q);
-  else if constexpr (memset_size_bits == 32)
-    syclcompat::memset_d32((void *)d_A, 0, (Num - 3), q);
-
-  // deviceA -> hostA
-  syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(ValueT), q);
-
-  syclcompat::free((void *)d_A, q);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    assert(h_A[i] == 0);
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    assert(h_A[i] == 4);
-  }
-
-  free(h_A);
-}
-
-void test_memset() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 8;
-  test_memset_impl<memset_size_in_bits>();
-}
-
-void test_memset_d16() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 16;
-  test_memset_impl<memset_size_in_bits>();
-}
-
-void test_memset_d32() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 32;
-  test_memset_impl<memset_size_in_bits>();
-}
-
-void test_memset_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 8;
-  test_memset_q_impl<memset_size_in_bits>();
-}
-
-void test_memset_d16_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 16;
-  test_memset_q_impl<memset_size_in_bits>();
-}
-
-void test_memset_d32_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 32;
-  test_memset_q_impl<memset_size_in_bits>();
-}
-
-template <typename T> void test_memcpy_t() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr int Num = 5000;
-  constexpr int N1 = 1000;
-  T *h_A = (T *)malloc(Num * sizeof(T));
-  T *h_B = (T *)malloc(Num * sizeof(T));
-  T *h_C = (T *)malloc(Num * sizeof(T));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = static_cast<T>(1);
-    h_B[i] = static_cast<T>(2);
-  }
-
-  T *d_A = nullptr;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = syclcompat::malloc<T>(Num);
-  syclcompat::memcpy<T>(d_A, h_A, N1);
-  syclcompat::memcpy<T>((d_A + N1), h_B, (Num - N1));
-  syclcompat::memcpy<T>(h_C, d_A, Num);
-  syclcompat::free((void *)d_A);
-
-  syclcompat::free(0);
-  syclcompat::free(NULL);
-  syclcompat::free(nullptr);
-
-  // verify
-  for (int i = 0; i < N1; i++) {
-    assert(h_A[i] == h_C[i]);
-  }
-
-  for (int i = N1; i < Num; i++) {
-    assert(h_B[i] == h_C[i]);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-template <typename T> void test_memcpy_t_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  constexpr int Num = 5000;
-  constexpr int N1 = 1000;
-  T *h_A = (T *)malloc(Num * sizeof(T));
-  T *h_B = (T *)malloc(Num * sizeof(T));
-  T *h_C = (T *)malloc(Num * sizeof(T));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = static_cast<T>(1);
-    h_B[i] = static_cast<T>(2);
-  }
-
-  T *d_A = nullptr;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = syclcompat::malloc<T>(Num, q);
-  syclcompat::memcpy<T>(d_A, h_A, N1, q);
-  syclcompat::memcpy<T>((d_A + N1), h_B, (Num - N1), q);
-  syclcompat::memcpy<T>(h_C, d_A, Num, q);
-  syclcompat::free((void *)d_A, q);
-
-  syclcompat::free(0, q);
-  syclcompat::free(NULL, q);
-  syclcompat::free(nullptr, q);
-
-  // verify
-  for (int i = 0; i < N1; i++) {
-    assert(h_A[i] == h_C[i]);
-  }
-
-  for (int i = N1; i < Num; i++) {
-    assert(h_B[i] == h_C[i]);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-template <typename T> void test_fill() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  bool skip = should_skip<T>(syclcompat::get_current_device());
-  if (skip) // Unsupported aspect
-    return;
-
-  constexpr int Num = 10;
-  T *h_A = (T *)malloc(Num * sizeof(T));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = static_cast<T>(4);
-  }
-
-  T *d_A = nullptr;
-
-  d_A = syclcompat::malloc<T>(Num);
-  // hostA -> deviceA
-  syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(T));
-
-  // set d_A[0,..., 6] = 0
-  syclcompat::fill((void *)d_A, static_cast<T>(0), (Num - 3));
-
-  // deviceA -> hostA
-  syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(T));
-
-  syclcompat::free((void *)d_A);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    assert(h_A[i] == static_cast<T>(0));
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    assert(h_A[i] == static_cast<T>(4));
-  }
-
-  free(h_A);
-}
-
-template <typename T> void test_fill_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  bool skip = should_skip<T>(syclcompat::get_current_device());
-  if (skip) // Unsupported aspect
-    return;
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  constexpr int Num = 10;
-  T *h_A = (T *)malloc(Num * sizeof(T));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = static_cast<T>(4);
-  }
-
-  T *d_A = nullptr;
-
-  d_A = syclcompat::malloc<T>(Num, q);
-  // hostA -> deviceA
-  syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(T), q);
-
-  // set d_A[0,..., 6] = 0
-  syclcompat::fill((void *)d_A, static_cast<T>(0), (Num - 3), q);
-
-  // deviceA -> hostA
-  syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(T), q);
-
-  syclcompat::free((void *)d_A, q);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    assert(h_A[i] == static_cast<T>(0));
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    assert(h_A[i] == static_cast<T>(4));
-  }
-
-  free(h_A);
-}
-
-constexpr size_t size = 2000;
-constexpr size_t offset = 1000;
-
-syclcompat::constant_memory<float, 1> d_A(size);
-syclcompat::constant_memory<float, 1> d_B(size);
-
-void test_constant_memcpy() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-
-  float *h_A = (float *)malloc(size / 2 * sizeof(float));
-  float *h_B = (float *)malloc(size / 2 * sizeof(float));
-  float *h_C = (float *)malloc(size * sizeof(float));
-  float *h_D = (float *)malloc(size * sizeof(float));
-
-  for (int i = 0; i < size / 2; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..999] -> deviceA[1000..1999]
-  // deviceA[0..1999] -> hostC[0..1999]
-  // deviceA[0..999] -> deviceB[0..999]
-  // deviceA[1000..1999] -> deviceB[1000..1999]
-  // deviceB[0..1999] -> hostD[0..1999]
-
-  syclcompat::memcpy(d_A.get_ptr(), h_A, offset * sizeof(float));
-  syclcompat::memcpy((char *)d_A.get_ptr() + offset * sizeof(float), h_B,
-                     (size - offset) * sizeof(float));
-  syclcompat::memcpy(h_C, d_A.get_ptr(), size * sizeof(float));
-  syclcompat::memcpy(d_B.get_ptr(), d_A.get_ptr(), offset * sizeof(float));
-  syclcompat::memcpy((char *)d_B.get_ptr() + offset * sizeof(float),
-                     (void *)((size_t)d_A.get_ptr() + offset * sizeof(float)),
-                     (size - offset) * sizeof(float));
-  syclcompat::memcpy(h_D, d_B.get_ptr(), size * sizeof(float));
-
-  // verify hostD
-  for (int i = 0; i < offset; i++) {
-    assert(fabs(h_A[i] - h_D[i]) <= 1e-5);
-  }
-
-  for (int i = offset; i < size; i++) {
-    assert(fabs(h_B[i - offset] - h_D[i]) <= 1e-5);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-  free(h_D);
-}
-
-void test_constant_memcpy_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-
-  constexpr size_t size = 2000;
-  constexpr size_t offset = 1000;
-  syclcompat::constant_memory<float, 1> d_A(size, q);
-  syclcompat::constant_memory<float, 1> d_B(size, q);
-
-  float *h_A = (float *)malloc(size / 2 * sizeof(float));
-  float *h_B = (float *)malloc(size / 2 * sizeof(float));
-  float *h_C = (float *)malloc(size * sizeof(float));
-  float *h_D = (float *)malloc(size * sizeof(float));
-
-  for (int i = 0; i < size / 2; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..999] -> deviceA[1000..1999]
-  // deviceA[0..1999] -> hostC[0..1999]
-  // deviceA[0..999] -> deviceB[0..999]
-  // deviceA[1000..1999] -> deviceB[1000..1999]
-  // deviceB[0..1999] -> hostD[0..1999]
-
-  syclcompat::memcpy(d_A.get_ptr(), h_A, offset * sizeof(float), q);
-
-  syclcompat::memcpy((char *)d_A.get_ptr() + offset * sizeof(float), h_B,
-                     (size - offset) * sizeof(float), q);
-  syclcompat::memcpy(h_C, d_A.get_ptr(), size * sizeof(float), q);
-
-  syclcompat::memcpy(d_B.get_ptr(), d_A.get_ptr(), offset * sizeof(float), q);
-
-  syclcompat::memcpy((char *)d_B.get_ptr() + offset * sizeof(float),
-                     (void *)((size_t)d_A.get_ptr() + offset * sizeof(float)),
-                     (size - offset) * sizeof(float), q);
-
-  syclcompat::memcpy(h_D, d_B.get_ptr(), size * sizeof(float), q);
-
-  // verify hostD
-  for (int i = 0; i < offset; i++) {
-    assert(fabs(h_A[i] - h_D[i]) <= 1e-5);
-  }
-
-  for (int i = offset; i < size; i++) {
-    assert(fabs(h_B[i - offset] - h_D[i]) <= 1e-5);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-  free(h_D);
-}
-
-int main() {
-  test_memcpy();
-  test_memcpy_q();
-  test_memset();
-  test_memset_q();
-  test_memset_d16();
-  test_memset_d16_q();
-  test_memset_d32();
-  test_memset_d32_q();
-  test_constant_memcpy();
-  test_constant_memcpy_q();
-
-  INSTANTIATE_ALL_TYPES(value_type_list, test_memcpy_t);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_memcpy_t_q);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_fill);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_fill_q);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test1_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test1_usmnone.cpp
deleted file mode 100644
index ac71e6538b9b9..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_management_test1_usmnone.cpp
+++ /dev/null
@@ -1,503 +0,0 @@
-// ====------ memory_management_test1_usmnone.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// UNSUPPORTED: linux && gpu-intel-gen12 && (!build-mode && run-mode)
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/17966
-
-#define SYCLCOMPAT_USM_LEVEL_NONE
-#include <sycl/detail/core.hpp>
-#include <syclcompat/memory.hpp>
-#include "memory_common.hpp"
-
-void test_memcpy() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int N1 = 1000;
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-  syclcompat::memcpy((void*) d_A, (void*) h_A, N1 * sizeof(float));
-  syclcompat::memcpy((void*) (d_A + N1), (void*) h_B, (Num-N1) * sizeof(float));
-  syclcompat::memcpy((void*) h_C, (void*) d_A, Num * sizeof(float));
-  syclcompat::free((void*)d_A);
-
-  syclcompat::free(0);
-  syclcompat::free(NULL);
-  syclcompat::free(nullptr);
-
-  // verify
-  for(int i = 0; i < N1; i++){
-      if (fabs(h_A[i] - h_C[i]) > 1e-5) {
-          fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i],  h_B[i],  h_C[i]);
-          fprintf(stderr,"Result verification failed at element %d:\n", i);
-          exit(EXIT_FAILURE);
-      }
-  }
-
-  for(int i = N1; i < Num; i++){
-      if (fabs(h_B[i] - h_C[i]) > 1e-5) {
-          fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i],  h_B[i],  h_C[i]);
-          fprintf(stderr,"Result verification failed at element %d:\n", i);
-          exit(EXIT_FAILURE);
-      }
-  }
-
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_vecadd() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0,
-                  // CORC-6222 has fixed this issue, but the version of dpcpp used in ics
-                  // environment has not cover this patch. After it has this patch,
-                  // Offest could be set to 100, and current test case will pass.
-
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  //syclcompat::dev_mgr::instance().select_device(0);
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A, *d_B, *d_C;
-  // hostA -> deviceA
-  // hostB -> deviceB
-  // kernel: deviceC = deviceA + deviceB
-  // deviceA -> hostC
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_B = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_C = (float *)syclcompat::malloc(Num * sizeof(float));
-  syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float));
-  syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float));
-
-  d_A += Offset;
-  d_B += Offset;
-  d_C += Offset;
-
-  {
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_A = syclcompat::get_buffer_and_offset(d_A);
-    size_t offset_A = buffer_and_offset_A.second;
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_B = syclcompat::get_buffer_and_offset(d_B);
-    size_t offset_B = buffer_and_offset_A.second;
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_C = syclcompat::get_buffer_and_offset(d_C);
-    size_t offset_C = buffer_and_offset_A.second;
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      auto d_A_acc = buffer_and_offset_A.first.get_access<sycl::access::mode::read_write>(cgh);
-      auto d_B_acc = buffer_and_offset_B.first.get_access<sycl::access::mode::read_write>(cgh);
-      auto d_C_acc = buffer_and_offset_C.first.get_access<sycl::access::mode::read_write>(cgh);
-
-        cgh.parallel_for<class vectorAdd3_1>(
-          sycl::range<1>(Num-Offset),
-          [=](sycl::id<1> id) {
-
-            float *A = (float*)(&d_A_acc[0]+offset_A);
-            float *B = (float*)(&d_B_acc[0]+offset_B);
-            float *C = (float*)(&d_C_acc[0]+offset_C);
-             int i = id[0];
-
-            C[i] = A[i] + B[i];
-          });
-      });
-  }
-
-  syclcompat::memcpy((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float));
-  syclcompat::free((void*)d_A);
-  syclcompat::free((void*)d_B);
-  syclcompat::free((void*)d_C);
-
-  // verify
-  for(int i = Offset; i < Num; i++){
-      if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) {
-        fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i],  h_B[i],  h_C[i]);
-        fprintf(stderr,"Result verification failed at element %d:\n", i);
-        exit(EXIT_FAILURE);
-      }
-  }
-
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-
-void test_memset() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 10;
-  int *h_A = (int*)malloc(Num*sizeof(int));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 4;
-  }
-
-  int *d_A;
-
-  d_A = (int *)syclcompat::malloc(Num * sizeof(int));
-  // hostA -> deviceA
-  syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(int));
-
-  // set d_A[0,..., 6] = 0
-  syclcompat::memset((void*) d_A, 0, (Num - 3) * sizeof(int));
-
-  // deviceA -> hostA
-  syclcompat::memcpy((void*) h_A, (void*) d_A, Num * sizeof(int));
-
-  syclcompat::free((void*)d_A);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    if (h_A[i] != 0) {
-      fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]);
-      fprintf(stderr, "Result verification failed at element [%d]!\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    if (h_A[i] != 4) {
-      fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]);
-      fprintf(stderr, "Result verification failed at element h_A[%d]!\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-
-  free(h_A);
-}
-
-const unsigned int Num = 5000;
-const unsigned int N1 = 1000;
-syclcompat::constant_memory<float, 1> d_A(Num * sizeof(float));
-syclcompat::constant_memory<float, 1> d_B(Num * sizeof(float));
-
-void test_constant_memcpy() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  float h_A[Num];
-  float h_B[Num];
-  float h_C[Num];
-  float h_D[Num];
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> deviceB[0..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  // deviceB[0..4999] -> hostD[0..4999]
-
-  syclcompat::memcpy((void *)d_A.get_ptr(), (void *)&h_A[0], N1 * sizeof(float));
-  syclcompat::memcpy((char *)d_A.get_ptr() + N1 * sizeof(float), (void*) h_B, (Num-N1) * sizeof(float));
-  syclcompat::memcpy((void *)h_C, (void *)d_A.get_ptr(), Num * sizeof(float));
-
-  syclcompat::memcpy((void *)d_B.get_ptr(), (void *)d_A.get_ptr(), N1 * sizeof(float));
-  syclcompat::memcpy((char *)d_B.get_ptr() + N1 * sizeof(float), (void *)((size_t)d_A.get_ptr() + N1* sizeof(float)), (Num - N1) * sizeof(float));
-  syclcompat::memcpy((void *)h_D, (void *)d_B.get_ptr(), Num * sizeof(float));
-
-  // verify hostD
-  for (int i = 0; i < N1; i++) {
-    if (fabs(h_A[i] - h_D[i]) > 1e-5) {
-      fprintf(stderr, "Check: Elements are A = %f, D = %f:\n", h_A[i], h_D[i]);
-      fprintf(stderr, "Result verification failed at element %d:\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  for (int i = N1; i < Num; i++) {
-    if (fabs(h_B[i] - h_D[i]) > 1e-5) {
-      fprintf(stderr, "Check: Elements are B = %f, D = %f:\n",   h_B[i], h_D[i]);
-      fprintf(stderr, "Result verification failed at element %d:\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-}
-
-void test_memcpy(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int N1 = 1000;
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  syclcompat::memcpy((void*) d_A, (void*) h_A, N1 * sizeof(float), q);
-  syclcompat::memcpy((void*) (d_A + N1), (void*) h_B, (Num-N1) * sizeof(float), q);
-  syclcompat::memcpy((void*) h_C, (void*) d_A, Num * sizeof(float), q);
-  syclcompat::free((void*)d_A, q);
-
-  syclcompat::free(0, q);
-  syclcompat::free(NULL, q);
-  syclcompat::free(nullptr, q);
-
-  // verify
-  for(int i = 0; i < N1; i++){
-      if (fabs(h_A[i] - h_C[i]) > 1e-5) {
-          fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i],  h_B[i],  h_C[i]);
-          fprintf(stderr,"Result verification failed at element %d:\n", i);
-          exit(EXIT_FAILURE);
-      }
-  }
-
-  for(int i = N1; i < Num; i++){
-      if (fabs(h_B[i] - h_C[i]) > 1e-5) {
-          fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i],  h_B[i],  h_C[i]);
-          fprintf(stderr,"Result verification failed at element %d:\n", i);
-          exit(EXIT_FAILURE);
-      }
-  }
-
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_vecadd(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  int Num = 5000;
-  int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0,
-                  // CORC-6222 has fixed this issue, but the version of dpcpp used in ics
-                  // environment has not cover this patch. After it has this patch,
-                  // Offest could be set to 100, and current test case will pass.
-
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  //syclcompat::dev_mgr::instance().select_device(0);
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A, *d_B, *d_C;
-  // hostA -> deviceA
-  // hostB -> deviceB
-  // kernel: deviceC = deviceA + deviceB
-  // deviceA -> hostC
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  d_B = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  d_C = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float), q);
-  syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float), q);
-
-  d_A += Offset;
-  d_B += Offset;
-  d_C += Offset;
-
-  {
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_A = syclcompat::get_buffer_and_offset(d_A);
-    size_t offset_A = buffer_and_offset_A.second;
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_B = syclcompat::get_buffer_and_offset(d_B);
-    size_t offset_B = buffer_and_offset_A.second;
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_C = syclcompat::get_buffer_and_offset(d_C);
-    size_t offset_C = buffer_and_offset_A.second;
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      auto d_A_acc = buffer_and_offset_A.first.get_access<sycl::access::mode::read_write>(cgh);
-      auto d_B_acc = buffer_and_offset_B.first.get_access<sycl::access::mode::read_write>(cgh);
-      auto d_C_acc = buffer_and_offset_C.first.get_access<sycl::access::mode::read_write>(cgh);
-
-        cgh.parallel_for<class vectorAdd3_2>(
-          sycl::range<1>(Num-Offset),
-          [=](sycl::id<1> id) {
-
-            float *A = (float*)(&d_A_acc[0]+offset_A);
-            float *B = (float*)(&d_B_acc[0]+offset_B);
-            float *C = (float*)(&d_C_acc[0]+offset_C);
-             int i = id[0];
-
-            C[i] = A[i] + B[i];
-          });
-      });
-  }
-
-  syclcompat::memcpy((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float), q);
-  syclcompat::free((void*)d_A, q);
-  syclcompat::free((void*)d_B, q);
-  syclcompat::free((void*)d_C, q);
-
-  // verify
-  for(int i = Offset; i < Num; i++){
-      if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) {
-        fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i],  h_B[i],  h_C[i]);
-        fprintf(stderr,"Result verification failed at element %d:\n", i);
-        exit(EXIT_FAILURE);
-      }
-  }
-
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-
-void test_memset(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 10;
-  int *h_A = (int*)malloc(Num*sizeof(int));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 4;
-  }
-
-  int *d_A;
-
-  d_A = (int *)syclcompat::malloc(Num * sizeof(int), q);
-  // hostA -> deviceA
-  syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(int), q);
-
-  // set d_A[0,..., 6] = 0
-  syclcompat::memset((void*) d_A, 0, (Num - 3) * sizeof(int), q);
-
-  // deviceA -> hostA
-  syclcompat::memcpy((void*) h_A, (void*) d_A, Num * sizeof(int), q);
-
-  syclcompat::free((void*)d_A, q);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    if (h_A[i] != 0) {
-      fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]);
-      fprintf(stderr, "Result verification failed at element [%d]!\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    if (h_A[i] != 4) {
-      fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]);
-      fprintf(stderr, "Result verification failed at element h_A[%d]!\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-
-  free(h_A);
-}
-
-void test_constant_memcpy(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  const unsigned int Num = 5000;
-  const unsigned int N1 = 1000;
-  syclcompat::constant_memory<float, 1> d_A(Num * sizeof(float));
-  syclcompat::constant_memory<float, 1> d_B(Num * sizeof(float));
-
-  float h_A[Num];
-  float h_B[Num];
-  float h_C[Num];
-  float h_D[Num];
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> deviceB[0..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  // deviceB[0..4999] -> hostD[0..4999]
-
-  syclcompat::memcpy((void *)d_A.get_ptr(), (void *)&h_A[0], N1 * sizeof(float), q);
-  syclcompat::memcpy((char *)d_A.get_ptr() + N1 * sizeof(float), (void*) h_B, (Num-N1) * sizeof(float), q);
-  syclcompat::memcpy((void *)h_C, (void *)d_A.get_ptr(), Num * sizeof(float), q);
-
-  syclcompat::memcpy((void *)d_B.get_ptr(), (void *)d_A.get_ptr(), N1 * sizeof(float), q);
-  syclcompat::memcpy((char *)d_B.get_ptr() + N1 * sizeof(float), (void *)((size_t)d_A.get_ptr() + N1* sizeof(float)), (Num - N1) * sizeof(float), q);
-  syclcompat::memcpy((void *)h_D, (void *)d_B.get_ptr(), Num * sizeof(float), q);
-
-  // verify hostD
-  for (int i = 0; i < N1; i++) {
-    if (fabs(h_A[i] - h_D[i]) > 1e-5) {
-      fprintf(stderr, "Check: Elements are A = %f, D = %f:\n", h_A[i], h_D[i]);
-      fprintf(stderr, "Result verification failed at element %d:\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  for (int i = N1; i < Num; i++) {
-    if (fabs(h_B[i] - h_D[i]) > 1e-5) {
-      fprintf(stderr, "Check: Elements are B = %f, D = %f:\n",   h_B[i], h_D[i]);
-      fprintf(stderr, "Result verification failed at element %d:\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-}
-
-int main() {
-  test_memcpy();
-  test_vecadd();
-  test_memset();
-  test_constant_memcpy();
-
-  sycl::queue q;
-  test_memcpy(q);
-  test_vecadd(q);
-  test_memset(q);
-  test_constant_memcpy(q);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp
deleted file mode 100644
index 47d8488294e61..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp
+++ /dev/null
@@ -1,324 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_management_test2.cpp
- *
- *  Description:
- *    memory operations tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ memory_management_test2.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "memory_common.hpp"
-
-constexpr size_t DataW = 100;
-constexpr size_t DataH = 100;
-
-void test_memcpy_pitched() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  size_t width = 6;
-  size_t height = 8;
-  float *h_data;
-  float *h_ref;
-  size_t h_pitch = sizeof(float) * width;
-  h_data = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_ref[i] = (float)i;
-
-  // alloc device memory.
-  size_t d_pitch;
-  float *d_data;
-  d_data = (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height);
-
-  // copy to Device.
-  syclcompat::memcpy(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width,
-                     height);
-
-  // copy back to host.
-  syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width,
-                     height);
-
-  check(h_data, h_ref, width * height);
-
-  // memset device data.
-  syclcompat::memset(d_data, d_pitch, 0x1, sizeof(float) * width, height);
-
-  // copy back to host
-  syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width,
-                     height);
-
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * sizeof(float));
-  check(h_data, h_ref, width * height);
-
-  free(h_data);
-  free(h_ref);
-  syclcompat::free((void *)d_data);
-}
-
-void test_memcpy_kernel() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int Offset =
-      0; // Current dpcpp version in ics environment has bugs with Offset >
-         // 0, CORC-6222 has fixed this issue, but the version of dpcpp used in
-         // ics environment has not cover this patch. After it has this patch,
-         // Offest could be set to 100, and current test case will pass.
-
-  float *h_A = (float *)malloc(Num * sizeof(float));
-  float *h_B = (float *)malloc(Num * sizeof(float));
-  float *h_C = (float *)malloc(Num * sizeof(float));
-
-  // syclcompat::dev_mgr::instance().select_device(0);
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A, *d_B, *d_C;
-  // hostA -> deviceA
-  // hostB -> deviceB
-  // kernel: deviceC = deviceA + deviceB
-  // deviceA -> hostC
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_B = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_C = (float *)syclcompat::malloc(Num * sizeof(float));
-  syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(float));
-  syclcompat::memcpy((void *)d_B, (void *)h_B, Num * sizeof(float));
-
-  d_A += Offset;
-  d_B += Offset;
-  d_C += Offset;
-
-  {
-    syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
-      cgh.parallel_for(sycl::range<1>(Num - Offset), [=](sycl::id<1> id) {
-        float *A = d_A;
-        float *B = d_B;
-        float *C = d_C;
-        int i = id[0];
-        C[i] = A[i] + B[i];
-      });
-    });
-    syclcompat::get_default_queue().wait_and_throw();
-  }
-
-  syclcompat::memcpy((void *)(h_C + Offset), (void *)d_C,
-                     (Num - Offset) * sizeof(float));
-  syclcompat::free((void *)d_A);
-  syclcompat::free((void *)d_B);
-  syclcompat::free((void *)d_C);
-
-  // verify
-  for (int i = Offset; i < Num; i++) {
-    assert(fabs(h_C[i] - h_A[i] - h_B[i]) <= 1e-5);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_global_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  float h_A[DataW][DataH];
-  float h_B[DataW][DataH];
-  float h_C[DataW][DataH];
-
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      h_A[i][j] = 1.0f;
-      h_B[i][j] = 2.0f;
-    }
-  }
-
-  syclcompat::global_memory<float, 2> g_A(DataW, DataH);
-  syclcompat::global_memory<float, 2> g_B(DataW, DataH);
-  syclcompat::global_memory<float, 2> g_C(DataW, DataH);
-
-  g_A.init();
-  g_B.init();
-  g_C.init();
-
-  syclcompat::memcpy((void *)g_A.get_ptr(), (void *)&h_A[0][0],
-                     DataW * DataH * sizeof(float));
-  syclcompat::memcpy((void *)g_B.get_ptr(), (void *)&h_B[0][0],
-                     DataW * DataH * sizeof(float));
-
-  {
-    syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
-      auto g_A_acc = g_A.get_access(cgh);
-      auto g_B_acc = g_B.get_access(cgh);
-      auto g_C_acc = g_C.get_access(cgh);
-      cgh.parallel_for(sycl::range<2>(DataW, DataH), [=](sycl::id<2> id) {
-        syclcompat::accessor<float, syclcompat::memory_region::global, 2> A(
-            g_A_acc);
-        syclcompat::accessor<float, syclcompat::memory_region::global, 2> B(
-            g_B_acc);
-        syclcompat::accessor<float, syclcompat::memory_region::global, 2> C(
-            g_C_acc);
-        int i = id[0], j = id[1];
-        C[i][j] = A[i][j] + B[i][j];
-      });
-    });
-    syclcompat::get_default_queue().wait_and_throw();
-  }
-  syclcompat::memcpy((void *)&h_C[0][0], (void *)g_C.get_ptr(),
-                     DataW * DataH * sizeof(float));
-
-  // verify hostD
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      assert(fabs(h_C[i][j] - h_A[i][j] - h_B[i][j]) <= 1e-5);
-    }
-  }
-}
-
-void test_constant_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  float h_A[DataW][DataH];
-  float h_B[DataW][DataH];
-  float h_C[DataW][DataH];
-
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      h_A[i][j] = 1.0f;
-      h_B[i][j] = 2.0f;
-    }
-  }
-
-  syclcompat::constant_memory<float, 2> c_A(DataW, DataH);
-  syclcompat::constant_memory<float, 2> c_B(DataW, DataH);
-  syclcompat::global_memory<float, 2> g_C(DataW, DataH);
-
-  c_A.init();
-  c_B.init();
-  g_C.init();
-  syclcompat::memcpy((void *)c_A.get_ptr(), (void *)&h_A[0][0],
-                     DataW * DataH * sizeof(float));
-  syclcompat::memcpy((void *)c_B.get_ptr(), (void *)&h_B[0][0],
-                     DataW * DataH * sizeof(float));
-
-  {
-    syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
-      auto c_A_acc = c_A.get_access(cgh);
-      auto c_B_acc = c_B.get_access(cgh);
-      auto g_C_acc = g_C.get_access(cgh);
-      cgh.parallel_for(sycl::range<2>(DataW, DataH), [=](sycl::id<2> id) {
-        syclcompat::accessor<float, syclcompat::memory_region::constant, 2> A(
-            c_A_acc);
-        syclcompat::accessor<float, syclcompat::memory_region::constant, 2> B(
-            c_B_acc);
-        syclcompat::accessor<float, syclcompat::memory_region::global, 2> C(
-            g_C_acc);
-        int i = id[0], j = id[1];
-        C[i][j] = A[i][j] + B[i][j];
-      });
-    });
-    syclcompat::get_default_queue().wait_and_throw();
-  }
-  syclcompat::memcpy((void *)&h_C[0][0], (void *)g_C.get_ptr(),
-                     DataW * DataH * sizeof(float));
-  // verify hostD
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      assert(fabs(h_C[i][j] - h_A[i][j] - h_B[i][j]) <= 1e-5);
-    }
-  }
-}
-
-void test_memcpy_pitched_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  size_t width = 6;
-  size_t height = 8;
-  float *h_data;
-  float *h_ref;
-  size_t h_pitch = sizeof(float) * width;
-  h_data = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_ref[i] = (float)i;
-
-  // alloc device memory.
-  size_t d_pitch;
-  float *d_data;
-  d_data =
-      (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height, q);
-
-  // copy to Device.
-  syclcompat::memcpy(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width,
-                     height, q);
-
-  // copy back to host.
-  syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width,
-                     height, q);
-
-  check(h_data, h_ref, width * height);
-
-  // memset device data.
-  syclcompat::memset(d_data, d_pitch, 0x1, sizeof(float) * width, height, q);
-
-  // copy back to host
-  syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width,
-                     height, q);
-
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * sizeof(float));
-  check(h_data, h_ref, width * height);
-
-  free(h_data);
-  free(h_ref);
-  syclcompat::free((void *)d_data, q);
-}
-
-int main() {
-  test_memcpy_kernel();
-  test_memcpy_pitched();
-  test_memcpy_pitched_q();
-
-  test_global_memory();
-  test_constant_memory();
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test2_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test2_usmnone.cpp
deleted file mode 100644
index e36f2384382af..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_management_test2_usmnone.cpp
+++ /dev/null
@@ -1,551 +0,0 @@
-// ====------ memory_management_test_mempcy_2_usmnone.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCLCOMPAT_USM_LEVEL_NONE
-#include <sycl/detail/core.hpp>
-#include <syclcompat/memory.hpp>
-#include "memory_common.hpp"
-
-void check(float *h_data, float *h_ref, size_t width, size_t height,
-           size_t depth) {
-  for (int i = 0; i < width * height * depth; i++) {
-    float diff = fabs(h_data[i] - h_ref[i]);
-    if (diff > 1.e-6) {
-      printf("Verification failed!");
-      printf("h_data[%d]=%f, h_ref[%d]=%f, diff=%f\n", i, h_data[i], i,
-             h_ref[i], diff);
-      exit(-1);
-    }
-  }
-}
-
-
-void test_mempcy_pitched() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 6;
-  size_t height = 8;
-  float *h_data;
-  float *h_ref;
-  size_t h_pitch = sizeof(float) * width;
-  h_data = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_ref[i] = (float)i;
-
-  // alloc device memory.
-  size_t d_pitch;
-  float *d_data;
-  d_data = (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height);
-
-  // copy to Device.
-  syclcompat::memcpy(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width, height);
-
-  // copy back to host.
-  syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height);
-
-  check(h_data, h_ref, width, height, 1);
-
-  // memset device data.
-  syclcompat::memset(d_data, d_pitch, 0x1, sizeof(float) * width, height);
-
-  // copy back to host
-  syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height);
-
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * sizeof(float));
-  check(h_data, h_ref, width, height, 1);
-
-  free(h_data);
-  free(h_ref);
-  syclcompat::free((void *)d_data);
-}
-
-void test_memcpy_reinterp_kernel() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A, *d_B, *d_C;
-  // hostA -> deviceA
-  // hostB -> deviceB
-  // kernel: deviceC = deviceA + deviceB
-  // deviceA -> hostC
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_B = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_C = (float *)syclcompat::malloc(Num * sizeof(float));
-  syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float));
-  syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float));
-
-  {
-    syclcompat::buffer_t buffer_A = syclcompat::get_buffer(d_A);
-    syclcompat::buffer_t buffer_B = syclcompat::get_buffer(d_B);
-    syclcompat::buffer_t buffer_C = syclcompat::get_buffer(d_C);
-
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      auto A = buffer_A.reinterpret<float>().get_access<sycl::access::mode::read_write>(cgh);
-      auto B = buffer_B.reinterpret<float>().get_access<sycl::access::mode::read_write>(cgh);
-      auto C = buffer_C.reinterpret<float>().get_access<sycl::access::mode::read_write>(cgh);
-
-        cgh.parallel_for(
-          sycl::range<1>(Num),
-          [=](sycl::id<1> id) {
-             int i = id[0];
-
-            C[i] = A[i] + B[i];
-          });
-      });
-      syclcompat::get_default_queue().wait_and_throw();
-  }
-
-  syclcompat::memcpy((void*) (h_C), (void*) d_C, (Num) * sizeof(float));
-  syclcompat::free((void*)d_A);
-  syclcompat::free((void*)d_B);
-  syclcompat::free((void*)d_C);
-
-  // verify
-  for(int i = 0; i < Num; i++){
-      if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) {
-        fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i],  h_B[i],  h_C[i]);
-        fprintf(stderr,"Result verification failed at element %d:\n", i);
-        exit(EXIT_FAILURE);
-      }
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_memcpy_kernel() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A, *d_B, *d_C;
-  // hostA -> deviceA
-  // hostB -> deviceB
-  // kernel: deviceC = deviceA + deviceB
-  // deviceA -> hostC
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_B = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_C = (float *)syclcompat::malloc(Num * sizeof(float));
-  syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float));
-  syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float));
-
-  {
-    auto buffer_A = syclcompat::get_buffer<float>(d_A);
-    auto buffer_B = syclcompat::get_buffer<float>(d_B);
-    auto buffer_C = syclcompat::get_buffer<float>(d_C);
-
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      auto A = buffer_A.get_access<sycl::access::mode::read_write>(cgh);
-      auto B = buffer_B.get_access<sycl::access::mode::read_write>(cgh);
-      auto C = buffer_C.get_access<sycl::access::mode::read_write>(cgh);
-
-        cgh.parallel_for(
-          sycl::range<1>(Num),
-          [=](sycl::id<1> id) {
-             int i = id[0];
-
-            C[i] = A[i] + B[i];
-          });
-      });
-      syclcompat::get_default_queue().wait_and_throw();
-  }
-
-  syclcompat::memcpy((void*) (h_C), (void*) d_C, Num * sizeof(float));
-  syclcompat::free((void*)d_A);
-  syclcompat::free((void*)d_B);
-  syclcompat::free((void*)d_C);
-
-  // verify
-  for(int i = 0; i < Num; i++){
-      if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) {
-        fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i],  h_B[i],  h_C[i]);
-        fprintf(stderr,"Result verification failed at element %d:\n", i);
-        exit(EXIT_FAILURE);
-      }
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_access_wrapper() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0,
-                  // CORC-6222 has fixed this issue, but the version of dpcpp used in ics
-                  // environment has not cover this patch. After it has this patch,
-                  // Offest could be set to 100, and current test case will pass.
-
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A, *d_B, *d_C;
-  // hostA -> deviceA
-  // hostB -> deviceB
-  // kernel: deviceC = deviceA + deviceB
-  // deviceA -> hostC
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_B = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_C = (float *)syclcompat::malloc(Num * sizeof(float));
-  syclcompat::memcpy((void*) d_A, (void*) h_A, Num * sizeof(float));
-  syclcompat::memcpy((void*) d_B, (void*) h_B, Num * sizeof(float));
-
-  d_A += Offset;
-  d_B += Offset;
-  d_C += Offset;
-
-  {
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      syclcompat::access_wrapper<float *> d_A_acc(d_A, cgh);
-      syclcompat::access_wrapper<float *> d_B_acc(d_B, cgh);
-      syclcompat::access_wrapper<float *> d_C_acc(d_C, cgh);
-
-        cgh.parallel_for(
-          sycl::range<1>(Num-Offset),
-          [=](sycl::id<1> id) {
-
-            float *A = d_A_acc.get_raw_pointer();
-            float *B = d_B_acc.get_raw_pointer();
-            float *C = d_C_acc.get_raw_pointer();
-             int i = id[0];
-            C[i] = A[i] + B[i];
-          });
-      });
-      syclcompat::get_default_queue().wait_and_throw();
-  }
-
-  syclcompat::memcpy((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float));
-  syclcompat::free((void*)d_A);
-  syclcompat::free((void*)d_B);
-  syclcompat::free((void*)d_C);
-
-  // verify
-  for(int i = Offset; i < Num; i++){
-      if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) {
-        fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i],  h_B[i],  h_C[i]);
-        fprintf(stderr,"Result verification failed at element %d:\n", i);
-        exit(EXIT_FAILURE);
-      }
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-#define DataW 100
-#define DataH 100
-syclcompat::constant_memory<float, 2> c_A(DataW, DataH);
-syclcompat::constant_memory<float, 2> c_B(DataW, DataH);
-syclcompat::global_memory<float, 2> c_C(DataW, DataH);
-
-void test_constant_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  float h_A[DataW][DataH];
-  float h_B[DataW][DataH];
-  float h_C[DataW][DataH];
-
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      h_A[i][j] = 1.0f;
-      h_B[i][j] = 2.0f;
-    }
-  }
-
-  c_A.init();
-  c_B.init();
-  c_C.init();
-  syclcompat::memcpy((void *)c_A.get_ptr(), (void *)&h_A[0][0], DataW * DataH * sizeof(float));
-  syclcompat::memcpy((void *)c_B.get_ptr(), (void *)&h_B[0][0], DataW * DataH * sizeof(float));
-
-  {
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      auto c_A_acc = c_A.get_access(cgh);
-      auto c_B_acc = c_B.get_access(cgh);
-      auto c_C_acc = c_C.get_access(cgh);
-        cgh.parallel_for(
-          sycl::range<2>(DataW, DataH),
-          [=](sycl::id<2> id) {
-            syclcompat::accessor<float, syclcompat::memory_region::constant, 2> A(c_A_acc);
-            syclcompat::accessor<float, syclcompat::memory_region::constant, 2> B(c_B_acc);
-            syclcompat::accessor<float, syclcompat::memory_region::global, 2> C(c_C_acc);
-            int i = id[0], j = id[1];
-            C[i][j] = A[i][j] + B[i][j];
-          });
-      });
-      syclcompat::get_default_queue().wait_and_throw();
-  }
-  syclcompat::memcpy((void *)&h_C[0][0], (void *)c_C.get_ptr(), DataW * DataH * sizeof(float));
-
-  // verify hostD
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      if (fabs(h_C[i][j] - h_A[i][j] - h_B[i][j]) > 1e-5) {
-        fprintf(stderr, "Result verification failed at element [%d][%d]:\n", i, j);
-        exit(EXIT_FAILURE);
-      }
-    }
-  }
-}
-
-syclcompat::global_memory<float, 2> g_A(DataW, DataH);
-syclcompat::global_memory<float, 2> g_B(DataW, DataH);
-syclcompat::global_memory<float, 2> g_C(DataW, DataH);
-
-void test_global_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  float h_A[DataW][DataH];
-  float h_B[DataW][DataH];
-  float h_C[DataW][DataH];
-
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      h_A[i][j] = 1.0f;
-      h_B[i][j] = 2.0f;
-    }
-  }
-
-  g_A.init();
-  g_B.init();
-  g_C.init();
-
-  syclcompat::memcpy((void *)g_A.get_ptr(), (void *)&h_A[0][0], DataW * DataH * sizeof(float));
-  syclcompat::memcpy((void *)g_B.get_ptr(), (void *)&h_B[0][0], DataW * DataH * sizeof(float));
-
-  {
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      auto g_A_acc = g_A.get_access(cgh);
-      auto g_B_acc = g_B.get_access(cgh);
-      auto g_C_acc = g_C.get_access(cgh);
-        cgh.parallel_for(
-          sycl::range<2>(DataW, DataH),
-          [=](sycl::id<2> id) {
-            syclcompat::accessor<float, syclcompat::memory_region::global, 2> A(g_A_acc);
-            syclcompat::accessor<float, syclcompat::memory_region::global, 2> B(g_B_acc);
-            syclcompat::accessor<float, syclcompat::memory_region::global, 2> C(g_C_acc);
-            int i = id[0], j = id[1];
-            C[i][j] = A[i][j] + B[i][j];
-          });
-      });
-      syclcompat::get_default_queue().wait_and_throw();
-  }
-  syclcompat::memcpy((void *)&h_C[0][0], (void *)g_C.get_ptr(), DataW * DataH * sizeof(float));
-
-  // verify hostD
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      if (fabs(h_C[i][j] - h_A[i][j] - h_B[i][j]) > 1e-5) {
-        fprintf(stderr, "Result verification failed at element [%d][%d]:\n", i, j);
-        exit(EXIT_FAILURE);
-      }
-    }
-  }
-}
-
-syclcompat::shared_memory<float, 1> s_A(DataW);
-syclcompat::shared_memory<float, 1> s_B(DataW);
-syclcompat::shared_memory<float, 1> s_C(DataW);
-
-void test_shared_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  s_A.init();
-  s_B.init();
-  s_C.init();
-
-  for (int i = 0; i < DataW; i++) {
-    s_A[i] = 1.0f;
-    s_B[i] = 2.0f;
-  }
-
-  {
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-        syclcompat::access_wrapper<float *> A_acc(s_A.get_ptr(), cgh);
-        syclcompat::access_wrapper<float *> B_acc(s_B.get_ptr(), cgh);
-        syclcompat::access_wrapper<float *> C_acc(s_C.get_ptr(), cgh);
-        cgh.parallel_for(
-          sycl::range<1>(DataW),
-          [=](sycl::id<1> id) {
-            int i = id[0];
-            float * A = A_acc.get_raw_pointer();
-            float * B = B_acc.get_raw_pointer();
-            float * C = C_acc.get_raw_pointer();
-            C[i] = A[i] + B[i];
-          });
-      });
-      syclcompat::get_default_queue().wait_and_throw();
-  }
-
-  // verify hostD
-  for (int i = 0; i < DataW; i++) {
-    for (int j = 0; j < DataH; j++) {
-      if (fabs(s_C[i] - s_A[i] - s_B[i]) > 1e-5) {
-        fprintf(stderr, "Result verification failed at element [%d][%d]:\n", i, j);
-        exit(EXIT_FAILURE);
-      }
-    }
-  }
-}
-
-void test_local_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 16;
-
-  float *h_A = (float*)malloc(Num * Num * sizeof(float));
-  float *h_B = (float*)malloc(Num * Num * sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    for(int j = 0; j < Num; j++) {
-      h_A[i * Num + j] = 2.0f;
-    }
-  }
-
-  float *d_A;
-  d_A = (float *)syclcompat::malloc(Num * Num * sizeof(float));
-
-  {
-    auto buffer_A = syclcompat::get_buffer<float>(d_A);
-
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      sycl::range<2> acc_range(Num, Num);
-      sycl::local_accessor<float, 2> C_local_acc(acc_range, cgh);
-      auto A = buffer_A.get_access<sycl::access::mode::read_write>(cgh);
-
-        cgh.parallel_for(
-          sycl::nd_range<2>(sycl::range<2>(Num, Num), sycl::range<2>(Num, Num)),
-          [=](sycl::nd_item<2> id) {
-            syclcompat::accessor<float, syclcompat::memory_region::local, 2> C_local(C_local_acc, acc_range);
-            int i = id.get_local_id(0), j = id.get_local_id(1);
-            C_local[i][j] = 1;
-            A[i * Num + j] = C_local[i][j] * 2;
-          });
-      });
-      syclcompat::get_default_queue().wait_and_throw();
-  }
-
-  syclcompat::memcpy((void*) (h_B), (void*) d_A, Num * Num * sizeof(float));
-  syclcompat::free((void*)d_A);
-
-  // verify
-  for(int i = 0; i < Num * Num; i++){
-      if (fabs(h_A[i] - h_B[i]) > 1e-5) {
-        fprintf(stderr,"Check %d: Elements are A = %f, B = %f\n", i, h_A[i],  h_B[i]);
-        fprintf(stderr,"Result verification failed at element %d:\n", i);
-        exit(EXIT_FAILURE);
-      }
-  }
-
-  free(h_A);
-  free(h_B);
-}
-
-void test_mempcy_pitched(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 6;
-  size_t height = 8;
-  float *h_data;
-  float *h_ref;
-  size_t h_pitch = sizeof(float) * width;
-  h_data = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_ref[i] = (float)i;
-
-  // alloc device memory.
-  size_t d_pitch;
-  float *d_data;
-  d_data = (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height, q);
-
-  // copy to Device.
-  syclcompat::memcpy(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width, height, q);
-
-  // copy back to host.
-  syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height, q);
-
-  check(h_data, h_ref, width, height, 1);
-
-  // memset device data.
-  syclcompat::memset(d_data, d_pitch, 0x1, sizeof(float) * width, height, q);
-
-  // copy back to host
-  syclcompat::memcpy(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height, q);
-
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * sizeof(float));
-  check(h_data, h_ref, width, height, 1);
-
-  free(h_data);
-  free(h_ref);
-  syclcompat::free((void *)d_data, q);
-
-  printf("test_mempcy_pitched passed!\n");
-}
-
-int main() {
-  test_mempcy_pitched();
-  test_memcpy_reinterp_kernel();
-  test_memcpy_kernel();
-  test_access_wrapper();
-  test_constant_memory();
-  test_global_memory();
-  test_shared_memory();
-  test_local_memory();
-
-  sycl::queue q;
-  test_mempcy_pitched(q);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp
deleted file mode 100644
index 75f49298c4926..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp
+++ /dev/null
@@ -1,711 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_management_test3.cpp
- *
- *  Description:
- *    memory operations tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ memory_management_test3.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-//
-// UNSUPPORTED: linux && level_zero_v2_adapter
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/19662
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "memory_common.hpp"
-#include "memory_fixt.hpp"
-
-void test_free_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  float *d_A = (float *)syclcompat::malloc(sizeof(float));
-
-  syclcompat::free(d_A);
-
-  syclcompat::free(0);
-  syclcompat::free(NULL);
-  syclcompat::free(nullptr);
-}
-
-void test_free_memory_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  float *d_A = (float *)syclcompat::malloc(sizeof(float), q);
-  syclcompat::free((void *)d_A, q);
-
-  syclcompat::free(0, q);
-  syclcompat::free(NULL, q);
-  syclcompat::free(nullptr, q);
-}
-
-void test_wait_and_free_memory() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  float *d_A = (float *)syclcompat::malloc(sizeof(float));
-  syclcompat::wait_and_free((void *)d_A);
-
-  syclcompat::wait_and_free(0);
-  syclcompat::wait_and_free(NULL);
-  syclcompat::wait_and_free(nullptr);
-}
-
-void test_wait_and_free_memory_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  float *d_A = (float *)syclcompat::malloc(sizeof(float), q);
-  syclcompat::wait_and_free((void *)d_A, q);
-
-  syclcompat::wait_and_free(0, q);
-  syclcompat::wait_and_free(NULL, q);
-  syclcompat::wait_and_free(nullptr, q);
-}
-
-void test_memcpy_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int N1 = 1000;
-  float *h_A = (float *)malloc(Num * sizeof(float));
-  float *h_B = (float *)malloc(Num * sizeof(float));
-  float *h_C = (float *)malloc(Num * sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  float *d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-
-  syclcompat::memcpy_async((void *)d_A, (void *)h_A, N1 * sizeof(float));
-  syclcompat::memcpy_async((void *)(d_A + N1), (void *)h_B,
-                           (Num - N1) * sizeof(float));
-  syclcompat::memcpy_async((void *)h_C, (void *)d_A, Num * sizeof(float));
-
-  syclcompat::wait();
-
-  syclcompat::free((void *)d_A);
-
-  // verify
-  for (int i = 0; i < N1; i++) {
-    assert(fabs(h_A[i] - h_C[i]) <= 1e-5);
-  }
-
-  for (int i = N1; i < Num; i++) {
-    assert(fabs(h_B[i] - h_C[i]) <= 1e-5);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_memcpy_async_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  int Num = 5000;
-  int N1 = 1000;
-  float *h_A = (float *)malloc(Num * sizeof(float));
-  float *h_B = (float *)malloc(Num * sizeof(float));
-  float *h_C = (float *)malloc(Num * sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  float *d_A = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  syclcompat::memcpy_async((void *)d_A, (void *)h_A, N1 * sizeof(float), q);
-  syclcompat::memcpy_async((void *)(d_A + N1), (void *)h_B,
-                           (Num - N1) * sizeof(float), q);
-  syclcompat::memcpy_async((void *)h_C, (void *)d_A, Num * sizeof(float), q);
-  q.wait_and_throw();
-  syclcompat::free((void *)d_A, q);
-
-  // verify
-  for (int i = 0; i < N1; i++) {
-    assert(fabs(h_A[i] - h_C[i]) <= 1e-5);
-  }
-
-  for (int i = N1; i < Num; i++) {
-    assert(fabs(h_B[i] - h_C[i]) <= 1e-5);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_memcpy_async_pitched() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  size_t width = 6;
-  size_t height = 8;
-  float *h_data = nullptr;
-  float *h_ref = nullptr;
-  size_t h_pitch = sizeof(float) * width;
-  h_data = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_ref[i] = (float)i;
-
-  // alloc device memory.
-  size_t d_pitch;
-  float *d_data =
-      (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height);
-
-  // copy to Device.
-  syclcompat::memcpy_async(d_data, d_pitch, h_data, h_pitch,
-                           sizeof(float) * width, height);
-
-  // copy back to host.
-  syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch,
-                           sizeof(float) * width, height);
-
-  syclcompat::get_default_queue().wait_and_throw();
-  check(h_data, h_ref, width * height);
-
-  // memset device data.
-  syclcompat::memset_async(d_data, d_pitch, 0x1, sizeof(float) * width, height);
-
-  // copy back to host
-  syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch,
-                           sizeof(float) * width, height);
-  syclcompat::get_default_queue().wait_and_throw();
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * sizeof(float));
-  check(h_data, h_ref, width * height);
-
-  free(h_data);
-  free(h_ref);
-  syclcompat::free((void *)d_data);
-}
-
-void test_memcpy_async_pitched_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  size_t width = 6;
-  size_t height = 8;
-  float *h_data = nullptr;
-  float *h_ref = nullptr;
-  size_t h_pitch = sizeof(float) * width;
-  h_data = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_ref[i] = (float)i;
-
-  // alloc device memory.
-  size_t d_pitch;
-  float *d_data =
-      (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height, q);
-
-  // copy to Device.
-  syclcompat::memcpy_async(d_data, d_pitch, h_data, h_pitch,
-                           sizeof(float) * width, height, q);
-
-  // copy back to host.
-  syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch,
-                           sizeof(float) * width, height, q);
-  q.wait_and_throw();
-  check(h_data, h_ref, width * height);
-
-  // memset device data.
-  syclcompat::memset_async(d_data, d_pitch, 0x1, sizeof(float) * width, height,
-                           q);
-
-  // copy back to host
-  syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch,
-                           sizeof(float) * width, height, q);
-  q.wait_and_throw();
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * sizeof(float));
-  check(h_data, h_ref, width * height);
-
-  free(h_data);
-  free(h_ref);
-  syclcompat::free((void *)d_data, q);
-}
-
-template <size_t memset_size_bits = 8> void test_memset_async_impl() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  // ValueT -> int for memset and memset_d32, short for memset_d16.
-  using ValueT = std::conditional_t<
-      memset_size_bits == 8 || memset_size_bits == 32, int,
-      std::conditional_t<memset_size_bits == 16, short, void>>;
-  static_assert(!std::is_void_v<ValueT>,
-                "memset tests only work for 8, 16 and 32 bits");
-
-  int Num = 10;
-  ValueT *h_A = (ValueT *)malloc(Num * sizeof(ValueT));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 4;
-  }
-
-  ValueT *d_A = (ValueT *)syclcompat::malloc(Num * sizeof(ValueT));
-  // hostA -> deviceA
-  syclcompat::memcpy_async((void *)d_A, (void *)h_A, Num * sizeof(ValueT));
-
-  // set d_A[0,..., 6] = 0
-  if constexpr (memset_size_bits == 8)
-    syclcompat::memset_async((void *)d_A, 0, (Num - 3) * sizeof(ValueT));
-  else if constexpr (memset_size_bits == 16)
-    syclcompat::memset_d16_async((void *)d_A, 0, (Num - 3));
-  else if constexpr (memset_size_bits == 32)
-    syclcompat::memset_d32_async((void *)d_A, 0, (Num - 3));
-
-  // deviceA -> hostA
-  syclcompat::memcpy_async((void *)h_A, (void *)d_A, Num * sizeof(ValueT));
-
-  syclcompat::get_default_queue().wait_and_throw();
-
-  syclcompat::free((void *)d_A);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    assert(h_A[i] == 0);
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    assert(h_A[i] == 4);
-  }
-
-  free(h_A);
-}
-
-template <size_t bits = 8> void test_memset_async_q_impl() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  // int for memset and memset_d32, short for memset_d16.
-  using ValueT =
-      std::conditional_t<bits == 8 || bits == 32, int,
-                         std::conditional_t<bits == 16, short, void>>;
-  static_assert(!std::is_void_v<ValueT>,
-                "memset tests only work for 8, 16 and 32 bits");
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  int Num = 10;
-  ValueT *h_A = (ValueT *)malloc(Num * sizeof(ValueT));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 4;
-  }
-
-  ValueT *d_A = (ValueT *)syclcompat::malloc(Num * sizeof(ValueT), q);
-  // hostA -> deviceA
-  syclcompat::memcpy_async((void *)d_A, (void *)h_A, Num * sizeof(ValueT), q);
-
-  // set d_A[0,..., 6] = 0
-  if constexpr (bits == 8)
-    syclcompat::memset_async((void *)d_A, 0, (Num - 3) * sizeof(ValueT), q);
-  else if constexpr (bits == 16)
-    syclcompat::memset_d16_async((void *)d_A, 0, (Num - 3), q);
-  else if constexpr (bits == 32)
-    syclcompat::memset_d32_async((void *)d_A, 0, (Num - 3), q);
-
-  // deviceA -> hostA
-  syclcompat::memcpy_async((void *)h_A, (void *)d_A, Num * sizeof(ValueT), q);
-  q.wait_and_throw();
-  syclcompat::free((void *)d_A, q);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    assert(h_A[i] == 0);
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    assert(h_A[i] == 4);
-  }
-
-  free(h_A);
-}
-
-void test_memset_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 8;
-  test_memset_async_impl<memset_size_in_bits>();
-}
-
-void test_memset_d16_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 16;
-  test_memset_async_impl<memset_size_in_bits>();
-}
-
-void test_memset_d32_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 32;
-  test_memset_async_impl<memset_size_in_bits>();
-}
-
-void test_memset_async_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 8;
-  test_memset_async_q_impl<memset_size_in_bits>();
-}
-
-void test_memset_d16_async_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 16;
-  test_memset_async_q_impl<memset_size_in_bits>();
-}
-
-void test_memset_d32_async_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  constexpr size_t memset_size_in_bits = 32;
-  test_memset_async_q_impl<memset_size_in_bits>();
-}
-
-template <typename T> void test_memcpy_async_t_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  int Num = 5000;
-  int N1 = 1000;
-  T *h_A = (T *)malloc(Num * sizeof(T));
-  T *h_B = (T *)malloc(Num * sizeof(T));
-  T *h_C = (T *)malloc(Num * sizeof(T));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = static_cast<T>(1);
-    h_B[i] = static_cast<T>(2);
-  }
-
-  T *d_A = nullptr;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = syclcompat::malloc<T>(Num, q);
-  syclcompat::memcpy_async<T>(d_A, h_A, N1, q);
-  syclcompat::memcpy_async<T>((d_A + N1), h_B, (Num - N1), q);
-  syclcompat::memcpy_async<T>(h_C, d_A, Num, q);
-  q.wait_and_throw();
-  syclcompat::free((void *)d_A, q);
-
-  // verify
-  for (int i = 0; i < N1; i++) {
-    assert(h_A[i] == h_C[i]);
-  }
-
-  for (int i = N1; i < Num; i++) {
-    assert(h_B[i] == h_C[i]);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-template <typename T> void test_memcpy_async_t() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int N1 = 1000;
-  T *h_A = (T *)malloc(Num * sizeof(T));
-  T *h_B = (T *)malloc(Num * sizeof(T));
-  T *h_C = (T *)malloc(Num * sizeof(T));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = static_cast<T>(1);
-    h_B[i] = static_cast<T>(2);
-  }
-
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  T *d_A = syclcompat::malloc<T>(Num);
-  syclcompat::memcpy_async<T>(d_A, h_A, N1);
-  syclcompat::memcpy_async<T>((d_A + N1), h_B, (Num - N1));
-  syclcompat::memcpy_async<T>(h_C, d_A, Num);
-
-  syclcompat::wait();
-
-  syclcompat::free((void *)d_A);
-
-  // verify
-  for (int i = 0; i < N1; i++) {
-    assert(h_A[i] == h_C[i]);
-  }
-
-  for (int i = N1; i < Num; i++) {
-    assert(h_B[i] == h_C[i]);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-template <typename T> void test_fill_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  bool skip = should_skip<T>(syclcompat::get_current_device());
-  if (skip) // Unsupported aspect
-    return;
-
-  constexpr int Num = 10;
-  T *h_A = (T *)malloc(Num * sizeof(T));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = static_cast<T>(4);
-  }
-
-  T *d_A = nullptr;
-
-  d_A = syclcompat::malloc<T>(Num);
-  // hostA -> deviceA
-  syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(T));
-
-  // set d_A[0,..., 6] = 0
-  syclcompat::fill_async((void *)d_A, static_cast<T>(0), (Num - 3));
-
-  // deviceA -> hostA
-  syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(T));
-
-  syclcompat::get_default_queue().wait_and_throw();
-
-  syclcompat::free((void *)d_A);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    assert(h_A[i] == static_cast<T>(0));
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    assert(h_A[i] == static_cast<T>(4));
-  }
-
-  free(h_A);
-}
-
-template <typename T> void test_fill_async_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  bool skip = should_skip<T>(syclcompat::get_current_device());
-  if (skip) // Unsupported aspect
-    return;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-  constexpr int Num = 10;
-  T *h_A = (T *)malloc(Num * sizeof(T));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = static_cast<T>(4);
-  }
-
-  T *d_A = nullptr;
-
-  d_A = syclcompat::malloc<T>(Num, q);
-  // hostA -> deviceA
-  syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(T), q);
-
-  // set d_A[0,..., 6] = 0
-  syclcompat::fill_async((void *)d_A, static_cast<T>(0), (Num - 3), q);
-
-  // deviceA -> hostA
-  syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(T), q);
-
-  q.wait_and_throw();
-
-  syclcompat::free((void *)d_A, q);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    assert(h_A[i] == static_cast<T>(0));
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    assert(h_A[i] == static_cast<T>(4));
-  }
-
-  free(h_A);
-}
-
-void test_constant_memcpy_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr size_t size = 2000;
-  constexpr size_t offset = 1000;
-
-  syclcompat::constant_memory<float, 1> d_A(size);
-  syclcompat::constant_memory<float, 1> d_B(size);
-
-  float *h_A = (float *)malloc(size / 2 * sizeof(float));
-  float *h_B = (float *)malloc(size / 2 * sizeof(float));
-  float *h_C = (float *)malloc(size * sizeof(float));
-  float *h_D = (float *)malloc(size * sizeof(float));
-
-  for (int i = 0; i < size / 2; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..999] -> deviceA[1000..1999]
-  // deviceA[0..1999] -> hostC[0..1999]
-  // deviceA[0..999] -> deviceB[0..999]
-  // deviceA[1000..1999] -> deviceB[1000..1999]
-  // deviceB[0..1999] -> hostD[0..1999]
-
-  syclcompat::memcpy_async(d_A.get_ptr(), h_A, offset * sizeof(float));
-  syclcompat::memcpy_async((char *)d_A.get_ptr() + offset * sizeof(float), h_B,
-                           (size - offset) * sizeof(float));
-  syclcompat::memcpy_async(h_C, d_A.get_ptr(), size * sizeof(float));
-  syclcompat::memcpy_async(d_B.get_ptr(), d_A.get_ptr(),
-                           offset * sizeof(float));
-  syclcompat::memcpy_async((char *)d_A.get_ptr() + offset * sizeof(float), h_B,
-                           (size - offset) * sizeof(float));
-  syclcompat::memcpy_async((void *)h_C, (void *)d_A.get_ptr(),
-                           size * sizeof(float));
-  syclcompat::memcpy_async((void *)d_B.get_ptr(), (void *)d_A.get_ptr(),
-                           offset * sizeof(float));
-  syclcompat::memcpy_async(
-      (char *)d_B.get_ptr() + offset * sizeof(float),
-      (void *)((size_t)d_A.get_ptr() + offset * sizeof(float)),
-      (size - offset) * sizeof(float));
-  syclcompat::memcpy_async(h_D, d_B.get_ptr(), size * sizeof(float));
-  syclcompat::get_default_queue().wait_and_throw();
-
-  // verify hostD
-  for (int i = 0; i < offset; i++) {
-    assert(fabs(h_A[i] - h_D[i]) <= 1e-5);
-  }
-
-  for (int i = offset; i < size; i++) {
-    assert(fabs(h_B[i - offset] - h_D[i]) <= 1e-5);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-  free(h_D);
-}
-
-void test_constant_memcpy_async_q() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q{{sycl::property::queue::in_order()}};
-
-  constexpr size_t size = 2000;
-  constexpr size_t offset = 1000;
-  syclcompat::constant_memory<float, 1> d_A(size, q);
-  syclcompat::constant_memory<float, 1> d_B(size, q);
-
-  float *h_A = (float *)malloc(size / 2 * sizeof(float));
-  float *h_B = (float *)malloc(size / 2 * sizeof(float));
-  float *h_C = (float *)malloc(size * sizeof(float));
-  float *h_D = (float *)malloc(size * sizeof(float));
-
-  for (int i = 0; i < size / 2; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..999] -> deviceA[1000..1999]
-  // deviceA[0..1999] -> hostC[0..1999]
-  // deviceA[0..999] -> deviceB[0..999]
-  // deviceA[1000..1999] -> deviceB[1000..1999]
-  // deviceB[0..1999] -> hostD[0..1999]
-
-  syclcompat::memcpy_async(d_A.get_ptr(), h_A, offset * sizeof(float), q);
-
-  syclcompat::memcpy_async((char *)d_A.get_ptr() + offset * sizeof(float), h_B,
-                           (size - offset) * sizeof(float), q);
-  syclcompat::memcpy_async(h_C, d_A.get_ptr(), size * sizeof(float), q);
-
-  syclcompat::memcpy_async(d_B.get_ptr(), d_A.get_ptr(), offset * sizeof(float),
-                           q);
-
-  syclcompat::memcpy_async(
-      (char *)d_B.get_ptr() + offset * sizeof(float),
-      (void *)((size_t)d_A.get_ptr() + offset * sizeof(float)),
-      (size - offset) * sizeof(float), q);
-
-  syclcompat::memcpy_async(h_D, d_B.get_ptr(), size * sizeof(float), q);
-  q.wait_and_throw();
-
-  // verify hostD
-  for (int i = 0; i < offset; i++) {
-    assert(fabs(h_A[i] - h_D[i]) <= 1e-5);
-  }
-
-  for (int i = offset; i < size; i++) {
-    assert(fabs(h_B[i - offset] - h_D[i]) <= 1e-5);
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-  free(h_D);
-}
-
-int main() {
-  test_free_memory();
-  test_free_memory_q();
-  test_wait_and_free_memory();
-  test_wait_and_free_memory_q();
-  test_memcpy_async();
-  test_memcpy_async_q();
-  test_memcpy_async_pitched();
-  test_memcpy_async_pitched_q();
-  test_memset_async();
-  test_memset_async_q();
-  test_memset_d16_async();
-  test_memset_d16_async_q();
-  test_memset_d32_async();
-  test_memset_d32_async_q();
-  test_constant_memcpy_async();
-  test_constant_memcpy_async_q();
-
-  INSTANTIATE_ALL_TYPES(value_type_list, test_memcpy_async_t);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_memcpy_async_t_q);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_fill_async);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_fill_async_q);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test3_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test3_usmnone.cpp
deleted file mode 100644
index 63dbddf797a9b..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/memory_management_test3_usmnone.cpp
+++ /dev/null
@@ -1,601 +0,0 @@
-// ====------ memory_management_test3_usmnone.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-//
-// UNSUPPORTED: linux && level_zero_v2_adapter
-// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/19662
-
-#define SYCLCOMPAT_USM_LEVEL_NONE
-#include <sycl/detail/core.hpp>
-#include <syclcompat/memory.hpp>
-#include "memory_common.hpp"
-
-void check(float *h_data, float *h_ref, size_t width, size_t height,
-           size_t depth) {
-  for (int i = 0; i < width * height * depth; i++) {
-    float diff = fabs(h_data[i] - h_ref[i]);
-    if (diff > 1.e-6) {
-      printf("Verification failed!");
-      printf("h_data[%d]=%f, h_ref[%d]=%f, diff=%f\n", i, h_data[i], i,
-             h_ref[i], diff);
-      exit(-1);
-    }
-  }
-}
-
-void test_mempcy_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int N1 = 1000;
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-  syclcompat::memcpy_async((void*) d_A, (void*) h_A, N1 * sizeof(float));
-  syclcompat::memcpy_async((void*) (d_A + N1), (void*) h_B, (Num-N1) * sizeof(float));
-  syclcompat::memcpy_async((void*) h_C, (void*) d_A, Num * sizeof(float));
-
-  syclcompat::wait_and_free((void*)d_A);
-
-  syclcompat::free(0);
-  syclcompat::free(NULL);
-  syclcompat::free(nullptr);
-
-  // verify
-  for(int i = 0; i < N1; i++){
-      if (fabs(h_A[i] - h_C[i]) > 1e-5) {
-          fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i],  h_B[i],  h_C[i]);
-          fprintf(stderr,"Result verification failed at element %d:\n", i);
-          exit(EXIT_FAILURE);
-      }
-  }
-
-  for(int i = N1; i < Num; i++){
-      if (fabs(h_B[i] - h_C[i]) > 1e-5) {
-          fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i],  h_B[i],  h_C[i]);
-          fprintf(stderr,"Result verification failed at element %d:\n", i);
-          exit(EXIT_FAILURE);
-      }
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_buffer_and_offset_kernel() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0,
-                  // CORC-6222 has fixed this issue, but the version of dpcpp used in ics
-                  // environment has not cover this patch. After it has this patch,
-                  // Offest could be set to 100, and current test case will pass.
-
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  //syclcompat::dev_mgr::instance().select_device(0);
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A, *d_B, *d_C;
-  // hostA -> deviceA
-  // hostB -> deviceB
-  // kernel: deviceC = deviceA + deviceB
-  // deviceA -> hostC
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_B = (float *)syclcompat::malloc(Num * sizeof(float));
-  d_C = (float *)syclcompat::malloc(Num * sizeof(float));
-  syclcompat::memcpy_async((void*) d_A, (void*) h_A, Num * sizeof(float));
-  syclcompat::memcpy_async((void*) d_B, (void*) h_B, Num * sizeof(float));
-
-  syclcompat::get_default_queue().wait_and_throw();
-
-  d_A += Offset;
-  d_B += Offset;
-  d_C += Offset;
-
-  {
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_A = syclcompat::get_buffer_and_offset(d_A);
-    size_t offset_A = buffer_and_offset_A.second;
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_B = syclcompat::get_buffer_and_offset(d_B);
-    size_t offset_B = buffer_and_offset_A.second;
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_C = syclcompat::get_buffer_and_offset(d_C);
-    size_t offset_C = buffer_and_offset_A.second;
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      auto d_A_acc = buffer_and_offset_A.first.get_access<sycl::access::mode::read_write>(cgh);
-      auto d_B_acc = buffer_and_offset_B.first.get_access<sycl::access::mode::read_write>(cgh);
-      auto d_C_acc = buffer_and_offset_C.first.get_access<sycl::access::mode::read_write>(cgh);
-
-        cgh.parallel_for<class vectorAdd3_1>(
-          sycl::range<1>(Num-Offset),
-          [=](sycl::id<1> id) {
-
-            float *A = (float*)(&d_A_acc[0]+offset_A);
-            float *B = (float*)(&d_B_acc[0]+offset_B);
-            float *C = (float*)(&d_C_acc[0]+offset_C);
-             int i = id[0];
-
-            C[i] = A[i] + B[i];
-          });
-      });
-  }
-  syclcompat::memcpy_async((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float));
-
-  syclcompat::get_default_queue().wait_and_throw();
-
-  syclcompat::free((void*)d_A);
-  syclcompat::free((void*)d_B);
-  syclcompat::free((void*)d_C);
-
-  // verify
-  for(int i = Offset; i < Num; i++){
-      if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) {
-        fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i],  h_B[i],  h_C[i]);
-        fprintf(stderr,"Result verification failed at element %d:\n", i);
-        exit(EXIT_FAILURE);
-      }
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-
-void test_memset_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 10;
-  int *h_A = (int*)malloc(Num*sizeof(int));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 4;
-  }
-
-  int *d_A;
-
-  d_A = (int *)syclcompat::malloc(Num * sizeof(int));
-  // hostA -> deviceA
-  syclcompat::memcpy_async((void*) d_A, (void*) h_A, Num * sizeof(int));
-
-  // set d_A[0,..., 6] = 0
-  syclcompat::memset_async((void*) d_A, 0, (Num - 3) * sizeof(int));
-
-  // deviceA -> hostA
-  syclcompat::memcpy_async((void*) h_A, (void*) d_A, Num * sizeof(int));
-
-  syclcompat::get_default_queue().wait_and_throw();
-
-  syclcompat::free((void*)d_A);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    if (h_A[i] != 0) {
-      fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]);
-      fprintf(stderr, "Result verification failed at element [%d]!\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    if (h_A[i] != 4) {
-      fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]);
-      fprintf(stderr, "Result verification failed at element h_A[%d]!\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  free(h_A);
-}
-
-const unsigned int Num = 5000;
-const unsigned int N1 = 1000;
-syclcompat::constant_memory<float, 1> d_A(Num * sizeof(float));
-syclcompat::constant_memory<float, 1> d_B(Num * sizeof(float));
-
-void test_memcpy_async_getptr() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  float h_A[Num];
-  float h_B[Num];
-  float h_C[Num];
-  float h_D[Num];
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> deviceB[0..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  // deviceB[0..4999] -> hostD[0..4999]
-  syclcompat::memcpy_async((void *)d_A.get_ptr(), (void *)&h_A[0], N1 * sizeof(float));
-  syclcompat::memcpy_async((char *)d_A.get_ptr() + N1 * sizeof(float), (void*) h_B, (Num-N1) * sizeof(float));
-  syclcompat::memcpy_async((void *)h_C, (void *)d_A.get_ptr(), Num * sizeof(float));
-  syclcompat::memcpy_async((void *)d_B.get_ptr(), (void *)d_A.get_ptr(), N1 * sizeof(float));
-  syclcompat::memcpy_async((char *)d_B.get_ptr() + N1 * sizeof(float), (void *)((size_t)d_A.get_ptr() + N1* sizeof(float)), (Num - N1) * sizeof(float));
-  syclcompat::memcpy_async((void *)h_D, (void *)d_B.get_ptr(), Num * sizeof(float));
-
-  syclcompat::get_default_queue().wait_and_throw();
-  // verify hostD
-  for (int i = 0; i < N1; i++) {
-    if (fabs(h_A[i] - h_D[i]) > 1e-5) {
-      fprintf(stderr, "Check: Elements are A = %f, D = %f:\n", h_A[i], h_D[i]);
-      fprintf(stderr, "Result verification failed at element %d:\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  for (int i = N1; i < Num; i++) {
-    if (fabs(h_B[i] - h_D[i]) > 1e-5) {
-      fprintf(stderr, "Check: Elements are B = %f, D = %f:\n",   h_B[i], h_D[i]);
-      fprintf(stderr, "Result verification failed at element %d:\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-}
-
-void test_memcpy_pitched_async() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 6;
-  size_t height = 8;
-  float *h_data;
-  float *h_ref;
-  size_t h_pitch = sizeof(float) * width;
-  h_data = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_ref[i] = (float)i;
-
-  // alloc device memory.
-  size_t d_pitch;
-  float *d_data;
-  d_data = (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height);
-
-  // copy to Device.
-  syclcompat::memcpy_async(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width, height);
-
-  // copy back to host.
-  syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height);
-
-  syclcompat::get_default_queue().wait_and_throw();
-  check(h_data, h_ref, width, height, 1);
-
-  // memset device data.
-  syclcompat::memset_async(d_data, d_pitch, 0x1, sizeof(float) * width, height);
-
-  // copy back to host
-  syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height);
-  syclcompat::get_default_queue().wait_and_throw();
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * sizeof(float));
-  check(h_data, h_ref, width, height, 1);
-
-  free(h_data);
-  free(h_ref);
-  syclcompat::free((void *)d_data);
-}
-
-void test_mempcy_async(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 5000;
-  int N1 = 1000;
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A;
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  syclcompat::memcpy_async((void*) d_A, (void*) h_A, N1 * sizeof(float), q);
-  syclcompat::memcpy_async((void*) (d_A + N1), (void*) h_B, (Num-N1) * sizeof(float), q);
-  syclcompat::memcpy_async((void*) h_C, (void*) d_A, Num * sizeof(float), q);
-  q.wait_and_throw();
-  syclcompat::free((void*)d_A, q);
-
-  syclcompat::free(0, q);
-  syclcompat::free(NULL, q);
-  syclcompat::free(nullptr, q);
-
-  // verify
-  for(int i = 0; i < N1; i++){
-      if (fabs(h_A[i] - h_C[i]) > 1e-5) {
-          fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i],  h_B[i],  h_C[i]);
-          fprintf(stderr,"Result verification failed at element %d:\n", i);
-          exit(EXIT_FAILURE);
-      }
-  }
-
-  for(int i = N1; i < Num; i++){
-      if (fabs(h_B[i] - h_C[i]) > 1e-5) {
-          fprintf(stderr,"Check: Elements are A = %f, B = %f, C = %f:\n", h_A[i],  h_B[i],  h_C[i]);
-          fprintf(stderr,"Result verification failed at element %d:\n", i);
-          exit(EXIT_FAILURE);
-      }
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-void test_buffer_and_offset_kernel(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  int Num = 5000;
-  int Offset = 0; // Current dpcpp version in ics environment has bugs with Offset > 0,
-                  // CORC-6222 has fixed this issue, but the version of dpcpp used in ics
-                  // environment has not cover this patch. After it has this patch,
-                  // Offest could be set to 100, and current test case will pass.
-
-  float *h_A = (float*)malloc(Num*sizeof(float));
-  float *h_B = (float*)malloc(Num*sizeof(float));
-  float *h_C = (float*)malloc(Num*sizeof(float));
-
-  //syclcompat::dev_mgr::instance().select_device(0);
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  float *d_A, *d_B, *d_C;
-  // hostA -> deviceA
-  // hostB -> deviceB
-  // kernel: deviceC = deviceA + deviceB
-  // deviceA -> hostC
-  d_A = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  d_B = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  d_C = (float *)syclcompat::malloc(Num * sizeof(float), q);
-  syclcompat::memcpy_async((void*) d_A, (void*) h_A, Num * sizeof(float), q);
-  syclcompat::memcpy_async((void*) d_B, (void*) h_B, Num * sizeof(float), q);
-  q.wait_and_throw();
-  d_A += Offset;
-  d_B += Offset;
-  d_C += Offset;
-
-  {
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_A = syclcompat::get_buffer_and_offset(d_A);
-    size_t offset_A = buffer_and_offset_A.second;
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_B = syclcompat::get_buffer_and_offset(d_B);
-    size_t offset_B = buffer_and_offset_A.second;
-    std::pair<syclcompat::buffer_t, size_t> buffer_and_offset_C = syclcompat::get_buffer_and_offset(d_C);
-    size_t offset_C = buffer_and_offset_A.second;
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-      auto d_A_acc = buffer_and_offset_A.first.get_access<sycl::access::mode::read_write>(cgh);
-      auto d_B_acc = buffer_and_offset_B.first.get_access<sycl::access::mode::read_write>(cgh);
-      auto d_C_acc = buffer_and_offset_C.first.get_access<sycl::access::mode::read_write>(cgh);
-
-        cgh.parallel_for<class vectorAdd3_2>(
-          sycl::range<1>(Num-Offset),
-          [=](sycl::id<1> id) {
-
-            float *A = (float*)(&d_A_acc[0]+offset_A);
-            float *B = (float*)(&d_B_acc[0]+offset_B);
-            float *C = (float*)(&d_C_acc[0]+offset_C);
-             int i = id[0];
-
-            C[i] = A[i] + B[i];
-          });
-      });
-  }
-  syclcompat::memcpy_async((void*) (h_C+Offset), (void*) d_C, (Num-Offset) * sizeof(float), q);
-  q.wait_and_throw();
-  syclcompat::free((void*)d_A, q);
-  syclcompat::free((void*)d_B, q);
-  syclcompat::free((void*)d_C, q);
-
-  // verify
-  for(int i = Offset; i < Num; i++){
-      if (fabs(h_C[i] - h_A[i] - h_B[i]) > 1e-5) {
-        fprintf(stderr,"Check %d: Elements are A = %f, B = %f, C = %f:\n", i,h_A[i],  h_B[i],  h_C[i]);
-        fprintf(stderr,"Result verification failed at element %d:\n", i);
-        exit(EXIT_FAILURE);
-      }
-  }
-
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
-
-void test_memset_async(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int Num = 10;
-  int *h_A = (int*)malloc(Num*sizeof(int));
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 4;
-  }
-
-  int *d_A;
-
-  d_A = (int *)syclcompat::malloc(Num * sizeof(int), q);
-  // hostA -> deviceA
-  syclcompat::memcpy_async((void*) d_A, (void*) h_A, Num * sizeof(int), q);
-
-  // set d_A[0,..., 6] = 0
-  syclcompat::memset_async((void*) d_A, 0, (Num - 3) * sizeof(int), q);
-
-  // deviceA -> hostA
-  syclcompat::memcpy_async((void*) h_A, (void*) d_A, Num * sizeof(int), q);
-  syclcompat::wait_and_free((void*)d_A, q);
-
-  // check d_A[0,..., 6] = 0
-  for (int i = 0; i < Num - 3; i++) {
-    if (h_A[i] != 0) {
-      fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]);
-      fprintf(stderr, "Result verification failed at element [%d]!\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  // check d_A[7,..., 9] = 4
-  for (int i = Num - 3; i < Num; i++) {
-    if (h_A[i] != 4) {
-      fprintf(stderr, "Check: h_A[%d] is %d:\n", i, h_A[i]);
-      fprintf(stderr, "Result verification failed at element h_A[%d]!\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-  free(h_A);
-}
-
-void test_memcpy_async_getptr(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  const unsigned int Num = 5000;
-  const unsigned int N1 = 1000;
-  syclcompat::constant_memory<float, 1> d_A(Num * sizeof(float));
-  syclcompat::constant_memory<float, 1> d_B(Num * sizeof(float));
-
-  float h_A[Num];
-  float h_B[Num];
-  float h_C[Num];
-  float h_D[Num];
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-
-  for (int i = 0; i < Num; i++) {
-    h_A[i] = 1.0f;
-    h_B[i] = 2.0f;
-  }
-  // hostA[0..999] -> deviceA[0..999]
-  // hostB[0..3999] -> deviceA[1000..4999]
-  // deviceA[0..4999] -> deviceB[0..4999]
-  // deviceA[0..4999] -> hostC[0..4999]
-  // deviceB[0..4999] -> hostD[0..4999]
-  syclcompat::memcpy_async((void *)d_A.get_ptr(), (void *)&h_A[0], N1 * sizeof(float), q);
-  syclcompat::memcpy_async((char *)d_A.get_ptr() + N1 * sizeof(float), (void*) h_B, (Num-N1) * sizeof(float), q);
-  syclcompat::memcpy_async((void *)h_C, (void *)d_A.get_ptr(), Num * sizeof(float), q);
-  syclcompat::memcpy_async((void *)d_B.get_ptr(), (void *)d_A.get_ptr(), N1 * sizeof(float), q);
-  syclcompat::memcpy_async((char *)d_B.get_ptr() + N1 * sizeof(float), (void *)((size_t)d_A.get_ptr() + N1* sizeof(float)), (Num - N1) * sizeof(float), q);
-  syclcompat::memcpy_async((void *)h_D, (void *)d_B.get_ptr(), Num * sizeof(float), q);
-  q.wait_and_throw();
-  // verify hostD
-  for (int i = 0; i < N1; i++) {
-    if (fabs(h_A[i] - h_D[i]) > 1e-5) {
-      fprintf(stderr, "Check: Elements are A = %f, D = %f:\n", h_A[i], h_D[i]);
-      fprintf(stderr, "Result verification failed at element %d:\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-
-  for (int i = N1; i < Num; i++) {
-    if (fabs(h_B[i] - h_D[i]) > 1e-5) {
-      fprintf(stderr, "Check: Elements are B = %f, D = %f:\n",   h_B[i], h_D[i]);
-      fprintf(stderr, "Result verification failed at element %d:\n", i);
-      exit(EXIT_FAILURE);
-    }
-  }
-}
-
-void test_memcpy_pitched_async(sycl::queue &q) {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  size_t width = 6;
-  size_t height = 8;
-  float *h_data;
-  float *h_ref;
-  size_t h_pitch = sizeof(float) * width;
-  h_data = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_data[i] = (float)i;
-
-  h_ref = (float *)malloc(sizeof(float) * width * height);
-  for (int i = 0; i < width * height; i++)
-    h_ref[i] = (float)i;
-
-  // alloc device memory.
-  size_t d_pitch;
-  float *d_data;
-  d_data = (float *)syclcompat::malloc(d_pitch, sizeof(float) * width, height, q);
-
-  // copy to Device.
-  syclcompat::memcpy_async(d_data, d_pitch, h_data, h_pitch, sizeof(float) * width, height, q);
-
-  // copy back to host.
-  syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height, q);
-  q.wait_and_throw();
-  check(h_data, h_ref, width, height, 1);
-
-  // memset device data.
-  syclcompat::memset_async(d_data, d_pitch, 0x1, sizeof(float) * width, height, q);
-
-  // copy back to host
-  syclcompat::memcpy_async(h_data, h_pitch, d_data, d_pitch, sizeof(float) * width, height, q);
-  q.wait_and_throw();
-  // memset reference data.
-  memset(h_ref, 0x1, width * height * sizeof(float));
-  check(h_data, h_ref, width, height, 1);
-
-  free(h_data);
-  free(h_ref);
-  syclcompat::free((void *)d_data, q);
-}
-
-int main() {
-  test_mempcy_async();
-  test_buffer_and_offset_kernel();
-  test_memset_async();
-  test_memcpy_async_getptr();
-  test_memcpy_pitched_async();
-
-  sycl::queue q;
-  test_mempcy_async(q);
-  test_buffer_and_offset_kernel(q);
-  test_memset_async(q);
-  test_memcpy_async_getptr(q);
-  test_memcpy_pitched_async(q);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/shared_memory_usmnone.cpp b/sycl/test-e2e/syclcompat/memory/shared_memory_usmnone.cpp
deleted file mode 100644
index 2714977056312..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/shared_memory_usmnone.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-// ====------ shared_memory.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCLCOMPAT_USM_LEVEL_NONE
-#include <sycl/detail/core.hpp>
-#include <syclcompat/syclcompat.hpp>
-#include <stdio.h>
-#include <string.h>
-
-#define M 4
-#define N 8
-
-syclcompat::shared_memory<float, 1> array(N);
-syclcompat::shared_memory<float, 1> result(M*N);
-
-void my_kernel(float* array, float* result,
-               sycl::nd_item<3> item_ct1,
-               float *resultInGroup)
-{
-
-
-  if(item_ct1.get_group_linear_id() == 0)
-    array[item_ct1.get_local_id(2)] = item_ct1.get_local_id(2);
-  resultInGroup[item_ct1.get_local_id(2)] = item_ct1.get_group(2);
-
-  item_ct1.barrier();
-
-  if (item_ct1.get_local_id(2) == 0) {
-    memcpy(&result[item_ct1.get_group(2)*N], resultInGroup, sizeof(float)*N);
-  }
-}
-
-
-int main () {
-  {
-    std::pair<syclcompat::buffer_t, size_t> array_buf_ct0 = syclcompat::get_buffer_and_offset(array.get_ptr());
-    size_t array_offset_ct0 = array_buf_ct0.second;
-    std::pair<syclcompat::buffer_t, size_t> result_buf_ct1 = syclcompat::get_buffer_and_offset(result.get_ptr());
-    size_t result_offset_ct1 = result_buf_ct1.second;
-    syclcompat::get_default_queue().submit(
-      [&](sycl::handler &cgh) {
-        sycl::local_accessor<float, 1> resultInGroup_acc_ct1(sycl::range<1>(8), cgh);
-        auto array_acc_ct0 = array_buf_ct0.first.get_access<sycl::access::mode::read_write>(cgh);
-        auto result_acc_ct1 = result_buf_ct1.first.get_access<sycl::access::mode::read_write>(cgh);
-
-        cgh.parallel_for(
-          sycl::nd_range<3>(sycl::range<3>(1, 1, M) * sycl::range<3>(1, 1, N), sycl::range<3>(1, 1, N)), 
-          [=](sycl::nd_item<3> item_ct1) {
-            float *array_ct0 = (float *)(&array_acc_ct0[0] + array_offset_ct0);
-            float *result_ct1 = (float *)(&result_acc_ct1[0] + result_offset_ct1);
-            my_kernel(array_ct0, result_ct1, item_ct1,
-                      resultInGroup_acc_ct1
-                          .get_multi_ptr<sycl::access::decorated::no>()
-                          .get());
-          });
-      });
-  }
-
-  syclcompat::get_current_device().queues_wait_and_throw();
-  for(int j = 0; j < M; j++) {
-    for (int i = 0; i < N; i++) {
-      assert(result[j*N + i] == static_cast<float>(j));
-    }
-  }
-  for(int j = 0; j < N; j++)
-      assert(array[j] == static_cast<float>(j));
-  return 0;
-}
-
diff --git a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp
deleted file mode 100644
index 2129a9a01c006..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  usm_allocations.cpp
- *
- *  Description:
- *    USM allocation tests
- **************************************************************************/
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: target-native_cpu
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20142
-
-#include <cassert>
-#include <numeric>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "memory_common.hpp"
-#include "memory_fixt.hpp"
-
-template <typename T> void test_malloc() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  USMTest<T> usm_fixture;
-  if (usm_fixture.skip)
-    return; // Skip unsupported
-
-  usm_fixture.data = syclcompat::malloc<T>(usm_fixture.size_);
-  usm_fixture.launch_kernel();
-  usm_fixture.check_result();
-  syclcompat::free(usm_fixture.data);
-}
-
-template <typename T> void test_host() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  USMTest<T> usm_fixture;
-  if (usm_fixture.skip)
-    return; // Skip unsupported
-  if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_host_allocations))
-    return; // Skip unsupported
-
-  usm_fixture.data = syclcompat::malloc_host<T>(usm_fixture.size_);
-  usm_fixture.launch_kernel();
-  usm_fixture.check_result();
-  syclcompat::free(usm_fixture.data);
-}
-
-void test_non_templated_malloc() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  USMTest<int> usm_fixture;
-
-  usm_fixture.data =
-      static_cast<int *>(syclcompat::malloc(usm_fixture.size_ * sizeof(int)));
-  usm_fixture.launch_kernel();
-  usm_fixture.check_result();
-  syclcompat::free(usm_fixture.data);
-}
-
-void test_non_templated_host() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  USMTest<int> usm_fixture;
-  if (!usm_fixture.q_.get_device().has(sycl::aspect::usm_host_allocations))
-    return; // Skip unsupported
-
-  usm_fixture.data = static_cast<int *>(
-      syclcompat::malloc_host(usm_fixture.size_ * sizeof(int)));
-  usm_fixture.launch_kernel();
-  usm_fixture.check_result();
-  syclcompat::free(usm_fixture.data);
-}
-
-// Test deduce direction
-void test_deduce() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  auto default_queue = syclcompat::get_default_queue();
-  if (!default_queue.get_device().has(sycl::aspect::usm_host_allocations))
-    return; // Skip unsupported
-
-  int *h_ptr = (int *)syclcompat::malloc_host(sizeof(int));
-  int *sys_ptr = (int *)std::malloc(sizeof(int));
-  int *d_ptr = (int *)syclcompat::malloc(sizeof(int));
-
-  // * to host
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr,
-                                                     h_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_device);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr,
-                                                     sys_ptr) ==
-         syclcompat::detail::memcpy_direction::host_to_host);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr,
-                                                     d_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_device);
-
-  // * to sys
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr,
-                                                     h_ptr) ==
-         syclcompat::detail::memcpy_direction::host_to_host);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr,
-                                                     sys_ptr) ==
-         syclcompat::detail::memcpy_direction::host_to_host);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr,
-                                                     d_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_host);
-
-  // * to dev
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr,
-                                                     h_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_device);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr,
-                                                     sys_ptr) ==
-         syclcompat::detail::memcpy_direction::host_to_device);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr,
-                                                     d_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_device);
-
-  std::free(sys_ptr);
-  syclcompat::free(h_ptr);
-  syclcompat::free(d_ptr);
-}
-
-int main() {
-  INSTANTIATE_ALL_TYPES(value_type_list, test_malloc);
-  INSTANTIATE_ALL_TYPES(value_type_list, test_host);
-
-  // Avoid combinatorial explosion by only testing non-templated
-  // syclcompat::malloc with int type
-  test_non_templated_malloc();
-  test_non_templated_host();
-
-  test_deduce();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp
deleted file mode 100644
index 7f1925bd65b1a..0000000000000
--- a/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  usm_allocations.cpp
- *
- *  Description:
- *    USM allocation tests
- **************************************************************************/
-
-// REQUIRES: aspect-usm_shared_allocations
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: target-native_cpu
-// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20142
-
-#include <cassert>
-#include <numeric>
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/memory.hpp>
-
-#include "../common.hpp"
-#include "memory_common.hpp"
-#include "memory_fixt.hpp"
-
-template <typename T> void test_shared() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  USMTest<T> usm_fixture;
-
-  if (usm_fixture.skip)
-    return; // Skip unsupported
-
-  usm_fixture.data = syclcompat::malloc_shared<T>(usm_fixture.size_);
-  usm_fixture.launch_kernel();
-  usm_fixture.check_result();
-  syclcompat::free(usm_fixture.data);
-}
-
-void test_non_templated_shared() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-  USMTest<int> usm_fixture;
-
-  usm_fixture.data = static_cast<int *>(
-      syclcompat::malloc_shared(usm_fixture.size_ * sizeof(int)));
-  usm_fixture.launch_kernel();
-  usm_fixture.check_result();
-  syclcompat::free(usm_fixture.data);
-}
-
-// Test deduce direction
-void test_deduce_shared() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  auto default_queue = syclcompat::get_default_queue();
-
-  int *h_ptr = (int *)syclcompat::malloc_host(sizeof(int));
-  int *sys_ptr = (int *)std::malloc(sizeof(int));
-  int *d_ptr = (int *)syclcompat::malloc(sizeof(int));
-  int *s_ptr = (int *)syclcompat::malloc_shared(sizeof(int));
-
-  // * to host
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, h_ptr,
-                                                     s_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_device);
-
-  // * to sys
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, sys_ptr,
-                                                     s_ptr) ==
-         syclcompat::detail::memcpy_direction::host_to_host);
-
-  // * to dev
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, d_ptr,
-                                                     s_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_device);
-
-  // * to shared
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
-                                                     h_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_device);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
-                                                     sys_ptr) ==
-         syclcompat::detail::memcpy_direction::host_to_host);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
-                                                     d_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_device);
-  assert(syclcompat::detail::deduce_memcpy_direction(default_queue, s_ptr,
-                                                     s_ptr) ==
-         syclcompat::detail::memcpy_direction::device_to_device);
-
-  syclcompat::free(s_ptr);
-  std::free(sys_ptr);
-  syclcompat::free(h_ptr);
-  syclcompat::free(d_ptr);
-}
-
-int main() {
-  INSTANTIATE_ALL_TYPES(value_type_list, test_shared);
-  test_non_templated_shared();
-  test_deduce_shared();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/max_active_work_groups_per_cu.cpp b/sycl/test-e2e/syclcompat/util/max_active_work_groups_per_cu.cpp
deleted file mode 100644
index 78c2a00756332..0000000000000
--- a/sycl/test-e2e/syclcompat/util/max_active_work_groups_per_cu.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat
- *
- *  max_active_work_groups_per_cu.cpp
- *
- *  Description:
- *    Test the syclcompat::max_active_work_groups_per_cu API
- **************************************************************************/
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "sycl/accessor.hpp"
-#include <sycl/detail/core.hpp>
-#include <syclcompat/util.hpp>
-
-template <class T, size_t Dim>
-using sycl_global_accessor =
-    sycl::accessor<T, Dim, sycl::access::mode::read_write,
-                   sycl::access::target::global_buffer>;
-
-using value_type = int;
-
-template <int RangeDim> struct MyKernel {
-  MyKernel(sycl_global_accessor<value_type, RangeDim> acc) : acc_{acc} {}
-  void operator()(sycl::nd_item<RangeDim> item) const {
-    auto gid = item.get_global_id();
-    acc_[gid] = item.get_global_linear_id();
-  }
-  sycl_global_accessor<value_type, RangeDim> acc_;
-  static constexpr bool has_local_mem = false;
-};
-
-template <int RangeDim> struct MyLocalMemKernel {
-  MyLocalMemKernel(sycl_global_accessor<value_type, RangeDim> acc,
-                   sycl::local_accessor<value_type, RangeDim> lacc)
-      : acc_{acc}, lacc_{lacc} {}
-  void operator()(sycl::nd_item<RangeDim> item) const {
-    auto gid = item.get_global_id();
-    acc_[gid] = item.get_global_linear_id();
-    auto lid = item.get_local_id();
-    lacc_[lid] = item.get_global_linear_id();
-  }
-  sycl_global_accessor<value_type, RangeDim> acc_;
-  sycl::local_accessor<value_type, RangeDim> lacc_;
-  static constexpr bool has_local_mem = true;
-};
-
-template <template <int> class KernelName, int RangeDim>
-void test_max_active_work_groups_per_cu(sycl::queue q,
-                                        sycl::range<RangeDim> wg_range,
-                                        size_t local_mem_size = 0) {
-  if constexpr (!KernelName<RangeDim>::has_local_mem)
-    assert(local_mem_size == 0 && "Bad test setup");
-
-  size_t max_per_cu = syclcompat::max_active_work_groups_per_cu<KernelName<RangeDim>>(
-      wg_range, local_mem_size, q);
- 
-  // Check we get the same result passing equivalent dim3
-  syclcompat::dim3 wg_dim3{wg_range};
-  size_t max_per_cu_dim3 = syclcompat::max_active_work_groups_per_cu<KernelName<RangeDim>>(
-      wg_dim3, local_mem_size, q);
-  assert(max_per_cu == max_per_cu_dim3);
-
-  // Compare w/ reference impl
-  size_t max_compute_units =
-      q.get_device().get_info<sycl::info::device::max_compute_units>();
-  namespace syclex = sycl::ext::oneapi::experimental;
-  auto ctx = q.get_context();
-  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
-  auto kernel = bundle.template get_kernel<KernelName<RangeDim>>();
-  size_t max_wgs = kernel.template ext_oneapi_get_info<
-      syclex::info::kernel_queue_specific::max_num_work_groups>(
-      q, sycl::range<3>{syclcompat::dim3{wg_range}}, local_mem_size);
-  assert(max_per_cu == max_wgs / max_compute_units);
-
-  // We aren't interested in the launch, it's here to define the kernel
-  if (false) {
-    sycl::range<RangeDim> global_range = wg_range;
-    if(max_per_cu > 0)
-      global_range[0] = global_range[0] * max_per_cu * max_compute_units;
-    sycl::nd_range<RangeDim> my_range{global_range, wg_range};
-    sycl::buffer<value_type, RangeDim> buf{global_range};
-
-    q.submit([&](sycl::handler &cgh) {
-      auto acc = buf.template get_access<sycl::access::mode::read_write>(cgh);
-      if constexpr (KernelName<RangeDim>::has_local_mem) {
-        sycl::local_accessor<value_type, RangeDim> lacc(
-            my_range.get_local_range(), cgh);
-        cgh.parallel_for(my_range, KernelName<RangeDim>{acc, lacc});
-      } else {
-        cgh.parallel_for(my_range, KernelName<RangeDim>{acc});
-      }
-    });
-  }
-}
-
-int main() {
-  sycl::queue q{};
-  sycl::range<1> range_1d{32};
-  sycl::range<2> range_2d{1, 32};
-  sycl::range<3> range_3d{1, 1, 32};
-  syclcompat::dim3 wg_dim3{32, 1, 1};
-
-  size_t lmem_size_small = sizeof(value_type) * 32;
-  size_t lmem_size_medium = lmem_size_small * 32;
-  size_t lmem_size_large = lmem_size_medium * 32;
-
-  test_max_active_work_groups_per_cu<MyKernel, 3>(q, range_3d);
-  test_max_active_work_groups_per_cu<MyKernel, 2>(q, range_2d);
-  test_max_active_work_groups_per_cu<MyKernel, 1>(q, range_1d);
-  test_max_active_work_groups_per_cu<MyLocalMemKernel, 3>(q, range_3d,
-                                                          lmem_size_small);
-  test_max_active_work_groups_per_cu<MyLocalMemKernel, 3>(q, range_3d,
-                                                          lmem_size_medium);
-  test_max_active_work_groups_per_cu<MyLocalMemKernel, 3>(q, range_3d,
-                                                          lmem_size_large);
-  test_max_active_work_groups_per_cu<MyLocalMemKernel, 1>(q, range_1d,
-                                                          lmem_size_large);
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_cast_value_test.cpp b/sycl/test-e2e/syclcompat/util/util_cast_value_test.cpp
deleted file mode 100644
index d3c63f07bc9d5..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_cast_value_test.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_cast_value_test.cpp
- *
- *  Description:
- *    cast_value tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilCastValueTest.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// REQUIRES: aspect-fp64
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-
-#include <syclcompat/device.hpp>
-#include <syclcompat/util.hpp>
-
-double cast_value(const double &val) {
-  int lo = syclcompat::cast_double_to_int(val, false);
-  int hi = syclcompat::cast_double_to_int(val);
-  return syclcompat::cast_ints_to_double(hi, lo);
-}
-
-void test_kernel_cast_value(double *g_odata) {
-  double a = 1.12123515e-25f;
-  g_odata[0] = cast_value(a);
-
-  a = 0.000000000000000000000000112123515f;
-  g_odata[1] = cast_value(a);
-
-  a = 3.1415926f;
-  g_odata[2] = cast_value(a);
-}
-
-void test_cast_value() {
-  sycl::queue q = syclcompat::get_default_queue();
-
-  unsigned int num_data = 3;
-  unsigned int mem_size = sizeof(double) * num_data;
-
-  double *h_out_data = (double *)malloc(mem_size);
-
-  for (unsigned int i = 0; i < num_data; i++)
-    h_out_data[i] = 0;
-
-  double *d_out_data;
-  d_out_data = (double *)sycl::malloc_device(mem_size, q);
-  q.memcpy(d_out_data, h_out_data, mem_size).wait();
-
-  q.parallel_for(
-      sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-      [=](sycl::nd_item<3> item_ct1) { test_kernel_cast_value(d_out_data); });
-
-  q.memcpy(h_out_data, d_out_data, mem_size).wait();
-
-  assert(h_out_data[0] == 1.12123515e-25f);
-  assert(h_out_data[1] == 0.000000000000000000000000112123515f);
-  assert(h_out_data[2] == 3.1415926f);
-
-  free(h_out_data);
-  sycl::free(d_out_data, q);
-}
-
-int main() {
-  test_cast_value();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_fast_length_test_usmnone.cpp b/sycl/test-e2e/syclcompat/util/util_fast_length_test_usmnone.cpp
deleted file mode 100644
index a05595b18d6de..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_fast_length_test_usmnone.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-// ====------ util_fast_length_test.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCLCOMPAT_USM_LEVEL_NONE
-#include <sycl/detail/core.hpp>
-#include <syclcompat/syclcompat.hpp>
-
-void fast_length_test() {
-
-  {
-    float inputData_0(0.8970062715);
-
-    sycl::range<1> ndRng(1);
-    float *kernelResult = new float[1];
-    auto testQueue = syclcompat::get_default_queue();
-    {
-      sycl::buffer<float, 1> buffer(kernelResult, ndRng);
-
-      testQueue.submit([&](sycl::handler &h) {
-        auto resultPtr =
-            buffer.template get_access<sycl::access::mode::write>(h);
-
-        h.single_task(
-            [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0, 1); });
-      });
-    }
-    testQueue.wait_and_throw();
-    if (fabs(inputData_0 - *kernelResult) > 1e-5) {
-      printf("fast_length_test 1 failed\n");
-      exit(-1);
-    }
-    delete[] kernelResult;
-  }
-
-  {
-    float inputData_0[2] = {0.8335529744, 0.7346600673};
-
-    sycl::range<1> ndRng(1);
-    float *kernelResult = new float[1];
-    auto testQueue = syclcompat::get_default_queue();
-    {
-      sycl::buffer<float, 1> buffer(kernelResult, ndRng);
-
-      testQueue.submit([&](sycl::handler &h) {
-        auto resultPtr =
-            buffer.template get_access<sycl::access::mode::write>(h);
-
-        h.single_task(
-            [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0[0], 2); });
-      });
-    }
-    testQueue.wait_and_throw();
-
-    if (fabs(sqrtf(0.8335529744 * 0.8335529744 + 0.7346600673 * 0.7346600673) -
-             *kernelResult) > 1e-5) {
-      printf("fast_length_test 2 failed\n");
-      exit(-1);
-    }
-
-    delete[] kernelResult;
-  }
-
-  {
-    float inputData_0[3] = {0.1658983906, 0.590226484, 0.4891553616};
-
-    sycl::range<1> ndRng(1);
-    float *kernelResult = new float[1];
-    auto testQueue = syclcompat::get_default_queue();
-    {
-      sycl::buffer<float, 1> buffer(kernelResult, ndRng);
-
-      testQueue.submit([&](sycl::handler &h) {
-        auto resultPtr =
-            buffer.template get_access<sycl::access::mode::write>(h);
-
-        h.single_task(
-            [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0[0], 3); });
-      });
-    }
-    testQueue.wait_and_throw();
-
-    if (fabs(sqrtf(0.1658983906 * 0.1658983906 + 0.590226484 * 0.590226484 +
-                   0.4891553616 * 0.4891553616) -
-             *kernelResult) > 1e-5) {
-      printf("fast_length_test 3 failed\n");
-      exit(-1);
-    }
-
-    delete[] kernelResult;
-  }
-
-  {
-    float inputData_0[4] = {0.6041178723, 0.7760620605, 0.2944284976,
-                            0.6851913766};
-
-    sycl::range<1> ndRng(1);
-    float *kernelResult = new float[1];
-    auto testQueue = syclcompat::get_default_queue();
-    {
-      sycl::buffer<float, 1> buffer(kernelResult, ndRng);
-
-      testQueue.submit([&](sycl::handler &h) {
-        auto resultPtr =
-            buffer.template get_access<sycl::access::mode::write>(h);
-
-        h.single_task(
-            [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0[0], 4); });
-      });
-    }
-    testQueue.wait_and_throw();
-
-    if (fabs(sqrtf(0.6041178723 * 0.6041178723 + 0.7760620605 * 0.7760620605 +
-                   0.2944284976 * 0.2944284976 + 0.6851913766 * 0.6851913766) -
-             *kernelResult) > 1e-5) {
-      printf("fast_length_test 4 failed\n");
-      exit(-1);
-    }
-
-    delete[] kernelResult;
-  }
-
-  {
-    float inputData_0[5] = {0.6041178723, 0.7760620605, 0.2944284976,
-                            0.6851913766, 0.6851913766};
-
-    sycl::range<1> ndRng(1);
-    float *kernelResult = new float[1];
-    auto testQueue = syclcompat::get_default_queue();
-    {
-      sycl::buffer<float, 1> buffer(kernelResult, ndRng);
-
-      testQueue.submit([&](sycl::handler &h) {
-        auto resultPtr =
-            buffer.template get_access<sycl::access::mode::write>(h);
-
-        h.single_task(
-            [=]() { resultPtr[0] = syclcompat::fast_length(&inputData_0[0], 5); });
-      });
-    }
-    testQueue.wait_and_throw();
-
-    if (fabs(sqrtf(0.6041178723 * 0.6041178723 + 0.7760620605 * 0.7760620605 +
-                   0.2944284976 * 0.2944284976 + 0.6851913766 * 0.6851913766 +
-                   0.6851913766 * 0.6851913766) -
-             *kernelResult) > 1e-5) {
-      printf("fast_length_test 5 failed\n");
-      exit(-1);
-    }
-
-    delete[] kernelResult;
-  }
-  printf("fast_length test is passed!\n");
-}
-
-int main() {
-
-  fast_length_test();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_find_first_set.cpp b/sycl/test-e2e/syclcompat/util/util_find_first_set.cpp
deleted file mode 100644
index ba0f8245eadb7..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_find_first_set.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_find_first_set.cpp
- *
- *  Description:
- *    Find_first_set tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilFindFirstSet.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-void find_first_set_test(int *test_result) {
-  int a;
-  unsigned long long int lla;
-  int result;
-  a = 0;
-  result = syclcompat::ffs(a);
-  if (result != 0) {
-    *test_result = 1;
-    return;
-  }
-
-  a = -2147483648;
-  result = syclcompat::ffs(a);
-  if (result != 32) {
-    *test_result = 1;
-    return;
-  }
-
-  a = 128;
-  result = syclcompat::ffs(a);
-  if (result != 8) {
-    *test_result = 1;
-    return;
-  }
-
-  a = 2147483647;
-  result = syclcompat::ffs(a);
-  if (result != 1) {
-    *test_result = 1;
-    return;
-  }
-
-  lla = -9223372036854775808Ull;
-  result = syclcompat::ffs(lla);
-  if (result != 64) {
-    *test_result = 1;
-    return;
-  }
-
-  lla = -9223372036854775807Ull;
-  result = syclcompat::ffs(lla);
-  if (result != 1) {
-    *test_result = 1;
-    return;
-  }
-
-  lla = -9223372034707292160Ull;
-  result = syclcompat::ffs(lla);
-  if (result != 32) {
-    *test_result = 1;
-    return;
-  }
-
-  lla = 2147483648Ull;
-  result = syclcompat::ffs(lla);
-  if (result != 32) {
-    *test_result = 1;
-    return;
-  }
-}
-
-void test_find_first_set() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = *dev_ct1.default_queue();
-  int *test_result, host_test_result = 0;
-
-  test_result = sycl::malloc_device<int>(1, q_ct1);
-  q_ct1.memcpy(test_result, &host_test_result, sizeof(int)).wait();
-
-  q_ct1.parallel_for(
-      sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-      [=](sycl::nd_item<3> item_ct1) { find_first_set_test(test_result); });
-
-  dev_ct1.queues_wait_and_throw();
-  find_first_set_test(&host_test_result);
-  assert(host_test_result == 0);
-  q_ct1.memcpy(&host_test_result, test_result, sizeof(int)).wait();
-  assert(host_test_result == 0);
-
-  sycl::free(test_result, q_ct1);
-}
-
-int main() {
-  test_find_first_set();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_helpers.cpp b/sycl/test-e2e/syclcompat/util/util_helpers.cpp
deleted file mode 100644
index c8abfd124c85b..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_helpers.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_helpers.cpp
- *
- *  Description:
- *    generic utility helpers tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilSelectFromSubGroup.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat/util.hpp>
-
-void test_reinterpreted_queue_ptr() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  sycl::queue q;
-  sycl::queue *q_ptr = &q;
-  uintptr_t reinterpreted_q = reinterpret_cast<uintptr_t>(q_ptr);
-  assert(q_ptr == syclcompat::int_as_queue_ptr(reinterpreted_q));
-}
-
-void test_default_queue_from_int_as_queue_ptr() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  // Check that int_as_queue_ptr with x < 2 maps to the default queue.
-  // Queue addresses may not be equal, but the queues should have the same
-  // device.
-  auto default_name = syclcompat::get_default_queue()
-                          .get_device()
-                          .get_info<sycl::info::device::name>();
-  auto int_as_queue_name = syclcompat::int_as_queue_ptr(1)
-                               ->get_device()
-                               .get_info<sycl::info::device::name>();
-
-  assert(default_name == int_as_queue_name);
-}
-
-void foo(sycl::float2 *x, int n, sycl::nd_item<3> item_ct1, float f = .1) {}
-
-void test_args_selector() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int n = 2;
-  sycl::float2 *a = syclcompat::malloc_host<sycl::float2>(n);
-  a[0] = {1.0, 2.0};
-  a[1] = {3.0, 4.0};
-  float f = .1;
-
-  void *kernelParams[3] = {
-      static_cast<void *>(&a),
-      static_cast<void *>(&n),
-      static_cast<void *>(&f),
-  };
-
-  syclcompat::args_selector<2, 1, decltype(foo)> selector(kernelParams,
-                                                          nullptr);
-  auto &a_ref = selector.get<0>();
-  auto &n_ref = selector.get<1>();
-  auto &f_ref = selector.get<2>();
-
-  assert(a_ref[0][0] == 1.0);
-  assert(a_ref[0][1] == 2.0);
-  assert(a_ref[1][0] == 3.0);
-  assert(a_ref[1][1] == 4.0);
-  assert(n_ref == 2);
-  assert(f_ref == .1f);
-
-  syclcompat::free(a);
-}
-
-int main() {
-  test_reinterpreted_queue_ptr();
-  test_default_queue_from_int_as_queue_ptr();
-  test_args_selector();
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_logical_group.cpp b/sycl/test-e2e/syclcompat/util/util_logical_group.cpp
deleted file mode 100644
index 8bf7ce8238379..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_logical_group.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_logical_group.cpp
- *
- *  Description:
- *    logical_group tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilLogicalGroup.cpp -------------------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// REQUIRES: sg-32
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <cstdio>
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-// work-item:
-// 0 ... 7, 8 ... 15, 16 ... 23, 24 ... 31, 32 ... 39, 40 ... 47, 48 ... 51
-// -------  --------  ---------  ---------  ---------  ---------  ---------
-// 0        1         2          3          4          5          6
-
-void kernel(unsigned int *result, sycl::nd_item<3> item_ct1) {
-  auto ttb = item_ct1.get_group();
-  syclcompat::experimental::logical_group tbt =
-      syclcompat::experimental::logical_group(item_ct1, item_ct1.get_group(),
-                                              8);
-
-  if (item_ct1.get_local_id(2) == 50) {
-    result[0] = tbt.get_local_linear_range();
-    result[1] = tbt.get_local_linear_id();
-    result[2] = tbt.get_group_linear_range();
-    result[3] = tbt.get_group_linear_id();
-  }
-}
-
-void test_logical_group() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue q_ct1 = *dev_ct1.default_queue();
-  unsigned int result_host[4];
-  unsigned int *result_device;
-  result_device = sycl::malloc_device<unsigned int>(4, q_ct1);
-  q_ct1.parallel_for(
-      sycl::nd_range<3>(sycl::range<3>(1, 1, 52), sycl::range<3>(1, 1, 52)),
-      [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(32)]] {
-        kernel(result_device, item_ct1);
-      });
-  q_ct1.memcpy(result_host, result_device, sizeof(unsigned int) * 4).wait();
-  sycl::free(result_device, q_ct1);
-
-  assert(result_host[0] == 4);
-  assert(result_host[1] == 2);
-  assert(result_host[2] == 7);
-  assert(result_host[3] == 6);
-}
-
-int main() {
-  test_logical_group();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_match_all_over_group.cpp b/sycl/test-e2e/syclcompat/util/util_match_all_over_group.cpp
deleted file mode 100644
index a1abe3bae1ed0..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_match_all_over_group.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_match_all_over_group.cpp
- *
- *  Description:
- *    util_match_all_over_group tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilSelectFromSubGroup.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// REQUIRES: sg-32
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-constexpr unsigned int NUM_TESTS = 3;
-constexpr unsigned int SUBGROUP_SIZE = 32;
-constexpr unsigned int DATA_SIZE = NUM_TESTS * SUBGROUP_SIZE;
-
-void test_match_all_over_group() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{SUBGROUP_SIZE};
-
-  unsigned int input[DATA_SIZE] = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, // #1
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, // #2
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-      0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, // #3
-      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-  };
-  unsigned int output[DATA_SIZE];
-  int pred[DATA_SIZE];
-  unsigned int *d_input = syclcompat::malloc<unsigned int>(DATA_SIZE);
-  unsigned int *d_output = syclcompat::malloc<unsigned int>(DATA_SIZE);
-  int *d_pred = syclcompat::malloc<int>(DATA_SIZE);
-
-  unsigned int member_mask = 0x00FF;
-  unsigned int expected[DATA_SIZE] = {
-      0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, // #1
-      0,      0,      0,      0,      0,      0,      0,      0,
-      0,      0,      0,      0,      0,      0,      0,      0,
-      0,      0,      0,      0,      0,      0,      0,      0,
-      0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, // #2
-      0,      0,      0,      0,      0,      0,      0,      0,
-      0,      0,      0,      0,      0,      0,      0,      0,
-      0,      0,      0,      0,      0,      0,      0,      0,
-      0,      0,      0,      0,      0,      0,      0,      0, // #3
-      0,      0,      0,      0,      0,      0,      0,      0,
-      0,      0,      0,      0,      0,      0,      0,      0,
-      0,      0,      0,      0,      0,      0,      0,      0,
-  };
-  unsigned int expected_pred[DATA_SIZE] = {
-      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // #1
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // #2
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // #3
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  };
-
-  syclcompat::memcpy<unsigned int>(d_input, input, DATA_SIZE);
-  syclcompat::memset(d_output, 0, DATA_SIZE * sizeof(unsigned int));
-  syclcompat::memset(d_pred, 1, DATA_SIZE * sizeof(int));
-
-  sycl::queue q = syclcompat::get_default_queue();
-  q.parallel_for(
-      sycl::nd_range<1>(threads.size(), threads.size()),
-      [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] {
-        for (auto id = item.get_global_linear_id(); id < DATA_SIZE;
-             id += SUBGROUP_SIZE)
-          d_output[id] = syclcompat::match_all_over_sub_group(
-              item.get_sub_group(), member_mask, d_input[id], &d_pred[id]);
-      });
-  q.wait_and_throw();
-  syclcompat::memcpy<unsigned int>(output, d_output, DATA_SIZE);
-  syclcompat::memcpy<int>(pred, d_pred, DATA_SIZE);
-
-  for (int i = 0; i < DATA_SIZE; ++i) {
-    assert(output[i] == expected[i]);
-    assert(pred[i] == expected_pred[i]);
-  }
-
-  syclcompat::free(d_input);
-  syclcompat::free(d_output);
-  syclcompat::free(d_pred);
-}
-
-int main() {
-  test_match_all_over_group();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_match_any_over_group.cpp b/sycl/test-e2e/syclcompat/util/util_match_any_over_group.cpp
deleted file mode 100644
index 3a78768599f65..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_match_any_over_group.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_match_any_over_group.cpp
- *
- *  Description:
- *    util_match_any_over_group tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilSelectFromSubGroup.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// REQUIRES: sg-32
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-constexpr unsigned int NUM_TESTS = 3;
-constexpr unsigned int SUBGROUP_SIZE = 32;
-constexpr unsigned int DATA_SIZE = NUM_TESTS * SUBGROUP_SIZE;
-
-void test_match_any_over_group() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  constexpr syclcompat::dim3 grid{1};
-  constexpr syclcompat::dim3 threads{DATA_SIZE};
-
-  unsigned int input[DATA_SIZE] = {
-      0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
-      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // Subgroup #1
-      0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2,
-      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // Subgroup #2
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // Subgroup #3
-  };
-  unsigned int output[DATA_SIZE];
-  unsigned int *d_input = syclcompat::malloc<unsigned int>(DATA_SIZE);
-  unsigned int *d_output = syclcompat::malloc<unsigned int>(DATA_SIZE);
-
-  unsigned int member_mask = 0x0FFF;
-  unsigned int expected[DATA_SIZE] = {
-      0x000F, 0x000F, 0x000F, 0x000F, 0x00F0, 0x00F0, 0x00F0, 0x00F0, //
-      0x0F00, 0x0F00, 0x0F00, 0x0F00, 0,      0,      0,      0,      //
-      0,      0,      0,      0,      0,      0,      0,      0,      //
-      0,      0,      0,      0,      0,      0,      0,      0,      // #1
-      0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, //
-      0x0F00, 0x0F00, 0x0F00, 0x0F00, 0,      0,      0,      0,      //
-      0,      0,      0,      0,      0,      0,      0,      0,      //
-      0,      0,      0,      0,      0,      0,      0,      0,      // #2
-      0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF, //
-      0x0FFF, 0x0FFF, 0x0FFF, 0x0FFF, 0,      0,      0,      0,      //
-      0,      0,      0,      0,      0,      0,      0,      0,      //
-      0,      0,      0,      0,      0,      0,      0,      0,      // #3
-  };
-
-  syclcompat::memcpy<unsigned int>(d_input, input, DATA_SIZE);
-  sycl::queue q = syclcompat::get_default_queue();
-  q.parallel_for(
-      sycl::nd_range<1>(grid.size() * threads.size(), threads.size()),
-      [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(SUBGROUP_SIZE)]] {
-        auto id = item.get_global_linear_id();
-        d_output[id] = syclcompat::match_any_over_sub_group(
-            item.get_sub_group(), member_mask, d_input[id]);
-      });
-  q.wait_and_throw();
-  syclcompat::memcpy<unsigned int>(output, d_output, DATA_SIZE);
-
-  for (int i = 0; i < DATA_SIZE; ++i) {
-    assert(output[i] == expected[i]);
-  }
-
-  syclcompat::free(d_input);
-  syclcompat::free(d_output);
-}
-
-int main() {
-  test_match_any_over_group();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_matrix_mem_copy_test.cpp b/sycl/test-e2e/syclcompat/util/util_matrix_mem_copy_test.cpp
deleted file mode 100644
index 96dc5fdf4f4cf..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_matrix_mem_copy_test.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_matrix_mem_copy_test.cpp
- *
- *  Description:
- *    matrix_mem_copy tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilMatrixMemCopyTest.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <iostream>
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-#define M 3
-#define N 2
-
-void test_matrix_mem_copy_1() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-  float *devPtrA;
-  devPtrA = (float *)sycl::malloc_device(M * N * sizeof(float), *q_ct1);
-  float host_a[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  float host_b[6] = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
-  float host_c[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-
-  syclcompat::detail::matrix_mem_copy((void *)devPtrA, (void *)host_a, M, M, M,
-                                      N, sizeof(float));
-  syclcompat::detail::matrix_mem_copy((void *)host_b, (void *)devPtrA, M, M, M,
-                                      N, sizeof(float));
-
-  for (int i = 0; i < M * N; i++) {
-    assert(fabs(host_b[i] - host_c[i]) <= 1e-5);
-  }
-
-  // Because to_ld == from_ld, matrix_mem_copy just do one copy.
-  // All padding data is also copied except the last padding.
-  float host_d[6] = {-2.0f, -2.0f, -2.0f, -2.0f, -2.0f, -2.0f};
-  float host_e[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, -2.0f};
-  syclcompat::detail::matrix_mem_copy(
-      (void *)host_d, (void *)devPtrA, M /*to_ld*/, M /*from_ld*/,
-      M - 1 /*rows*/, N /*cols*/, sizeof(float));
-
-  for (int i = 0; i < M * N; i++) {
-    assert(fabs(host_d[i] - host_e[i]) <= 1e-5);
-  }
-
-  sycl::free(devPtrA, *q_ct1);
-}
-
-void test_matrix_mem_copy_2() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-
-  float *devPtrA;
-  devPtrA = (float *)sycl::malloc_device(M * N * sizeof(float), *q_ct1);
-  float host_a[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-  float host_b[6] = {-1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f};
-  float host_c[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-
-  syclcompat::detail::matrix_mem_copy(devPtrA, host_a, M, M, M, N);
-  syclcompat::detail::matrix_mem_copy(host_b, devPtrA, M, M, M, N);
-
-  for (int i = 0; i < M * N; i++) {
-    assert(fabs(host_b[i] - host_c[i]) <= 1e-5);
-  }
-
-  float host_d[4] = {-2.0f, -2.0f, -2.0f, -2.0f};
-  float host_e[4] = {1.0f, 2.0f, 4.0f, 5.0f};
-  syclcompat::detail::matrix_mem_copy(host_d, devPtrA, M - 1 /*to_ld*/,
-                                      M /*from_ld*/, M - 1 /*rows*/,
-                                      N /*cols*/);
-
-  for (int i = 0; i < (M - 1) * (N - 1); i++) {
-    assert(fabs(host_d[i] - host_e[i]) <= 1e-5);
-  }
-
-  sycl::free(devPtrA, *q_ct1);
-}
-
-int main() {
-  test_matrix_mem_copy_1();
-  test_matrix_mem_copy_2();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp b/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp
deleted file mode 100644
index 4573c1d7b214e..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_nd_range_barrier_test.cpp
+++ /dev/null
@@ -1,192 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_nd_range_barrier_test.cpp
- *
- *  Description:
- *    nd_range_barrier tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilNdRangeBarrierTest.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <cstring>
-
-#include <iostream>
-#include <stdio.h>
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-void kernel_1(sycl::nd_item<3> item_ct1,
-              sycl::atomic_ref<
-                  unsigned int, syclcompat::experimental::barrier_memory_order,
-                  sycl::memory_scope::device,
-                  sycl::access::address_space::global_space> &sync_ct1) {
-  syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
-}
-
-void kernel_2(sycl::nd_item<3> item_ct1,
-              sycl::atomic_ref<
-                  unsigned int, syclcompat::experimental::barrier_memory_order,
-                  sycl::memory_scope::device,
-                  sycl::access::address_space::global_space> &sync_ct1) {
-  syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
-
-  syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
-}
-
-void test_nd_range_barrier_dim3() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-  {
-    syclcompat::global_memory<unsigned int, 0> d_sync_ct1;
-    unsigned *sync_ct1 = d_sync_ct1.get_ptr(syclcompat::get_default_queue());
-    syclcompat::get_default_queue().memset(sync_ct1, 0, sizeof(int)).wait();
-
-    q_ct1
-        ->submit([&](sycl::handler &cgh) {
-          cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, 4) *
-                                                 sycl::range<3>(1, 1, 4),
-                                             sycl::range<3>(1, 1, 4)),
-                           [=](sycl::nd_item<3> item_ct1) {
-                             auto atm_sync_ct1 = sycl::atomic_ref<
-                                 unsigned int,
-                                 syclcompat::experimental::barrier_memory_order,
-                                 sycl::memory_scope::device,
-                                 sycl::access::address_space::global_space>(
-                                 sync_ct1[0]);
-                             kernel_1(item_ct1, atm_sync_ct1);
-                           });
-        })
-        .wait();
-  }
-  dev_ct1.queues_wait_and_throw();
-
-  {
-
-    syclcompat::global_memory<unsigned int, 0> d_sync_ct1;
-    unsigned *sync_ct1 = d_sync_ct1.get_ptr(syclcompat::get_default_queue());
-    syclcompat::get_default_queue().memset(sync_ct1, 0, sizeof(int)).wait();
-
-    q_ct1
-        ->submit([&](sycl::handler &cgh) {
-          cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, 4) *
-                                                 sycl::range<3>(1, 1, 4),
-                                             sycl::range<3>(1, 1, 4)),
-                           [=](sycl::nd_item<3> item_ct1) {
-                             auto atm_sync_ct1 = sycl::atomic_ref<
-                                 unsigned int,
-                                 syclcompat::experimental::barrier_memory_order,
-                                 sycl::memory_scope::device,
-                                 sycl::access::address_space::global_space>(
-                                 sync_ct1[0]);
-                             kernel_2(item_ct1, atm_sync_ct1);
-                           });
-        })
-        .wait();
-  }
-  dev_ct1.queues_wait_and_throw();
-}
-
-void kernel_1(sycl::nd_item<1> item_ct1,
-              sycl::atomic_ref<
-                  unsigned int, syclcompat::experimental::barrier_memory_order,
-                  sycl::memory_scope::device,
-                  sycl::access::address_space::global_space> &sync_ct1) {
-  syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
-}
-
-void kernel_2(sycl::nd_item<1> item_ct1,
-              sycl::atomic_ref<
-                  unsigned int, syclcompat::experimental::barrier_memory_order,
-                  sycl::memory_scope::device,
-                  sycl::access::address_space::global_space> &sync_ct1) {
-  syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
-
-  syclcompat::experimental::nd_range_barrier(item_ct1, sync_ct1);
-}
-
-void test_nd_range_barrier_dim1() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-
-  {
-    syclcompat::global_memory<unsigned int, 0> d_sync_ct1;
-    unsigned *sync_ct1 = d_sync_ct1.get_ptr(syclcompat::get_default_queue());
-    syclcompat::get_default_queue().memset(sync_ct1, 0, sizeof(int)).wait();
-
-    q_ct1
-        ->submit([&](sycl::handler &cgh) {
-          cgh.parallel_for(
-              sycl::nd_range<1>(sycl::range<1>(4) * sycl::range<1>(4),
-                                sycl::range<1>(4)),
-              [=](sycl::nd_item<1> item_ct1) {
-                auto atm_sync_ct1 = sycl::atomic_ref<
-                    unsigned int,
-                    syclcompat::experimental::barrier_memory_order,
-                    sycl::memory_scope::device,
-                    sycl::access::address_space::global_space>(sync_ct1[0]);
-                kernel_1(item_ct1, atm_sync_ct1);
-              });
-        })
-        .wait();
-  }
-
-  dev_ct1.queues_wait_and_throw();
-  {
-    syclcompat::global_memory<unsigned int, 0> d_sync_ct1;
-    unsigned *sync_ct1 = d_sync_ct1.get_ptr(syclcompat::get_default_queue());
-    syclcompat::get_default_queue().memset(sync_ct1, 0, sizeof(int)).wait();
-
-    q_ct1
-        ->submit([&](sycl::handler &cgh) {
-          cgh.parallel_for(
-              sycl::nd_range<1>(sycl::range<1>(4) * sycl::range<1>(4),
-                                sycl::range<1>(4)),
-              [=](sycl::nd_item<1> item_ct1) {
-                auto atm_sync_ct1 = sycl::atomic_ref<
-                    unsigned int,
-                    syclcompat::experimental::barrier_memory_order,
-                    sycl::memory_scope::device,
-                    sycl::access::address_space::global_space>(sync_ct1[0]);
-                kernel_2(item_ct1, atm_sync_ct1);
-              });
-        })
-        .wait();
-  }
-  dev_ct1.queues_wait_and_throw();
-}
-
-int main() {
-  test_nd_range_barrier_dim1();
-  test_nd_range_barrier_dim3();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_occupancy_calculation.cpp b/sycl/test-e2e/syclcompat/util/util_occupancy_calculation.cpp
deleted file mode 100644
index e798363dee9ed..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_occupancy_calculation.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_occupancy_calculation.cpp
- *
- *  Description:
- *    max_potential_wg and max_active_wg tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====--- util_calculate_max_active_wg_per_xecore.cpp ----- *- C++ -* ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// ===---------------------------------------------------------------------===//
-
-// REQUIRES: gpu
-// REQUIRES: level_zero || opencl
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <syclcompat/util.hpp>
-
-// These tests only check the API, not the functionality itself.
-void test_calculate_max_active_wg_per_xecore() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int num_blocks;
-  int block_size = 128;
-  size_t dynamic_shared_memory_size = 0;
-  int sg_size = 32;
-  bool used_barrier = true;
-  bool used_large_grf = true;
-  syclcompat::experimental::calculate_max_active_wg_per_xecore(
-      &num_blocks, block_size, dynamic_shared_memory_size, sg_size,
-      used_barrier, used_large_grf);
-}
-
-void test_calculate_max_potential_wg() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  int num_blocks;
-  int block_size = 128;
-  size_t dynamic_shared_memory_size = 0;
-  int sg_size = 32;
-  bool used_barrier = true;
-  bool used_large_grf = true;
-
-  int block_size_limit = 0;
-  syclcompat::experimental::calculate_max_potential_wg(
-      &num_blocks, &block_size, block_size_limit, dynamic_shared_memory_size,
-      sg_size, used_barrier, used_large_grf);
-}
-
-int main() {
-  test_calculate_max_active_wg_per_xecore();
-  test_calculate_max_potential_wg();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_perm_byte_test.cpp b/sycl/test-e2e/syclcompat/util/util_perm_byte_test.cpp
deleted file mode 100644
index f9fed15ec01bb..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_perm_byte_test.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_perm_byte_test.cpp
- *
- *  Description:
- *    byte_level_permute tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilPermByteTest.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-void byte_perm_ref(unsigned int *d_data) {
-
-  unsigned int lo;
-  unsigned int hi;
-
-  lo = 0x33221100;
-  hi = 0x77665544;
-
-  lo = 0x33221100;
-  hi = 0x77665544;
-
-  for (int i = 0; i < 17; i++)
-    d_data[i] = syclcompat::byte_level_permute(lo, hi, 0x1111 * i);
-
-  d_data[17] = syclcompat::byte_level_permute(lo, 0, 0x0123);
-  d_data[18] = syclcompat::byte_level_permute(lo, hi, 0x7531);
-  d_data[19] = syclcompat::byte_level_permute(lo, hi, 0x6420);
-}
-
-void test_byte_level_permute() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  const int N = 20;
-  unsigned int refer[N] = {0x0,        0x11111111, 0x22222222, 0x33333333,
-                           0x44444444, 0x55555555, 0x66666666, 0x77777777,
-                           0x0,        0x11111111, 0x22222222, 0x33333333,
-                           0x44444444, 0x55555555, 0x66666666, 0x77777777,
-                           0x11111100, 0x112233,   0x77553311, 0x66442200};
-  unsigned int data[N];
-
-  byte_perm_ref(data);
-
-  for (int i = 0; i < N; i++) {
-    assert(refer[i] == data[i]);
-  }
-}
-
-int main() {
-  test_byte_level_permute();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp b/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp
deleted file mode 100644
index 6b0b478b1e367..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_permute_sub_group_by_xor.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_permute_sub_group_by_xor.cpp
- *
- *  Description:
- *    permute_sub_group_by_xor tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilPermuteSubGroupByXor.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// REQUIRES: sg-32
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-#define WARP_SIZE 32
-#define DATA_NUM 128
-
-using namespace sycl::ext::oneapi::experimental;
-
-template <typename T = int> void init_data(T *data, int num) {
-  for (int i = 0; i < num; i++)
-    data[i] = i;
-}
-
-template <typename T = int>
-void verify_data(T *data, T *expect, int num, int step = 1) {
-  for (int i = 0; i < num; i = i + step) {
-    assert(data[i] == expect[i]);
-  }
-}
-
-void permute_sub_group_by_xor1(unsigned int *data, sycl::nd_item<3> item_ct1) {
-  int threadid = item_ct1.get_local_id(2) +
-                 item_ct1.get_local_id(1) * item_ct1.get_local_range(2) +
-                 item_ct1.get_local_id(0) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) +
-                 item_ct1.get_group(2) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) * item_ct1.get_local_range(0);
-  int output = 0;
-  output = syclcompat::permute_sub_group_by_xor(item_ct1.get_sub_group(),
-                                                threadid, 2);
-  data[threadid] = output;
-}
-
-void permute_sub_group_by_xor2(unsigned int *data, sycl::nd_item<3> item_ct1) {
-  int threadid = item_ct1.get_local_id(2) +
-                 item_ct1.get_local_id(1) * item_ct1.get_local_range(2) +
-                 item_ct1.get_local_id(0) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) +
-                 item_ct1.get_group(2) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) * item_ct1.get_local_range(0);
-  int output = 0;
-  output = syclcompat::permute_sub_group_by_xor(item_ct1.get_sub_group(),
-                                                threadid, 1, 8);
-  data[threadid] = output;
-}
-
-void test_permute_sub_group_by_xor() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-  bool Result = true;
-  unsigned int *dev_data_u = nullptr;
-  sycl::range<3> GridSize(1, 1, 1);
-  sycl::range<3> BlockSize(1, 1, 1);
-  dev_data_u = sycl::malloc_device<unsigned int>(DATA_NUM, *q_ct1);
-
-  GridSize = sycl::range<3>(1, 1, 2);
-  BlockSize = sycl::range<3>(1, 2, 32);
-  unsigned int expect1[DATA_NUM] = {
-      2,   3,   0,   1,   6,   7,   4,   5,   10,  11,  8,   9,   14,  15,  12,
-      13,  18,  19,  16,  17,  22,  23,  20,  21,  26,  27,  24,  25,  30,  31,
-      28,  29,  34,  35,  32,  33,  38,  39,  36,  37,  42,  43,  40,  41,  46,
-      47,  44,  45,  50,  51,  48,  49,  54,  55,  52,  53,  58,  59,  56,  57,
-      62,  63,  60,  61,  66,  67,  64,  65,  70,  71,  68,  69,  74,  75,  72,
-      73,  78,  79,  76,  77,  82,  83,  80,  81,  86,  87,  84,  85,  90,  91,
-      88,  89,  94,  95,  92,  93,  98,  99,  96,  97,  102, 103, 100, 101, 106,
-      107, 104, 105, 110, 111, 108, 109, 114, 115, 112, 113, 118, 119, 116, 117,
-      122, 123, 120, 121, 126, 127, 124, 125};
-  unsigned int host_dev_data_u[DATA_NUM];
-  init_data<unsigned int>(host_dev_data_u, DATA_NUM);
-  q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-
-  q_ct1->parallel_for(sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
-                      [=](sycl::nd_item<3> item_ct1)
-                          [[sycl::reqd_sub_group_size(32)]] {
-                            permute_sub_group_by_xor1(dev_data_u, item_ct1);
-                          });
-
-  dev_ct1.queues_wait_and_throw();
-  q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  verify_data<unsigned int>(host_dev_data_u, expect1, DATA_NUM);
-  sycl::free(dev_data_u, *q_ct1);
-}
-
-void test_permute_sub_group_by_xor_extra_arg() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-  bool Result = true;
-  unsigned int *dev_data_u = nullptr;
-  sycl::range<3> GridSize(1, 1, 1);
-  sycl::range<3> BlockSize(1, 1, 1);
-  dev_data_u = sycl::malloc_device<unsigned int>(DATA_NUM, *q_ct1);
-
-  GridSize = sycl::range<3>(1, 1, 2);
-  BlockSize = sycl::range<3>(1, 2, 32);
-  unsigned int expect2[DATA_NUM] = {
-      1,   0,   3,   2,   5,   4,   7,   6,   9,   8,   11,  10,  13,  12,  15,
-      14,  17,  16,  19,  18,  21,  20,  23,  22,  25,  24,  27,  26,  29,  28,
-      31,  30,  33,  32,  35,  34,  37,  36,  39,  38,  41,  40,  43,  42,  45,
-      44,  47,  46,  49,  48,  51,  50,  53,  52,  55,  54,  57,  56,  59,  58,
-      61,  60,  63,  62,  65,  64,  67,  66,  69,  68,  71,  70,  73,  72,  75,
-      74,  77,  76,  79,  78,  81,  80,  83,  82,  85,  84,  87,  86,  89,  88,
-      91,  90,  93,  92,  95,  94,  97,  96,  99,  98,  101, 100, 103, 102, 105,
-      104, 107, 106, 109, 108, 111, 110, 113, 112, 115, 114, 117, 116, 119, 118,
-      121, 120, 123, 122, 125, 124, 127, 126};
-  unsigned int host_dev_data_u[DATA_NUM];
-  init_data<unsigned int>(host_dev_data_u, DATA_NUM);
-
-  q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  q_ct1->parallel_for(sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
-                      [=](sycl::nd_item<3> item_ct1)
-                          [[sycl::reqd_sub_group_size(32)]] {
-                            permute_sub_group_by_xor2(dev_data_u, item_ct1);
-                          });
-
-  dev_ct1.queues_wait_and_throw();
-  q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  verify_data<unsigned int>(host_dev_data_u, expect2, DATA_NUM);
-  sycl::free(dev_data_u, *q_ct1);
-}
-
-int main() {
-  test_permute_sub_group_by_xor();
-  test_permute_sub_group_by_xor_extra_arg();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_reverse_bits_test.cpp b/sycl/test-e2e/syclcompat/util/util_reverse_bits_test.cpp
deleted file mode 100644
index 6336c08c837be..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_reverse_bits_test.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_reverse_bits_test.cpp
- *
- *  Description:
- *    reverse_bits tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilReverseBitsTest.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-void test_reverse_bits() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  unsigned int a = 1;
-  unsigned int b = syclcompat::reverse_bits(a);
-  assert(b == 0x80000000);
-
-  a = 0x12345678;
-  b = syclcompat::reverse_bits(a);
-  assert(b == 0x1e6a2c48);
-}
-
-int main() {
-  test_reverse_bits();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_select_from_sub_group.cpp b/sycl/test-e2e/syclcompat/util/util_select_from_sub_group.cpp
deleted file mode 100644
index ffad55f257430..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_select_from_sub_group.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_select_from_sub_group.cpp
- *
- *  Description:
- *    select_from_sub_group tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilSelectFromSubGroup.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// REQUIRES: sg-32
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-#define WARP_SIZE 32
-#define DATA_NUM 128
-
-template <typename T = int> void init_data(T *data, int num) {
-  for (int i = 0; i < num; i++)
-    data[i] = i;
-}
-
-template <typename T = int>
-void verify_data(T *data, T *expect, int num, int step = 1) {
-  for (int i = 0; i < num; i = i + step) {
-    assert(data[i] == expect[i]);
-  }
-}
-
-void select_from_sub_group1(unsigned int *data, sycl::nd_item<3> item_ct1) {
-  int threadid = item_ct1.get_local_id(2) +
-                 item_ct1.get_local_id(1) * item_ct1.get_local_range(2) +
-                 item_ct1.get_local_id(0) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) +
-                 item_ct1.get_group(2) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) * item_ct1.get_local_range(0);
-  int output = 0;
-  output = syclcompat::select_from_sub_group(item_ct1.get_sub_group(), threadid,
-                                             threadid + 1);
-  data[threadid] = output;
-}
-
-void select_from_sub_group2(unsigned int *data, sycl::nd_item<3> item_ct1) {
-  int threadid = item_ct1.get_local_id(2) +
-                 item_ct1.get_local_id(1) * item_ct1.get_local_range(2) +
-                 item_ct1.get_local_id(0) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) +
-                 item_ct1.get_group(2) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) * item_ct1.get_local_range(0);
-  int output = 0;
-  output = syclcompat::select_from_sub_group(item_ct1.get_sub_group(), threadid,
-                                             threadid + 1, 8);
-  data[threadid] = output;
-}
-
-void test_select_from_sub_group() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-  bool Result = true;
-  int *dev_data = nullptr;
-  unsigned int *dev_data_u = nullptr;
-  sycl::range<3> GridSize(1, 1, 1);
-  sycl::range<3> BlockSize(1, 1, 1);
-  dev_data = sycl::malloc_device<int>(DATA_NUM, *q_ct1);
-  dev_data_u = sycl::malloc_device<unsigned int>(DATA_NUM, *q_ct1);
-  unsigned int host_dev_data_u[DATA_NUM];
-  GridSize = sycl::range<3>(1, 1, 2);
-  BlockSize = sycl::range<3>(1, 2, 32);
-  unsigned int expect1[DATA_NUM] = {
-      1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,
-      16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,
-      31,  0,   33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
-      46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,
-      61,  62,  63,  32,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
-      76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
-      91,  92,  93,  94,  95,  64,  97,  98,  99,  100, 101, 102, 103, 104, 105,
-      106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-      121, 122, 123, 124, 125, 126, 127, 96};
-  init_data<unsigned int>(host_dev_data_u, DATA_NUM);
-  q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  q_ct1->parallel_for(sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
-                      [=](sycl::nd_item<3> item_ct1)
-                          [[sycl::reqd_sub_group_size(32)]] {
-                            select_from_sub_group1(dev_data_u, item_ct1);
-                          });
-
-  dev_ct1.queues_wait_and_throw();
-  q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  verify_data<unsigned int>(host_dev_data_u, expect1, DATA_NUM);
-
-  GridSize = sycl::range<3>(1, 1, 2);
-  BlockSize = sycl::range<3>(1, 2, 32);
-  unsigned int expect2[DATA_NUM] = {
-      1,   2,   3,   4,   5,   6,   7,   0,   9,   10,  11,  12,  13,  14,  15,
-      8,   17,  18,  19,  20,  21,  22,  23,  16,  25,  26,  27,  28,  29,  30,
-      31,  24,  33,  34,  35,  36,  37,  38,  39,  32,  41,  42,  43,  44,  45,
-      46,  47,  40,  49,  50,  51,  52,  53,  54,  55,  48,  57,  58,  59,  60,
-      61,  62,  63,  56,  65,  66,  67,  68,  69,  70,  71,  64,  73,  74,  75,
-      76,  77,  78,  79,  72,  81,  82,  83,  84,  85,  86,  87,  80,  89,  90,
-      91,  92,  93,  94,  95,  88,  97,  98,  99,  100, 101, 102, 103, 96,  105,
-      106, 107, 108, 109, 110, 111, 104, 113, 114, 115, 116, 117, 118, 119, 112,
-      121, 122, 123, 124, 125, 126, 127, 120};
-  init_data<unsigned int>(host_dev_data_u, DATA_NUM);
-  q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  q_ct1->parallel_for(sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
-                      [=](sycl::nd_item<3> item_ct1)
-                          [[sycl::reqd_sub_group_size(32)]] {
-                            select_from_sub_group2(dev_data_u, item_ct1);
-                          });
-
-  dev_ct1.queues_wait_and_throw();
-  q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  verify_data<unsigned int>(host_dev_data_u, expect2, DATA_NUM);
-
-  sycl::free(dev_data, *q_ct1);
-  sycl::free(dev_data_u, *q_ct1);
-}
-
-int main() {
-  test_select_from_sub_group();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_shift_sub_group_left.cpp b/sycl/test-e2e/syclcompat/util/util_shift_sub_group_left.cpp
deleted file mode 100644
index 0fac1ee013d06..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_shift_sub_group_left.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_shift_sub_group_left.cpp
- *
- *  Description:
- *    shift_sub_group_left tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilShiftSubGroupLeft.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// REQUIRES: sg-32
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-#define DATA_NUM 128
-
-template <typename T = int> void init_data(T *data, int num) {
-  for (int i = 0; i < num; i++)
-    data[i] = i;
-}
-
-template <typename T = int>
-void verify_data(T *data, T *expect, int num, int step = 1) {
-  for (int i = 0; i < num; i = i + step) {
-    assert(data[i] == expect[i]);
-  }
-}
-
-void shift_sub_group_left1(unsigned int *data, sycl::nd_item<3> item_ct1) {
-  int threadid = item_ct1.get_local_id(2) +
-                 item_ct1.get_local_id(1) * item_ct1.get_local_range(2) +
-                 item_ct1.get_local_id(0) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) +
-                 item_ct1.get_group(2) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) * item_ct1.get_local_range(0);
-  int output = 0;
-  output =
-      syclcompat::shift_sub_group_left(item_ct1.get_sub_group(), threadid, 1);
-  data[threadid] = output;
-}
-
-void shift_sub_group_left2(unsigned int *data, sycl::nd_item<3> item_ct1) {
-  int threadid = item_ct1.get_local_id(2) +
-                 item_ct1.get_local_id(1) * item_ct1.get_local_range(2) +
-                 item_ct1.get_local_id(0) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) +
-                 item_ct1.get_group(2) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) * item_ct1.get_local_range(0);
-  int output = 0;
-  output = syclcompat::shift_sub_group_left(item_ct1.get_sub_group(), threadid,
-                                            1, 8);
-  data[threadid] = output;
-}
-
-void test_shift_sub_group_left() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-  bool Result = true;
-  int *dev_data = nullptr;
-  unsigned int *dev_data_u = nullptr;
-  sycl::range<3> GridSize(1, 1, 1);
-  sycl::range<3> BlockSize(1, 1, 1);
-  dev_data = sycl::malloc_device<int>(DATA_NUM, *q_ct1);
-  dev_data_u = sycl::malloc_device<unsigned int>(DATA_NUM, *q_ct1);
-  unsigned int host_dev_data_u[DATA_NUM];
-  GridSize = sycl::range<3>(1, 1, 2);
-  BlockSize = sycl::range<3>(1, 2, 32);
-  unsigned int expect1[DATA_NUM] = {
-      1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,  14,  15,
-      16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,
-      31,  31,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
-      46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,
-      61,  62,  63,  63,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,
-      76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
-      91,  92,  93,  94,  95,  95,  97,  98,  99,  100, 101, 102, 103, 104, 105,
-      106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
-      121, 122, 123, 124, 125, 126, 127, 127};
-  init_data<unsigned int>(host_dev_data_u, DATA_NUM);
-  q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  q_ct1->parallel_for(sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
-                      [=](sycl::nd_item<3> item_ct1)
-                          [[sycl::reqd_sub_group_size(32)]] {
-                            shift_sub_group_left1(dev_data_u, item_ct1);
-                          });
-  dev_ct1.queues_wait_and_throw();
-  q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  verify_data<unsigned int>(host_dev_data_u, expect1, DATA_NUM);
-
-  GridSize = sycl::range<3>(1, 1, 2);
-  BlockSize = sycl::range<3>(1, 2, 32);
-  unsigned int expect2[DATA_NUM] = {
-      1,   2,   3,   4,   5,   6,   7,   7,   9,   10,  11,  12,  13,  14,  15,
-      15,  17,  18,  19,  20,  21,  22,  23,  23,  25,  26,  27,  28,  29,  30,
-      31,  31,  33,  34,  35,  36,  37,  38,  39,  39,  41,  42,  43,  44,  45,
-      46,  47,  47,  49,  50,  51,  52,  53,  54,  55,  55,  57,  58,  59,  60,
-      61,  62,  63,  63,  65,  66,  67,  68,  69,  70,  71,  71,  73,  74,  75,
-      76,  77,  78,  79,  79,  81,  82,  83,  84,  85,  86,  87,  87,  89,  90,
-      91,  92,  93,  94,  95,  95,  97,  98,  99,  100, 101, 102, 103, 103, 105,
-      106, 107, 108, 109, 110, 111, 111, 113, 114, 115, 116, 117, 118, 119, 119,
-      121, 122, 123, 124, 125, 126, 127, 127};
-
-  init_data<unsigned int>(host_dev_data_u, DATA_NUM);
-  q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-
-  q_ct1->parallel_for(sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
-                      [=](sycl::nd_item<3> item_ct1)
-                          [[sycl::reqd_sub_group_size(32)]] {
-                            shift_sub_group_left2(dev_data_u, item_ct1);
-                          });
-
-  dev_ct1.queues_wait_and_throw();
-  q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  verify_data<unsigned int>(host_dev_data_u, expect2, DATA_NUM);
-
-  sycl::free(dev_data, *q_ct1);
-  sycl::free(dev_data_u, *q_ct1);
-}
-
-int main() {
-  test_shift_sub_group_left();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_shift_sub_group_right.cpp b/sycl/test-e2e/syclcompat/util/util_shift_sub_group_right.cpp
deleted file mode 100644
index 0dbc985170f03..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_shift_sub_group_right.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  util_shift_sub_group_right.cpp
- *
- *  Description:
- *    shift_sub_group_right tests
- **************************************************************************/
-
-// The original source was under the license below:
-// ====------ UtilShiftSubGroupRight.cpp---------- -*- C++ -* ----===////
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===----------------------------------------------------------------------===//
-
-// REQUIRES: sg-32
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat.hpp>
-
-#define DATA_NUM 128
-
-template <typename T = int> void init_data(T *data, int num) {
-  for (int i = 0; i < num; i++)
-    data[i] = i;
-}
-
-template <typename T = int>
-void verify_data(T *data, T *expect, int num, int step = 1) {
-  for (int i = 0; i < num; i = i + step) {
-    assert(data[i] == expect[i]);
-  }
-}
-
-void shift_sub_group_right1(unsigned int *data, sycl::nd_item<3> item_ct1) {
-  int threadid = item_ct1.get_local_id(2) +
-                 item_ct1.get_local_id(1) * item_ct1.get_local_range(2) +
-                 item_ct1.get_local_id(0) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) +
-                 item_ct1.get_group(2) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) * item_ct1.get_local_range(0);
-  int output = 0;
-  output =
-      syclcompat::shift_sub_group_right(item_ct1.get_sub_group(), threadid, 1);
-  data[threadid] = output;
-}
-
-void shift_sub_group_right2(unsigned int *data, sycl::nd_item<3> item_ct1) {
-  int threadid = item_ct1.get_local_id(2) +
-                 item_ct1.get_local_id(1) * item_ct1.get_local_range(2) +
-                 item_ct1.get_local_id(0) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) +
-                 item_ct1.get_group(2) * item_ct1.get_local_range(2) *
-                     item_ct1.get_local_range(1) * item_ct1.get_local_range(0);
-  int output = 0;
-  output = syclcompat::shift_sub_group_right(item_ct1.get_sub_group(), threadid,
-                                             1, 8);
-  data[threadid] = output;
-}
-
-void test_shift_sub_group_right() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue *q_ct1 = dev_ct1.default_queue();
-  bool Result = true;
-  int *dev_data = nullptr;
-  unsigned int *dev_data_u = nullptr;
-  sycl::range<3> GridSize(1, 1, 1);
-  sycl::range<3> BlockSize(1, 1, 1);
-  dev_data = sycl::malloc_device<int>(DATA_NUM, *q_ct1);
-  dev_data_u = sycl::malloc_device<unsigned int>(DATA_NUM, *q_ct1);
-  unsigned int host_dev_data_u[DATA_NUM];
-  GridSize = sycl::range<3>(1, 1, 2);
-  BlockSize = sycl::range<3>(1, 2, 32);
-  unsigned int expect1[DATA_NUM] = {
-      0,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,   10,  11,  12,  13,
-      14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
-      29,  30,  32,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
-      44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,
-      59,  60,  61,  62,  64,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,
-      74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
-      89,  90,  91,  92,  93,  94,  96,  96,  97,  98,  99,  100, 101, 102, 103,
-      104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
-      119, 120, 121, 122, 123, 124, 125, 126};
-
-  init_data<unsigned int>(host_dev_data_u, DATA_NUM);
-  q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  q_ct1->parallel_for(sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
-                      [=](sycl::nd_item<3> item_ct1)
-                          [[sycl::reqd_sub_group_size(32)]] {
-                            shift_sub_group_right1(dev_data_u, item_ct1);
-                          });
-
-  dev_ct1.queues_wait_and_throw();
-  q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  verify_data<unsigned int>(host_dev_data_u, expect1, DATA_NUM);
-
-  GridSize = sycl::range<3>(1, 1, 2);
-  BlockSize = sycl::range<3>(1, 2, 32);
-  unsigned int expect2[DATA_NUM] = {
-      0,   0,   1,   2,   3,   4,   5,   6,   8,   8,   9,   10,  11,  12,  13,
-      14,  16,  16,  17,  18,  19,  20,  21,  22,  24,  24,  25,  26,  27,  28,
-      29,  30,  32,  32,  33,  34,  35,  36,  37,  38,  40,  40,  41,  42,  43,
-      44,  45,  46,  48,  48,  49,  50,  51,  52,  53,  54,  56,  56,  57,  58,
-      59,  60,  61,  62,  64,  64,  65,  66,  67,  68,  69,  70,  72,  72,  73,
-      74,  75,  76,  77,  78,  80,  80,  81,  82,  83,  84,  85,  86,  88,  88,
-      89,  90,  91,  92,  93,  94,  96,  96,  97,  98,  99,  100, 101, 102, 104,
-      104, 105, 106, 107, 108, 109, 110, 112, 112, 113, 114, 115, 116, 117, 118,
-      120, 120, 121, 122, 123, 124, 125, 126};
-  init_data<unsigned int>(host_dev_data_u, DATA_NUM);
-  q_ct1->memcpy(dev_data_u, host_dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-
-  q_ct1->parallel_for(sycl::nd_range<3>(GridSize * BlockSize, BlockSize),
-                      [=](sycl::nd_item<3> item_ct1)
-                          [[sycl::reqd_sub_group_size(32)]] {
-                            shift_sub_group_right2(dev_data_u, item_ct1);
-                          });
-
-  dev_ct1.queues_wait_and_throw();
-  q_ct1->memcpy(host_dev_data_u, dev_data_u, DATA_NUM * sizeof(unsigned int))
-      .wait();
-  verify_data<unsigned int>(host_dev_data_u, expect2, DATA_NUM);
-
-  sycl::free(dev_data, *q_ct1);
-  sycl::free(dev_data_u, *q_ct1);
-}
-
-int main() {
-  test_shift_sub_group_right();
-
-  return 0;
-}
diff --git a/sycl/test-e2e/syclcompat/util/util_ternary_logic_op_test.cpp b/sycl/test-e2e/syclcompat/util/util_ternary_logic_op_test.cpp
deleted file mode 100644
index 4e252c5df08a5..0000000000000
--- a/sycl/test-e2e/syclcompat/util/util_ternary_logic_op_test.cpp
+++ /dev/null
@@ -1,585 +0,0 @@
-// ====------ util_ternary_logic_op_test.cpp ------------ *- C/C++ -* ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//
-// ===--------------------------------------------------------------------===//
-
-// This file is modified from the code migrated by SYCLomatic.
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <stdlib.h>
-#include <sycl/detail/core.hpp>
-#include <syclcompat/util.hpp>
-
-// clang-format off
-void reference_of_ternary_logic_op(uint32_t &R, uint32_t A, uint32_t B, uint32_t C, uint32_t D) {
-  switch (D) {
-  case 0: R = 0; break;
-  case 1: R = (~A & ~B & ~C); break;
-  case 2: R = (~A & ~B & C); break;
-  case 3: R = (~A & ~B & ~C) | (~A & ~B & C); break;
-  case 4: R = (~A & B & ~C); break;
-  case 5: R = (~A & ~B & ~C) | (~A & B & ~C); break;
-  case 6: R = (~A & ~B & C) | (~A & B & ~C); break;
-  case 7: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C); break;
-  case 8: R = (~A & B & C); break;
-  case 9: R = (~A & ~B & ~C) | (~A & B & C); break;
-  case 10: R = (~A & ~B & C) | (~A & B & C); break;
-  case 11: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C); break;
-  case 12: R = (~A & B & ~C) | (~A & B & C); break;
-  case 13: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C); break;
-  case 14: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C); break;
-  case 15: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C); break;
-  case 16: R = (A & ~B & ~C); break;
-  case 17: R = (~A & ~B & ~C) | (A & ~B & ~C); break;
-  case 18: R = (~A & ~B & C) | (A & ~B & ~C); break;
-  case 19: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C); break;
-  case 20: R = (~A & B & ~C) | (A & ~B & ~C); break;
-  case 21: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C); break;
-  case 22: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C); break;
-  case 23: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C); break;
-  case 24: R = (~A & B & C) | (A & ~B & ~C); break;
-  case 25: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C); break;
-  case 26: R = (A & B | C) ^ A; break;
-  case 27: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C); break;
-  case 28: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C); break;
-  case 29: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C); break;
-  case 30: R = A ^ (B | C); break;
-  case 31: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C); break;
-  case 32: R = (A & ~B & C); break;
-  case 33: R = (~A & ~B & ~C) | (A & ~B & C); break;
-  case 34: R = (~A & ~B & C) | (A & ~B & C); break;
-  case 35: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & C); break;
-  case 36: R = (~A & B & ~C) | (A & ~B & C); break;
-  case 37: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & C); break;
-  case 38: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C); break;
-  case 39: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C); break;
-  case 40: R = (~A & B & C) | (A & ~B & C); break;
-  case 41: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & C); break;
-  case 42: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & C); break;
-  case 43: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & C); break;
-  case 44: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & C); break;
-  case 45: R = ~A ^ (~B & C); break;
-  case 46: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C); break;
-  case 47: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C); break;
-  case 48: R = (A & ~B & ~C) | (A & ~B & C); break;
-  case 49: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 50: R = (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 51: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 52: R = (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 53: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 54: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 55: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 56: R = (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 57: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 58: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 59: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 60: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 61: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 62: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 63: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C); break;
-  case 64: R = A & B & ~C; break;
-  case 65: R = (~A & ~B & ~C) | (A & B & ~C); break;
-  case 66: R = (~A & ~B & C) | (A & B & ~C); break;
-  case 67: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & B & ~C); break;
-  case 68: R = (~A & B & ~C) | (A & B & ~C); break;
-  case 69: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & B & ~C); break;
-  case 70: R = (~A & ~B & C) | (~A & B & ~C) | (A & B & ~C); break;
-  case 71: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & B & ~C); break;
-  case 72: R = (~A & B & C) | (A & B & ~C); break;
-  case 73: R = (~A & ~B & ~C) | (~A & B & C) | (A & B & ~C); break;
-  case 74: R = (~A & ~B & C) | (~A & B & C) | (A & B & ~C); break;
-  case 75: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & B & ~C); break;
-  case 76: R = (~A & B & ~C) | (~A & B & C) | (A & B & ~C); break;
-  case 77: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C); break;
-  case 78: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C); break;
-  case 79: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C); break;
-  case 80: R = (A & ~B & ~C) | (A & B & ~C); break;
-  case 81: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 82: R = (~A & ~B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 83: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 84: R = (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 85: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 86: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 87: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 88: R = (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 89: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 90: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 91: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 92: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 93: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 94: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 95: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C); break;
-  case 96: R = (A & ~B & C) | (A & B & ~C); break;
-  case 97: R = (~A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 98: R = (~A & ~B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 99: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 100: R = (~A & B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 101: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 102: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 103: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 104: R = (~A & B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 105: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 106: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 107: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 108: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 109: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 110: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 111: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C); break;
-  case 112: R = (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 113: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 114: R = (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 115: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 116: R = (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 117: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 118: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 119: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 120: R = A ^ (B & C); break;
-  case 121: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 122: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 123: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 124: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 125: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 126: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 127: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C); break;
-  case 128: R = A & B & C; break;
-  case 129: R = (~A & ~B & ~C) | (A & B & C); break;
-  case 130: R = (~A & ~B & C) | (A & B & C); break;
-  case 131: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & B & C); break;
-  case 132: R = (~A & B & ~C) | (A & B & C); break;
-  case 133: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & B & C); break;
-  case 134: R = (~A & ~B & C) | (~A & B & ~C) | (A & B & C); break;
-  case 135: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & B & C); break;
-  case 136: R = (~A & B & C) | (A & B & C); break;
-  case 137: R = (~A & ~B & ~C) | (~A & B & C) | (A & B & C); break;
-  case 138: R = (~A & ~B & C) | (~A & B & C) | (A & B & C); break;
-  case 139: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & B & C); break;
-  case 140: R = (~A & B & ~C) | (~A & B & C) | (A & B & C); break;
-  case 141: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & B & C); break;
-  case 142: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & C); break;
-  case 143: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & C); break;
-  case 144: R = (A & ~B & ~C) | (A & B & C); break;
-  case 145: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & B & C); break;
-  case 146: R = (~A & ~B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 147: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 148: R = (~A & B & ~C) | (A & ~B & ~C) | (A & B & C); break;
-  case 149: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & C); break;
-  case 150: R = A ^ B ^ C; break;
-  case 151: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & C); break;
-  case 152: R = (~A & B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 153: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 154: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 155: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 156: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 157: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 158: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 159: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & C); break;
-  case 160: R = (A & ~B & C) | (A & B & C); break;
-  case 161: R = (~A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 162: R = (~A & ~B & C) | (A & ~B & C) | (A & B & C); break;
-  case 163: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & C) | (A & B & C); break;
-  case 164: R = (~A & B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 165: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 166: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 167: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 168: R = (~A & B & C) | (A & ~B & C) | (A & B & C); break;
-  case 169: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break;
-  case 170: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break;
-  case 171: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break;
-  case 172: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break;
-  case 173: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break;
-  case 174: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break;
-  case 175: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & C); break;
-  case 176: R = (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 177: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 178: R = (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 179: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 180: R = A ^ (B & ~C); break;
-  case 181: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 182: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 183: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 184: R = (A ^ (B & (C ^ A))); break;
-  case 185: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 186: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 187: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 188: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 189: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 190: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 191: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & C); break;
-  case 192: R = (A & B & ~C) | (A & B & C); break;
-  case 193: R = (~A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 194: R = (~A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 195: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 196: R = (~A & B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 197: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 198: R = (~A & ~B & C) | (~A & B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 199: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 200: R = (~A & B & C) | (A & B & ~C) | (A & B & C); break;
-  case 201: R = (~A & ~B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break;
-  case 202: R = (~A & ~B & C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break;
-  case 203: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break;
-  case 204: R = (~A & B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break;
-  case 205: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break;
-  case 206: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break;
-  case 207: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & B & ~C) | (A & B & C); break;
-  case 208: R = (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 209: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 210: R = A ^ (~B & C); break;
-  case 211: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 212: R = (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 213: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 214: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 215: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 216: R = (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 217: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 218: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 219: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 220: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 221: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 222: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 223: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & B & ~C) | (A & B & C); break;
-  case 224: R = (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 225: R = (~A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 226: R = (~A & ~B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 227: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 228: R = (~A & B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 229: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 230: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 231: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 232: R = ((A & (B | C)) | (B & C)); break;
-  case 233: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 234: R = (A & B) | C; break;
-  case 235: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 236: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 237: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 238: R = (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 239: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 240: R = (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 241: R = (~A & ~B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 242: R = (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 243: R = (~A & ~B & ~C) | (~A & ~B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 244: R = (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 245: R = (~A & ~B & ~C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 246: R = (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 247: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & ~C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 248: R = (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 249: R = (~A & ~B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 250: R = (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 251: R = (~A & ~B & ~C) | (~A & ~B & C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 252: R = (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 253: R = (~A & ~B & ~C) | (~A & B & ~C) | (~A & B & C) | (A & ~B & ~C) | (A & ~B & C) | (A & B & ~C) | (A & B & C); break;
-  case 254: R = A | B | C; break;
-  case 255: R = uint32_t(-1); break;
-  default: break;
-  }
-}
-
-void asm_ternary_logic_op(uint32_t &R, uint32_t A, uint32_t B, uint32_t C, uint32_t D) {
-  switch (D) {
-  case 0: R = syclcompat::ternary_logic_op(A, B, C, 0x0); break;
-  case 1: R = syclcompat::ternary_logic_op(A, B, C, 0x1); break;
-  case 2: R = syclcompat::ternary_logic_op(A, B, C, 0x2); break;
-  case 3: R = syclcompat::ternary_logic_op(A, B, C, 0x3); break;
-  case 4: R = syclcompat::ternary_logic_op(A, B, C, 0x4); break;
-  case 5: R = syclcompat::ternary_logic_op(A, B, C, 0x5); break;
-  case 6: R = syclcompat::ternary_logic_op(A, B, C, 0x6); break;
-  case 7: R = syclcompat::ternary_logic_op(A, B, C, 0x7); break;
-  case 8: R = syclcompat::ternary_logic_op(A, B, C, 0x8); break;
-  case 9: R = syclcompat::ternary_logic_op(A, B, C, 0x9); break;
-  case 10: R = syclcompat::ternary_logic_op(A, B, C, 0xA); break;
-  case 11: R = syclcompat::ternary_logic_op(A, B, C, 0xB); break;
-  case 12: R = syclcompat::ternary_logic_op(A, B, C, 0xC); break;
-  case 13: R = syclcompat::ternary_logic_op(A, B, C, 0xD); break;
-  case 14: R = syclcompat::ternary_logic_op(A, B, C, 0xE); break;
-  case 15: R = syclcompat::ternary_logic_op(A, B, C, 0xF); break;
-  case 16: R = syclcompat::ternary_logic_op(A, B, C, 0x10); break;
-  case 17: R = syclcompat::ternary_logic_op(A, B, C, 0x11); break;
-  case 18: R = syclcompat::ternary_logic_op(A, B, C, 0x12); break;
-  case 19: R = syclcompat::ternary_logic_op(A, B, C, 0x13); break;
-  case 20: R = syclcompat::ternary_logic_op(A, B, C, 0x14); break;
-  case 21: R = syclcompat::ternary_logic_op(A, B, C, 0x15); break;
-  case 22: R = syclcompat::ternary_logic_op(A, B, C, 0x16); break;
-  case 23: R = syclcompat::ternary_logic_op(A, B, C, 0x17); break;
-  case 24: R = syclcompat::ternary_logic_op(A, B, C, 0x18); break;
-  case 25: R = syclcompat::ternary_logic_op(A, B, C, 0x19); break;
-  case 26: R = syclcompat::ternary_logic_op(A, B, C, 0x1A); break;
-  case 27: R = syclcompat::ternary_logic_op(A, B, C, 0x1B); break;
-  case 28: R = syclcompat::ternary_logic_op(A, B, C, 0x1C); break;
-  case 29: R = syclcompat::ternary_logic_op(A, B, C, 0x1D); break;
-  case 30: R = syclcompat::ternary_logic_op(A, B, C, 0x1E); break;
-  case 31: R = syclcompat::ternary_logic_op(A, B, C, 0x1F); break;
-  case 32: R = syclcompat::ternary_logic_op(A, B, C, 0x20); break;
-  case 33: R = syclcompat::ternary_logic_op(A, B, C, 0x21); break;
-  case 34: R = syclcompat::ternary_logic_op(A, B, C, 0x22); break;
-  case 35: R = syclcompat::ternary_logic_op(A, B, C, 0x23); break;
-  case 36: R = syclcompat::ternary_logic_op(A, B, C, 0x24); break;
-  case 37: R = syclcompat::ternary_logic_op(A, B, C, 0x25); break;
-  case 38: R = syclcompat::ternary_logic_op(A, B, C, 0x26); break;
-  case 39: R = syclcompat::ternary_logic_op(A, B, C, 0x27); break;
-  case 40: R = syclcompat::ternary_logic_op(A, B, C, 0x28); break;
-  case 41: R = syclcompat::ternary_logic_op(A, B, C, 0x29); break;
-  case 42: R = syclcompat::ternary_logic_op(A, B, C, 0x2A); break;
-  case 43: R = syclcompat::ternary_logic_op(A, B, C, 0x2B); break;
-  case 44: R = syclcompat::ternary_logic_op(A, B, C, 0x2C); break;
-  case 45: R = syclcompat::ternary_logic_op(A, B, C, 0x2D); break;
-  case 46: R = syclcompat::ternary_logic_op(A, B, C, 0x2E); break;
-  case 47: R = syclcompat::ternary_logic_op(A, B, C, 0x2F); break;
-  case 48: R = syclcompat::ternary_logic_op(A, B, C, 0x30); break;
-  case 49: R = syclcompat::ternary_logic_op(A, B, C, 0x31); break;
-  case 50: R = syclcompat::ternary_logic_op(A, B, C, 0x32); break;
-  case 51: R = syclcompat::ternary_logic_op(A, B, C, 0x33); break;
-  case 52: R = syclcompat::ternary_logic_op(A, B, C, 0x34); break;
-  case 53: R = syclcompat::ternary_logic_op(A, B, C, 0x35); break;
-  case 54: R = syclcompat::ternary_logic_op(A, B, C, 0x36); break;
-  case 55: R = syclcompat::ternary_logic_op(A, B, C, 0x37); break;
-  case 56: R = syclcompat::ternary_logic_op(A, B, C, 0x38); break;
-  case 57: R = syclcompat::ternary_logic_op(A, B, C, 0x39); break;
-  case 58: R = syclcompat::ternary_logic_op(A, B, C, 0x3A); break;
-  case 59: R = syclcompat::ternary_logic_op(A, B, C, 0x3B); break;
-  case 60: R = syclcompat::ternary_logic_op(A, B, C, 0x3C); break;
-  case 61: R = syclcompat::ternary_logic_op(A, B, C, 0x3D); break;
-  case 62: R = syclcompat::ternary_logic_op(A, B, C, 0x3E); break;
-  case 63: R = syclcompat::ternary_logic_op(A, B, C, 0x3F); break;
-  case 64: R = syclcompat::ternary_logic_op(A, B, C, 0x40); break;
-  case 65: R = syclcompat::ternary_logic_op(A, B, C, 0x41); break;
-  case 66: R = syclcompat::ternary_logic_op(A, B, C, 0x42); break;
-  case 67: R = syclcompat::ternary_logic_op(A, B, C, 0x43); break;
-  case 68: R = syclcompat::ternary_logic_op(A, B, C, 0x44); break;
-  case 69: R = syclcompat::ternary_logic_op(A, B, C, 0x45); break;
-  case 70: R = syclcompat::ternary_logic_op(A, B, C, 0x46); break;
-  case 71: R = syclcompat::ternary_logic_op(A, B, C, 0x47); break;
-  case 72: R = syclcompat::ternary_logic_op(A, B, C, 0x48); break;
-  case 73: R = syclcompat::ternary_logic_op(A, B, C, 0x49); break;
-  case 74: R = syclcompat::ternary_logic_op(A, B, C, 0x4A); break;
-  case 75: R = syclcompat::ternary_logic_op(A, B, C, 0x4B); break;
-  case 76: R = syclcompat::ternary_logic_op(A, B, C, 0x4C); break;
-  case 77: R = syclcompat::ternary_logic_op(A, B, C, 0x4D); break;
-  case 78: R = syclcompat::ternary_logic_op(A, B, C, 0x4E); break;
-  case 79: R = syclcompat::ternary_logic_op(A, B, C, 0x4F); break;
-  case 80: R = syclcompat::ternary_logic_op(A, B, C, 0x50); break;
-  case 81: R = syclcompat::ternary_logic_op(A, B, C, 0x51); break;
-  case 82: R = syclcompat::ternary_logic_op(A, B, C, 0x52); break;
-  case 83: R = syclcompat::ternary_logic_op(A, B, C, 0x53); break;
-  case 84: R = syclcompat::ternary_logic_op(A, B, C, 0x54); break;
-  case 85: R = syclcompat::ternary_logic_op(A, B, C, 0x55); break;
-  case 86: R = syclcompat::ternary_logic_op(A, B, C, 0x56); break;
-  case 87: R = syclcompat::ternary_logic_op(A, B, C, 0x57); break;
-  case 88: R = syclcompat::ternary_logic_op(A, B, C, 0x58); break;
-  case 89: R = syclcompat::ternary_logic_op(A, B, C, 0x59); break;
-  case 90: R = syclcompat::ternary_logic_op(A, B, C, 0x5A); break;
-  case 91: R = syclcompat::ternary_logic_op(A, B, C, 0x5B); break;
-  case 92: R = syclcompat::ternary_logic_op(A, B, C, 0x5C); break;
-  case 93: R = syclcompat::ternary_logic_op(A, B, C, 0x5D); break;
-  case 94: R = syclcompat::ternary_logic_op(A, B, C, 0x5E); break;
-  case 95: R = syclcompat::ternary_logic_op(A, B, C, 0x5F); break;
-  case 96: R = syclcompat::ternary_logic_op(A, B, C, 0x60); break;
-  case 97: R = syclcompat::ternary_logic_op(A, B, C, 0x61); break;
-  case 98: R = syclcompat::ternary_logic_op(A, B, C, 0x62); break;
-  case 99: R = syclcompat::ternary_logic_op(A, B, C, 0x63); break;
-  case 100: R = syclcompat::ternary_logic_op(A, B, C, 0x64); break;
-  case 101: R = syclcompat::ternary_logic_op(A, B, C, 0x65); break;
-  case 102: R = syclcompat::ternary_logic_op(A, B, C, 0x66); break;
-  case 103: R = syclcompat::ternary_logic_op(A, B, C, 0x67); break;
-  case 104: R = syclcompat::ternary_logic_op(A, B, C, 0x68); break;
-  case 105: R = syclcompat::ternary_logic_op(A, B, C, 0x69); break;
-  case 106: R = syclcompat::ternary_logic_op(A, B, C, 0x6A); break;
-  case 107: R = syclcompat::ternary_logic_op(A, B, C, 0x6B); break;
-  case 108: R = syclcompat::ternary_logic_op(A, B, C, 0x6C); break;
-  case 109: R = syclcompat::ternary_logic_op(A, B, C, 0x6D); break;
-  case 110: R = syclcompat::ternary_logic_op(A, B, C, 0x6E); break;
-  case 111: R = syclcompat::ternary_logic_op(A, B, C, 0x6F); break;
-  case 112: R = syclcompat::ternary_logic_op(A, B, C, 0x70); break;
-  case 113: R = syclcompat::ternary_logic_op(A, B, C, 0x71); break;
-  case 114: R = syclcompat::ternary_logic_op(A, B, C, 0x72); break;
-  case 115: R = syclcompat::ternary_logic_op(A, B, C, 0x73); break;
-  case 116: R = syclcompat::ternary_logic_op(A, B, C, 0x74); break;
-  case 117: R = syclcompat::ternary_logic_op(A, B, C, 0x75); break;
-  case 118: R = syclcompat::ternary_logic_op(A, B, C, 0x76); break;
-  case 119: R = syclcompat::ternary_logic_op(A, B, C, 0x77); break;
-  case 120: R = syclcompat::ternary_logic_op(A, B, C, 0x78); break;
-  case 121: R = syclcompat::ternary_logic_op(A, B, C, 0x79); break;
-  case 122: R = syclcompat::ternary_logic_op(A, B, C, 0x7A); break;
-  case 123: R = syclcompat::ternary_logic_op(A, B, C, 0x7B); break;
-  case 124: R = syclcompat::ternary_logic_op(A, B, C, 0x7C); break;
-  case 125: R = syclcompat::ternary_logic_op(A, B, C, 0x7D); break;
-  case 126: R = syclcompat::ternary_logic_op(A, B, C, 0x7E); break;
-  case 127: R = syclcompat::ternary_logic_op(A, B, C, 0x7F); break;
-  case 128: R = syclcompat::ternary_logic_op(A, B, C, 0x80); break;
-  case 129: R = syclcompat::ternary_logic_op(A, B, C, 0x81); break;
-  case 130: R = syclcompat::ternary_logic_op(A, B, C, 0x82); break;
-  case 131: R = syclcompat::ternary_logic_op(A, B, C, 0x83); break;
-  case 132: R = syclcompat::ternary_logic_op(A, B, C, 0x84); break;
-  case 133: R = syclcompat::ternary_logic_op(A, B, C, 0x85); break;
-  case 134: R = syclcompat::ternary_logic_op(A, B, C, 0x86); break;
-  case 135: R = syclcompat::ternary_logic_op(A, B, C, 0x87); break;
-  case 136: R = syclcompat::ternary_logic_op(A, B, C, 0x88); break;
-  case 137: R = syclcompat::ternary_logic_op(A, B, C, 0x89); break;
-  case 138: R = syclcompat::ternary_logic_op(A, B, C, 0x8A); break;
-  case 139: R = syclcompat::ternary_logic_op(A, B, C, 0x8B); break;
-  case 140: R = syclcompat::ternary_logic_op(A, B, C, 0x8C); break;
-  case 141: R = syclcompat::ternary_logic_op(A, B, C, 0x8D); break;
-  case 142: R = syclcompat::ternary_logic_op(A, B, C, 0x8E); break;
-  case 143: R = syclcompat::ternary_logic_op(A, B, C, 0x8F); break;
-  case 144: R = syclcompat::ternary_logic_op(A, B, C, 0x90); break;
-  case 145: R = syclcompat::ternary_logic_op(A, B, C, 0x91); break;
-  case 146: R = syclcompat::ternary_logic_op(A, B, C, 0x92); break;
-  case 147: R = syclcompat::ternary_logic_op(A, B, C, 0x93); break;
-  case 148: R = syclcompat::ternary_logic_op(A, B, C, 0x94); break;
-  case 149: R = syclcompat::ternary_logic_op(A, B, C, 0x95); break;
-  case 150: R = syclcompat::ternary_logic_op(A, B, C, 0x96); break;
-  case 151: R = syclcompat::ternary_logic_op(A, B, C, 0x97); break;
-  case 152: R = syclcompat::ternary_logic_op(A, B, C, 0x98); break;
-  case 153: R = syclcompat::ternary_logic_op(A, B, C, 0x99); break;
-  case 154: R = syclcompat::ternary_logic_op(A, B, C, 0x9A); break;
-  case 155: R = syclcompat::ternary_logic_op(A, B, C, 0x9B); break;
-  case 156: R = syclcompat::ternary_logic_op(A, B, C, 0x9C); break;
-  case 157: R = syclcompat::ternary_logic_op(A, B, C, 0x9D); break;
-  case 158: R = syclcompat::ternary_logic_op(A, B, C, 0x9E); break;
-  case 159: R = syclcompat::ternary_logic_op(A, B, C, 0x9F); break;
-  case 160: R = syclcompat::ternary_logic_op(A, B, C, 0xA0); break;
-  case 161: R = syclcompat::ternary_logic_op(A, B, C, 0xA1); break;
-  case 162: R = syclcompat::ternary_logic_op(A, B, C, 0xA2); break;
-  case 163: R = syclcompat::ternary_logic_op(A, B, C, 0xA3); break;
-  case 164: R = syclcompat::ternary_logic_op(A, B, C, 0xA4); break;
-  case 165: R = syclcompat::ternary_logic_op(A, B, C, 0xA5); break;
-  case 166: R = syclcompat::ternary_logic_op(A, B, C, 0xA6); break;
-  case 167: R = syclcompat::ternary_logic_op(A, B, C, 0xA7); break;
-  case 168: R = syclcompat::ternary_logic_op(A, B, C, 0xA8); break;
-  case 169: R = syclcompat::ternary_logic_op(A, B, C, 0xA9); break;
-  case 170: R = syclcompat::ternary_logic_op(A, B, C, 0xAA); break;
-  case 171: R = syclcompat::ternary_logic_op(A, B, C, 0xAB); break;
-  case 172: R = syclcompat::ternary_logic_op(A, B, C, 0xAC); break;
-  case 173: R = syclcompat::ternary_logic_op(A, B, C, 0xAD); break;
-  case 174: R = syclcompat::ternary_logic_op(A, B, C, 0xAE); break;
-  case 175: R = syclcompat::ternary_logic_op(A, B, C, 0xAF); break;
-  case 176: R = syclcompat::ternary_logic_op(A, B, C, 0xB0); break;
-  case 177: R = syclcompat::ternary_logic_op(A, B, C, 0xB1); break;
-  case 178: R = syclcompat::ternary_logic_op(A, B, C, 0xB2); break;
-  case 179: R = syclcompat::ternary_logic_op(A, B, C, 0xB3); break;
-  case 180: R = syclcompat::ternary_logic_op(A, B, C, 0xB4); break;
-  case 181: R = syclcompat::ternary_logic_op(A, B, C, 0xB5); break;
-  case 182: R = syclcompat::ternary_logic_op(A, B, C, 0xB6); break;
-  case 183: R = syclcompat::ternary_logic_op(A, B, C, 0xB7); break;
-  case 184: R = syclcompat::ternary_logic_op(A, B, C, 0xB8); break;
-  case 185: R = syclcompat::ternary_logic_op(A, B, C, 0xB9); break;
-  case 186: R = syclcompat::ternary_logic_op(A, B, C, 0xBA); break;
-  case 187: R = syclcompat::ternary_logic_op(A, B, C, 0xBB); break;
-  case 188: R = syclcompat::ternary_logic_op(A, B, C, 0xBC); break;
-  case 189: R = syclcompat::ternary_logic_op(A, B, C, 0xBD); break;
-  case 190: R = syclcompat::ternary_logic_op(A, B, C, 0xBE); break;
-  case 191: R = syclcompat::ternary_logic_op(A, B, C, 0xBF); break;
-  case 192: R = syclcompat::ternary_logic_op(A, B, C, 0xC0); break;
-  case 193: R = syclcompat::ternary_logic_op(A, B, C, 0xC1); break;
-  case 194: R = syclcompat::ternary_logic_op(A, B, C, 0xC2); break;
-  case 195: R = syclcompat::ternary_logic_op(A, B, C, 0xC3); break;
-  case 196: R = syclcompat::ternary_logic_op(A, B, C, 0xC4); break;
-  case 197: R = syclcompat::ternary_logic_op(A, B, C, 0xC5); break;
-  case 198: R = syclcompat::ternary_logic_op(A, B, C, 0xC6); break;
-  case 199: R = syclcompat::ternary_logic_op(A, B, C, 0xC7); break;
-  case 200: R = syclcompat::ternary_logic_op(A, B, C, 0xC8); break;
-  case 201: R = syclcompat::ternary_logic_op(A, B, C, 0xC9); break;
-  case 202: R = syclcompat::ternary_logic_op(A, B, C, 0xCA); break;
-  case 203: R = syclcompat::ternary_logic_op(A, B, C, 0xCB); break;
-  case 204: R = syclcompat::ternary_logic_op(A, B, C, 0xCC); break;
-  case 205: R = syclcompat::ternary_logic_op(A, B, C, 0xCD); break;
-  case 206: R = syclcompat::ternary_logic_op(A, B, C, 0xCE); break;
-  case 207: R = syclcompat::ternary_logic_op(A, B, C, 0xCF); break;
-  case 208: R = syclcompat::ternary_logic_op(A, B, C, 0xD0); break;
-  case 209: R = syclcompat::ternary_logic_op(A, B, C, 0xD1); break;
-  case 210: R = syclcompat::ternary_logic_op(A, B, C, 0xD2); break;
-  case 211: R = syclcompat::ternary_logic_op(A, B, C, 0xD3); break;
-  case 212: R = syclcompat::ternary_logic_op(A, B, C, 0xD4); break;
-  case 213: R = syclcompat::ternary_logic_op(A, B, C, 0xD5); break;
-  case 214: R = syclcompat::ternary_logic_op(A, B, C, 0xD6); break;
-  case 215: R = syclcompat::ternary_logic_op(A, B, C, 0xD7); break;
-  case 216: R = syclcompat::ternary_logic_op(A, B, C, 0xD8); break;
-  case 217: R = syclcompat::ternary_logic_op(A, B, C, 0xD9); break;
-  case 218: R = syclcompat::ternary_logic_op(A, B, C, 0xDA); break;
-  case 219: R = syclcompat::ternary_logic_op(A, B, C, 0xDB); break;
-  case 220: R = syclcompat::ternary_logic_op(A, B, C, 0xDC); break;
-  case 221: R = syclcompat::ternary_logic_op(A, B, C, 0xDD); break;
-  case 222: R = syclcompat::ternary_logic_op(A, B, C, 0xDE); break;
-  case 223: R = syclcompat::ternary_logic_op(A, B, C, 0xDF); break;
-  case 224: R = syclcompat::ternary_logic_op(A, B, C, 0xE0); break;
-  case 225: R = syclcompat::ternary_logic_op(A, B, C, 0xE1); break;
-  case 226: R = syclcompat::ternary_logic_op(A, B, C, 0xE2); break;
-  case 227: R = syclcompat::ternary_logic_op(A, B, C, 0xE3); break;
-  case 228: R = syclcompat::ternary_logic_op(A, B, C, 0xE4); break;
-  case 229: R = syclcompat::ternary_logic_op(A, B, C, 0xE5); break;
-  case 230: R = syclcompat::ternary_logic_op(A, B, C, 0xE6); break;
-  case 231: R = syclcompat::ternary_logic_op(A, B, C, 0xE7); break;
-  case 232: R = syclcompat::ternary_logic_op(A, B, C, 0xE8); break;
-  case 233: R = syclcompat::ternary_logic_op(A, B, C, 0xE9); break;
-  case 234: R = syclcompat::ternary_logic_op(A, B, C, 0xEA); break;
-  case 235: R = syclcompat::ternary_logic_op(A, B, C, 0xEB); break;
-  case 236: R = syclcompat::ternary_logic_op(A, B, C, 0xEC); break;
-  case 237: R = syclcompat::ternary_logic_op(A, B, C, 0xED); break;
-  case 238: R = syclcompat::ternary_logic_op(A, B, C, 0xEE); break;
-  case 239: R = syclcompat::ternary_logic_op(A, B, C, 0xEF); break;
-  case 240: R = syclcompat::ternary_logic_op(A, B, C, 0xF0); break;
-  case 241: R = syclcompat::ternary_logic_op(A, B, C, 0xF1); break;
-  case 242: R = syclcompat::ternary_logic_op(A, B, C, 0xF2); break;
-  case 243: R = syclcompat::ternary_logic_op(A, B, C, 0xF3); break;
-  case 244: R = syclcompat::ternary_logic_op(A, B, C, 0xF4); break;
-  case 245: R = syclcompat::ternary_logic_op(A, B, C, 0xF5); break;
-  case 246: R = syclcompat::ternary_logic_op(A, B, C, 0xF6); break;
-  case 247: R = syclcompat::ternary_logic_op(A, B, C, 0xF7); break;
-  case 248: R = syclcompat::ternary_logic_op(A, B, C, 0xF8); break;
-  case 249: R = syclcompat::ternary_logic_op(A, B, C, 0xF9); break;
-  case 250: R = syclcompat::ternary_logic_op(A, B, C, 0xFA); break;
-  case 251: R = syclcompat::ternary_logic_op(A, B, C, 0xFB); break;
-  case 252: R = syclcompat::ternary_logic_op(A, B, C, 0xFC); break;
-  case 253: R = syclcompat::ternary_logic_op(A, B, C, 0xFD); break;
-  case 254: R = syclcompat::ternary_logic_op(A, B, C, 0xFE); break;
-  case 255: R = syclcompat::ternary_logic_op(A, B, C, 0xFF); break;
-  }
-}
-
-// clang-format on
-
-void ternary_logic_op(int *ec) {
-  uint32_t X, Y, A = 1, B = 2, C = 3, D;
-  for (D = 0; D < 256; ++D) {
-    reference_of_ternary_logic_op(X, A, B, C, D);
-    asm_ternary_logic_op(Y, A, B, C, D);
-    if (X != Y) {
-      *ec = D;
-      return;
-    }
-  }
-  *ec = 0;
-}
-
-int main() {
-  syclcompat::device_ext &dev_ct1 = syclcompat::get_current_device();
-  sycl::queue &q_ct1 = *dev_ct1.default_queue();
-  int ret = 0;
-  int *d_ec = nullptr;
-  d_ec = sycl::malloc_device<int>(1, q_ct1);
-
-  auto wait_and_check = [&](const char *case_name) {
-    syclcompat::get_current_device().queues_wait_and_throw();
-    int ec = 0;
-    syclcompat::get_default_queue().memcpy(&ec, d_ec, sizeof(int)).wait();
-    if (ec != 0)
-      printf("Test %s failed: return code = %d\n", case_name, ec);
-    ret = ret || ec;
-  };
-
-  q_ct1.parallel_for(
-      sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
-      [=](sycl::nd_item<3> item_ct1) { ternary_logic_op(d_ec); });
-  wait_and_check("ternary_logic_op");
-
-  syclcompat::wait_and_free(d_ec, q_ct1);
-
-  return ret;
-}
diff --git a/sycl/test/check_device_code/syclcompat_local_mem.cpp b/sycl/test/check_device_code/syclcompat_local_mem.cpp
deleted file mode 100644
index e9c2bd320bff4..0000000000000
--- a/sycl/test/check_device_code/syclcompat_local_mem.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm %s -o - | FileCheck %s
-
-// The test checks that multiple calls to the same template instantiation of
-// syclcompat local_mem function result in separate allocations.
-
-// CHECK: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
-// CHECK-NEXT: @WGLocalMem{{.*}} = internal addrspace(3) global [4 x i8] poison, align 4
-
-#include <sycl/detail/core.hpp>
-#include <syclcompat/memory.hpp>
-
-using namespace sycl;
-
-int main() {
-  queue Q;
-
-  int **Out = malloc_shared<int *>(2, Q);
-
-  Q.submit([&](handler &Cgh) {
-    Cgh.parallel_for(nd_range<1>({1}, {1}), [=](nd_item<1> Item) {
-      auto Ptr0 = syclcompat::local_mem<int[1]>();
-      auto Ptr1 = syclcompat::local_mem<int[1]>();
-      Out[0] = Ptr0;
-      Out[1] = Ptr1;
-    });
-  });
-}
diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
index 1a93a6ad8db61..f4a2f27905f2e 100644
--- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
+++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp
@@ -54,7 +54,7 @@
 // tests to match the required format and in that case you should just update
 // (i.e. reduce) the number and the list below.
 //
-// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 149
+// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 142
 //
 // List of improperly UNSUPPORTED tests.
 // Remove the CHECK once the test has been properly UNSUPPORTED.
@@ -201,10 +201,3 @@
 // CHECK-NEXT: Tracing/usm/queue_single_task_released_pointer.cpp
 // CHECK-NEXT: USM/badmalloc.cpp
 // CHECK-NEXT: USM/pointer_query_descendent_device.cpp
-// CHECK-NEXT: syclcompat/atomic/atomic_arith.cpp
-// CHECK-NEXT: syclcompat/atomic/atomic_bitwise.cpp
-// CHECK-NEXT: syclcompat/atomic/atomic_class.cpp
-// CHECK-NEXT: syclcompat/atomic/atomic_comp_exchange.cpp
-// CHECK-NEXT: syclcompat/atomic/atomic_memory_acq_rel.cpp
-// CHECK-NEXT: syclcompat/atomic/atomic_minmax.cpp
-// CHECK-NEXT: syclcompat/kernel/kernel_lin.cpp
diff --git a/sycl/test/syclcompat/launch/kernel_properties.cpp b/sycl/test/syclcompat/launch/kernel_properties.cpp
deleted file mode 100644
index 6beefce73d14b..0000000000000
--- a/sycl/test/syclcompat/launch/kernel_properties.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  kernel_properties.cpp
- *
- *  Description:
- *     launch<F> with kernel_properties tests
- **************************************************************************/
-
-// We need hardware which can support at least 2 sub-group sizes, since that
-// hardware (presumably) supports the `intel_reqd_sub_group_size` attribute.
-// REQUIRES: sg-32 && sg-16
-// RUN: %clangxx -fsycl -fsycl-device-only -Xclang -fsycl-is-device %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s
-#include <sycl/ext/oneapi/kernel_properties/properties.hpp>
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/properties/properties.hpp>
-
-#include <syclcompat/launch.hpp>
-
-namespace compat_exp = syclcompat::experimental;
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-
-// Dummy kernel function for testing
-inline void empty_kernel_1(){};
-inline void empty_kernel_2(){};
-
-// Set `sub_group_size` property for kernel & check it becomes attribute
-// `reqd_sub_group_size`
-int test_kernel_properties() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  compat_exp::kernel_properties my_k_props{sycl_exp::sub_group_size<32>};
-  compat_exp::launch_policy my_config(sycl::nd_range<1>{{32}, {32}},
-                                      my_k_props);
-  compat_exp::launch<empty_kernel_1>(my_config);
-
-  //CHECK: {{define.*kernel.*empty_kernel_1.* !intel_reqd_sub_group_size !}}
-  return 0;
-}
-
-// Negative test for previous test
-int test_no_kernel_properties() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-  compat_exp::launch_policy my_config(sycl::nd_range<1>{{32}, {32}});
-  compat_exp::launch<empty_kernel_2>(my_config);
-
-  //CHECK-NOT: {{define.*kernel.*empty_kernel_2.* !intel_reqd_sub_group_size !}}
-  return 0;
-}
diff --git a/sycl/test/syclcompat/launch/launch_inlining.cpp b/sycl/test/syclcompat/launch/launch_inlining.cpp
deleted file mode 100644
index a224837139a56..0000000000000
--- a/sycl/test/syclcompat/launch/launch_inlining.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  launch_inlining.cpp
- *
- *  Description:
- *    Ensure kernels are inlined
- **************************************************************************/
-// RUN: %clangxx -fsycl -fgpu-inline-threshold=0 %if cl_options %{/clang:-S /clang:-emit-llvm%} %else %{-S -emit-llvm%} %s -o - | FileCheck %s
-// We set -fgpu-inline-threshold=0 to disable heuristic inlining for the
-// purposes of the test
-#include <sycl/detail/core.hpp>
-#include <sycl/group_barrier.hpp>
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-namespace compat_exp = syclcompat::experimental;
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-namespace sycl_intel_exp = sycl::ext::intel::experimental;
-
-static constexpr int LOCAL_MEM_SIZE = 1024;
-
-// CHECK: define {{.*}}spir_kernel{{.*}}write_mem_kernel{{.*}} {
-// CHECK-NOT: call {{.*}}write_mem_kernel
-// CHECK: }
-
-template <typename T> void write_mem_kernel(T *data, int num_elements) {
-  const int id =
-      sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
-  if (id < num_elements) {
-    data[id] = static_cast<T>(id);
-  }
-};
-
-// CHECK: define {{.*}}spir_kernel{{.*}}dynamic_local_mem_typed_kernel{{.*}} {
-// CHECK-NOT: call {{.*}}dynamic_local_mem_typed_kernel
-// CHECK: }
-template <typename T>
-void dynamic_local_mem_typed_kernel(T *data, char *local_mem) {
-  constexpr size_t num_elements = LOCAL_MEM_SIZE / sizeof(T);
-  T *typed_local_mem = reinterpret_cast<T *>(local_mem);
-
-  const int id =
-      sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_id(0);
-  if (id < num_elements) {
-    typed_local_mem[id] = static_cast<T>(id);
-  }
-  sycl::group_barrier(sycl::ext::oneapi::this_work_item::get_work_group<1>());
-  if (id < num_elements) {
-    data[id] = typed_local_mem[num_elements - id - 1];
-  }
-};
-
-int test_write_mem() {
-  compat_exp::launch_policy my_dim3_config(syclcompat::dim3{32},
-                                           syclcompat::dim3{32});
-
-  const int memsize = 1024;
-  int *d_a = (int *)syclcompat::malloc(memsize);
-  compat_exp::launch<write_mem_kernel<int>>(my_dim3_config, d_a,
-                                            memsize / sizeof(int))
-      .wait();
-
-  syclcompat::free(d_a);
-  return 0;
-}
-
-int test_lmem_launch() {
-  int local_mem_size = LOCAL_MEM_SIZE;
-
-  size_t num_elements = local_mem_size / sizeof(int);
-  int *d_a = (int *)syclcompat::malloc(local_mem_size);
-
-  compat_exp::launch_policy my_config(
-      sycl::nd_range<1>{{256}, {256}},
-      compat_exp::local_mem_size(local_mem_size));
-
-  compat_exp::launch<dynamic_local_mem_typed_kernel<int>>(my_config, d_a)
-      .wait();
-
-  syclcompat::free(d_a);
-
-  return 0;
-}
diff --git a/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp b/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp
deleted file mode 100644
index 5c2750e86b705..0000000000000
--- a/sycl/test/syclcompat/launch/launch_policy_lmem_neg.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  launch_policy_lmem_neg.cpp
- *
- *  Description:
- *     Negative testing for launch_policy - local memory specific
- *     These tests are in their own TU because they instantiate some of the same
- *     templates as tests in launch_policy_neg.cpp
- **************************************************************************/
-
-// RUN: not %clangxx -fsycl -fsyntax-only %s 2>&1 | FileCheck -vv %s
-
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/properties/properties.hpp>
-
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-
-// Dummy kernels for testing
-inline void int_kernel(int a){};
-inline void dynamic_local_mem_empty_kernel(char *a){};
-
-namespace compat_exp = syclcompat::experimental;
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-
-void test_lmem_launch() {
-  sycl::nd_range<3> launch_range{{1, 1, 32}, {1, 1, 32}};
-
-  // Missing local mem
-  {
-    compat_exp::launch_policy policy(
-        launch_range,
-        compat_exp::kernel_properties{sycl_exp::sub_group_size<32>});
-    compat_exp::launch<dynamic_local_mem_empty_kernel>(policy);
-    //CHECK-DAG: error: static assertion failed due to requirement 'syclcompat::args_compatible
-  }
-
-  // Unneeded local mem
-  {
-    compat_exp::launch_policy lmem_policy(launch_range,
-                                          compat_exp::local_mem_size{1024});
-    int int_arg{1};
-    compat_exp::launch<int_kernel>(lmem_policy, int_arg);
-    //CHECK-DAG: error: static assertion failed due to requirement 'syclcompat::args_compatible
-  }
-}
diff --git a/sycl/test/syclcompat/launch/launch_policy_neg.cpp b/sycl/test/syclcompat/launch/launch_policy_neg.cpp
deleted file mode 100644
index 15a2d675e46af..0000000000000
--- a/sycl/test/syclcompat/launch/launch_policy_neg.cpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  launch_policy_neg.cpp
- *
- *  Description:
- *     Negative tests for new launch_policy.
- **************************************************************************/
-
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK1 2>&1 | FileCheck -vv %s --check-prefixes=CHECK1
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK2 2>&1 | FileCheck -vv %s --check-prefixes=CHECK2
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK3 2>&1 | FileCheck -vv %s --check-prefixes=CHECK3
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK4 2>&1 | FileCheck -vv %s --check-prefixes=CHECK4
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK5 2>&1 | FileCheck -vv %s --check-prefixes=CHECK5
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK6 2>&1 | FileCheck -vv %s --check-prefixes=CHECK6
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK7 2>&1 | FileCheck -vv %s --check-prefixes=CHECK7
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK8 2>&1 | FileCheck -vv %s --check-prefixes=CHECK8
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK9 2>&1 | FileCheck -vv %s --check-prefixes=CHECK9
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK10 2>&1 | FileCheck -vv %s --check-prefixes=CHECK10
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK11 2>&1 | FileCheck -vv %s --check-prefixes=CHECK11
-// RUN: not %clangxx -fsycl -fsyntax-only %s -DCHECK12 2>&1 | FileCheck -vv %s --check-prefixes=CHECK12
-
-#include <sycl/ext/oneapi/kernel_properties/properties.hpp>
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/properties/properties.hpp>
-#include <sycl/group_barrier.hpp>
-
-#include <syclcompat/launch.hpp>
-#include <syclcompat/memory.hpp>
-#include <syclcompat/dims.hpp>
-
-namespace compat_exp = syclcompat::experimental;
-namespace sycl_exp = sycl::ext::oneapi::experimental;
-
-// Notes on use of FileCheck here:
-// Failures do not necessarily occur in order (hence use of CHECK-DAG)
-// Additionally a `static_assert` hit during a template instantiation will only
-// be hit once per unique concrete class. The only solution (aside from hacking
-// the examples to have different template types) would presumably be multiple
-// compilation units?
-
-// Dummy kernels for testing
-inline void empty_kernel(){};
-inline void int_kernel(int a){};
-inline void int_ptr_kernel(int *a){};
-
-inline void dynamic_local_mem_empty_kernel(char *a){};
-
-template <typename T>
-inline void dynamic_local_mem_basicdt_kernel(T value, char *local_mem){};
-
-
-// Dummy property container for negative testing
-template <typename Properties> struct dummy_properties {
-  static_assert(sycl_exp::is_property_list_v<Properties>);
-  using Props = Properties;
-
-  template <typename... Props>
-  dummy_properties(Props... properties) : props{properties...} {}
-
-  Properties props;
-};
-template <typename... Props>
-dummy_properties(Props... props)
-    -> dummy_properties<decltype(sycl_exp::properties(props...))>;
-
-void test_variadic_config_ctor() {
-  std::cout << __PRETTY_FUNCTION__ << std::endl;
-
-#ifdef CHECK1
-  // Missing range
-  {
-    compat_exp::launch_policy missing_range_config(
-        compat_exp::kernel_properties{sycl_exp::sub_group_size<32>});
-    //CHECK1: error: static assertion failed due to requirement 'syclcompat::detail::is_range_or_nd_range_v
-  }
-#endif
-#ifdef CHECK2
-  // Duplicate nd_range
-  {
-    sycl::nd_range<3> launch_range{{1,1,32},{1,1,32}};
-    compat_exp::launch_policy duplicate_nd_range_config(launch_range, launch_range);
-    //CHECK2: error: static assertion failed{{.*Did you forget to wrap}}
-  }
-#endif
-#ifdef CHECK3
-  // Duplicate range
-  {
-    sycl::range<3> launch_range{1,1,32};
-    compat_exp::launch_policy duplicate_nd_range_config(launch_range, launch_range);
-    //CHECK3: error: static assertion failed{{.*Did you forget to wrap}}
-  }
-#endif
-#ifdef CHECK4
-  // Unwrapped property
-  {
-    sycl::nd_range<3> launch_range{{1,1,32},{1,1,32}};
-    compat_exp::launch_policy unwrapped_property_config(launch_range, {sycl_exp::sub_group_size<32>});
-    //CHECK4: error: no viable constructor or deduction guide for deduction of template arguments of 'compat_exp::launch_policy'
-  }
-#endif
-#ifdef CHECK5
-  // Foreign object in ctor
-  {
-    dummy_properties foreign_object{sycl_exp::sub_group_size<32>};
-    sycl::nd_range<3> launch_range{{1,1,32},{1,1,32}};
-    compat_exp::launch_policy unwrapped_property_config(launch_range, foreign_object);
-    //CHECK5: error: static assertion failed{{.*Did you forget to wrap}}
-  }
-#endif
-#ifdef CHECK6
-  // Local mem with sycl::range launch 1
-  {
-    sycl::range<3> launch_range{1, 1, 32};
-    compat_exp::local_mem_size lmem_size(0);
-    compat_exp::launch_policy range_and_local_mem_config_1(launch_range,
-                                                         lmem_size);
-    //CHECK6: error: static assertion failed due to requirement 'syclcompat::detail::is_nd_range_v<sycl::range<3>> || !true': sycl::range kernel launches are incompatible with local
-  }
-#endif
-#ifdef CHECK7
-  // Local mem with sycl::range launch 2
-  {
-    syclcompat::dim3 launch_range{32, 1, 1};
-    compat_exp::local_mem_size lmem_size(0);
-    compat_exp::launch_policy range_and_local_mem_config_2(launch_range, compat_exp::kernel_properties{sycl_exp::sub_group_size<32>},
-                                                         lmem_size);
-    //CHECK7: error: static assertion failed due to requirement 'syclcompat::detail::is_nd_range_v<sycl::range<3>> || !true': sycl::range kernel launches are incompatible with local
-  }
-#endif
-#ifdef CHECK8
-  // Duplicate local_mem spec
-  {
-    sycl::nd_range<3> launch_range{{1, 1, 32}, {1, 1, 32}};
-    compat_exp::local_mem_size lmem_size(0);
-    compat_exp::launch_policy duplicate_local_mem_config(launch_range, lmem_size, lmem_size);
-    //CHECK8: error: static assertion failed{{.*(exactly once|duplicate type)}}
-  }
-#endif
-#ifdef CHECK9
-  // Duplicate kernel_properties spec
-  {
-    sycl::nd_range<3> launch_range{{1, 1, 32}, {1, 1, 32}};
-    compat_exp::kernel_properties kernel_props{sycl_exp::sub_group_size<32>};
-    compat_exp::launch_policy duplicate_kernel_properties_config(launch_range, kernel_props, kernel_props);
-    //CHECK9: error: static assertion failed due to requirement{{.*type appears more than once}}
-  }
-#endif
-#ifdef CHECK10
-  // Duplicate launch_properties spec
-  {
-    sycl::nd_range<3> launch_range{{1, 1, 32}, {1, 1, 32}};
-    compat_exp::launch_properties launch_props{};
-    compat_exp::local_mem_size lmem_size(0);
-    compat_exp::launch_policy duplicate_launch_properties_config(launch_range, launch_props, lmem_size, launch_props);
-    //CHECK10: error: static assertion failed due to requirement{{.*type appears more than once}}
-  }
-#endif
-#ifdef CHECK11
-  // Missing kernel args
-  {
-    sycl::range<3> launch_range{1, 1, 32};
-    compat_exp::launch_policy range_only(launch_range);
-    compat_exp::launch<int_kernel>(range_only);
-    //CHECK11: error: static assertion failed due to requirement 'syclcompat::args_compatible
-  }
-#endif
-#ifdef CHECK12
-  // Extra kernel args
-  {
-    sycl::nd_range<3> launch_range{{1, 1, 32}, {1, 1, 32}};
-    compat_exp::launch_policy range_only(launch_range);
-    int extra_arg = 1;
-    compat_exp::launch<empty_kernel>(range_only, extra_arg);
-    //CHECK12: error: static assertion failed due to requirement 'syclcompat::args_compatible
-  }
-#endif
-}
diff --git a/sycl/test/syclcompat/memory_adl.cpp b/sycl/test/syclcompat/memory_adl.cpp
deleted file mode 100644
index ac6d812d19f3e..0000000000000
--- a/sycl/test/syclcompat/memory_adl.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/***************************************************************************
- *
- *  Copyright (C) Codeplay Software Ltd.
- *
- *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
- *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
- *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *  SYCLcompat API
- *
- *  memory_adl.cpp
- *
- *  Description:
- *    Tests to ensure global namespace functions don't clash via ADL
- **************************************************************************/
-
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -fsyntax-only
-// Test that no syclcompat:: functions clash with global namespace fns due to ADL
-#include <sycl/sycl.hpp>
-#include <syclcompat/syclcompat.hpp>
-
-int main(){
-  syclcompat::device_info dummy_info;
-  syclcompat::device_info dummy_info_2;
-  memset(&dummy_info, 0, sizeof(syclcompat::device_info));
-  memcpy(&dummy_info, &dummy_info_2, sizeof(syclcompat::device_info));
-  free(&dummy_info);
-}
diff --git a/sycl/test/syclcompat/warnings_deprecated.cpp b/sycl/test/syclcompat/warnings_deprecated.cpp
deleted file mode 100644
index 65ff03173148e..0000000000000
--- a/sycl/test/syclcompat/warnings_deprecated.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// Test to verify that syclcompat namespace and APIs generate deprecation
-// warnings.
-
-// RUN: %clangxx -fsycl -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=note,warning %s -Wall -Wextra
-
-#include <syclcompat/syclcompat.hpp>
-
-int main() {
-  // Test deprecated namespace
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  syclcompat::dim3 grid(1, 1, 1);
-
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  auto queue = syclcompat::get_default_queue();
-
-  // Test deprecated memory APIs
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  void *ptr = syclcompat::malloc(1024);
-
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  syclcompat::free(ptr);
-
-  // Test deprecated utility APIs
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  auto device_count = syclcompat::device_count();
-
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  syclcompat::wait();
-
-  // Test deprecated atomic APIs
-  int value = 42;
-  int operand = 10;
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  syclcompat::atomic_fetch_add(&value, operand);
-
-  // Test deprecated math APIs
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  auto result = syclcompat::max(1, 2);
-
-  // Test deprecated device APIs
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  syclcompat::device_info info;
-
-  // Test deprecated experimental APIs
-  // expected-warning@+1{{'syclcompat' is deprecated}}
-  syclcompat::experimental::launch_policy my_config(
-      sycl::nd_range<1>{{32}, {32}});
-
-  return 0;
-}