From 5a000ac7b8f52045dd97da87ce1df871bba8a4fe Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 25 Sep 2024 15:17:47 +0100
Subject: [PATCH 01/26] No constexpr construction in math tests

bfloat16 can't be constexpr constructed
---
 .../test-e2e/syclcompat/math/math_compare.cpp | 36 ++++++-------
 sycl/test-e2e/syclcompat/math/math_ops.cpp    | 54 +++++++++----------
 2 files changed, 45 insertions(+), 45 deletions(-)
diff --git a/sycl/test-e2e/syclcompat/math/math_compare.cpp b/sycl/test-e2e/syclcompat/math/math_compare.cpp
index c42b22b199888..11afc7420dc6d 100644
--- a/sycl/test-e2e/syclcompat/math/math_compare.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_compare.cpp
@@ -56,7 +56,7 @@ template <typename ValueT> void test_compare() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(1.0);
+  const ValueT op1 = static_cast<ValueT>(1.0);
   ValueT op2 = sycl::nan(static_cast<unsigned int>(0));
 
   //  1.0 == 1.0 -> True
@@ -102,7 +102,7 @@ template <typename ValueT> void test_compare_vec() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
   Container op2 = {static_cast<ValueT>(1.0),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -110,12 +110,12 @@ template <typename ValueT> void test_compare_vec() {
   // bool2 does not exist, 1.0 and 0.0 floats are used for true
   // and false instead.
   //  1.0 == 1.0, 2.0 == NaN -> {true, false}
-  constexpr Container res1 = {1.0, 0.0};
+  const Container res1 = {1.0, 0.0};
   BinaryOpTestLauncher<Container, Container>(grid, threads)
       .template launch_test<compare_equal_vec_kernel<Container>>(op1, op2,
                                                                  res1);
   //  1.0 != 1.0, 2.0 != NaN -> {false, false}
-  constexpr Container res2 = {0.0, 0.0};
+  const Container res2 = {0.0, 0.0};
   BinaryOpTestLauncher<Container, Container>(grid, threads)
       .template launch_test<compare_not_equal_vec_kernel<Container>>(op1, op2,
                                                                      res2);
@@ -137,7 +137,7 @@ void test_unordered_compare() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(1.0);
+  const ValueT op1 = static_cast<ValueT>(1.0);
   ValueT op2 = sycl::nan(static_cast<unsigned int>(0));
 
   // Unordered comparison checks if either operand is NaN, or the binaryop holds
@@ -183,7 +183,7 @@ template <typename ValueT> void test_unordered_compare_vec() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
   Container op2 = {static_cast<ValueT>(1.0),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -191,12 +191,12 @@ template <typename ValueT> void test_unordered_compare_vec() {
   // bool2 does not exist, 1.0 and 0.0 floats are used for true
   // and false instead.
   //  1.0 == 1.0, 2.0 == NaN -> {true, true}
-  constexpr Container res1 = {1.0, 1.0};
+  const Container res1 = {1.0, 1.0};
   BinaryOpTestLauncher<Container, Container>(grid, threads)
       .template launch_test<unordered_compare_equal_vec_kernel<Container>>(
           op1, op2, res1);
   //  1.0 != 1.0, 2.0 != NaN -> {false, true}
-  constexpr Container res2 = {0.0, 1.0};
+  const Container res2 = {0.0, 1.0};
   BinaryOpTestLauncher<Container, Container>(grid, threads)
       .template launch_test<unordered_compare_not_equal_vec_kernel<Container>>(
           op1, op2, res2);
@@ -213,7 +213,7 @@ template <typename ValueT> void test_compare_both() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
   Container op2 = {static_cast<ValueT>(1.0),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -242,7 +242,7 @@ template <typename ValueT> void test_unordered_compare_both() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
   Container op2 = {static_cast<ValueT>(1.0),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -272,13 +272,13 @@ template <typename ValueT> void test_compare_mask() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
-  constexpr Container op2 = {static_cast<ValueT>(2.0),
+  const Container op2 = {static_cast<ValueT>(2.0),
                              static_cast<ValueT>(1.0)};
-  constexpr Container op3 = {static_cast<ValueT>(1.0),
+  const Container op3 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(3.0)};
-  constexpr Container op4 = {static_cast<ValueT>(3.0),
+  const Container op4 = {static_cast<ValueT>(3.0),
                              static_cast<ValueT>(2.0)};
   Container op5 = {sycl::nan(static_cast<unsigned int>(0)),
                    sycl::nan(static_cast<unsigned int>(0))};
@@ -320,13 +320,13 @@ template <typename ValueT> void test_unordered_compare_mask() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr Container op1 = {static_cast<ValueT>(1.0),
+  const Container op1 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(2.0)};
-  constexpr Container op2 = {static_cast<ValueT>(2.0),
+  const Container op2 = {static_cast<ValueT>(2.0),
                              static_cast<ValueT>(1.0)};
-  constexpr Container op3 = {static_cast<ValueT>(1.0),
+  const Container op3 = {static_cast<ValueT>(1.0),
                              static_cast<ValueT>(3.0)};
-  constexpr Container op4 = {static_cast<ValueT>(3.0),
+  const Container op4 = {static_cast<ValueT>(3.0),
                              static_cast<ValueT>(2.0)};
   Container op5 = {sycl::nan(static_cast<unsigned int>(0)),
                    sycl::nan(static_cast<unsigned int>(0))};
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
index 258c2a12ba1e5..d30bac3ed4b11 100644
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_ops.cpp
@@ -43,9 +43,9 @@ void test_syclcompat_max() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(5);
-  constexpr ValueU op2 = static_cast<ValueU>(10);
-  constexpr std::common_type_t<ValueT, ValueU> res = static_cast<ValueU>(10);
+  const ValueT op1 = static_cast<ValueT>(5);
+  const ValueU op2 = static_cast<ValueU>(10);
+  const std::common_type_t<ValueT, ValueU> res = static_cast<ValueU>(10);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
       .template launch_test<max_kernel<ValueT, ValueU>>(op1, op2, res);
@@ -63,9 +63,9 @@ void test_syclcompat_min() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(5);
-  constexpr ValueU op2 = static_cast<ValueU>(10);
-  constexpr std::common_type_t<ValueT, ValueU> res =
+  const ValueT op1 = static_cast<ValueT>(5);
+  const ValueU op2 = static_cast<ValueU>(10);
+  const std::common_type_t<ValueT, ValueU> res =
       static_cast<std::common_type_t<ValueT, ValueU>>(5);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
@@ -84,11 +84,11 @@ void test_syclcompat_fmin_nan() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(5);
-  constexpr ValueU op2 = static_cast<ValueU>(10);
+  const ValueT op1 = static_cast<ValueT>(5);
+  const ValueU op2 = static_cast<ValueU>(10);
   ValueU op3 = sycl::nan(static_cast<unsigned int>(0));
 
-  constexpr std::common_type_t<ValueT, ValueU> res =
+  const std::common_type_t<ValueT, ValueU> res =
       static_cast<std::common_type_t<ValueT, ValueU>>(5);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
@@ -110,11 +110,11 @@ void test_syclcompat_fmax_nan() {
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  constexpr ValueT op1 = static_cast<ValueT>(5);
-  constexpr ValueU op2 = static_cast<ValueU>(10);
+  const ValueT op1 = static_cast<ValueT>(5);
+  const ValueU op2 = static_cast<ValueU>(10);
   ValueU op3 = sycl::nan(static_cast<unsigned int>(0));
 
-  constexpr std::common_type_t<ValueT, ValueU> res =
+  const std::common_type_t<ValueT, ValueU> res =
       static_cast<std::common_type_t<ValueT, ValueU>>(10);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
@@ -146,9 +146,9 @@ void test_syclcompat_pow() {
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
   // 3 ** 3 = 27
-  constexpr ValueT op1 = static_cast<ValueT>(3);
-  constexpr ValueU op2 = static_cast<ValueU>(3);
-  constexpr ValueT res = static_cast<ValueT>(27);
+  const ValueT op1 = static_cast<ValueT>(3);
+  const ValueU op2 = static_cast<ValueU>(3);
+  const ValueT res = static_cast<ValueT>(27);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
       .template launch_test<pow_kernel<ValueT, ValueU>>(op1, op2, res);
@@ -165,25 +165,25 @@ template <typename ValueT> void test_syclcompat_relu() {
   constexpr syclcompat::dim3 threads{1};
 
   // relu(3) = 3, relu(-value) = 0
-  constexpr ValueT op1 = static_cast<ValueT>(3);
-  constexpr ValueT res1 = static_cast<ValueT>(3);
+  const ValueT op1 = static_cast<ValueT>(3);
+  const ValueT res1 = static_cast<ValueT>(3);
   UnaryOpTestLauncher<ValueT>(grid, threads)
       .template launch_test<relu_kernel<ValueT>>(op1, res1);
 
-  constexpr ValueT op2 = static_cast<ValueT>(-3);
-  constexpr ValueT res2 = static_cast<ValueT>(0);
+  const ValueT op2 = static_cast<ValueT>(-3);
+  const ValueT res2 = static_cast<ValueT>(0);
   UnaryOpTestLauncher<ValueT>(grid, threads)
       .template launch_test<relu_kernel<ValueT>>(op2, res2);
 
   using ValueU = sycl::vec<ValueT, 2>;
-  constexpr ValueU op3{op1, op2};
-  constexpr ValueU res3{res1, res2};
+  const ValueU op3{op1, op2};
+  const ValueU res3{res1, res2};
   UnaryOpTestLauncher<ValueU>(grid, threads)
       .template launch_test<relu_kernel<ValueU>>(op3, res3);
 
   using ValueV = sycl::marray<ValueT, 2>;
-  constexpr ValueV op4{op1, op2};
-  constexpr ValueV res4{res1, res2};
+  const ValueV op4{op1, op2};
+  const ValueV res4{res1, res2};
   UnaryOpTestLauncher<ValueV>(grid, threads)
       .template launch_test<relu_kernel<ValueV>>(op4, res4);
 }
@@ -198,13 +198,13 @@ template <typename ValueT> void test_syclcompat_cbrt() {
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
 
-  constexpr ValueT op1 = static_cast<ValueT>(1);
-  constexpr ValueT res1 = static_cast<ValueT>(1);
+  const ValueT op1 = static_cast<ValueT>(1);
+  const ValueT res1 = static_cast<ValueT>(1);
   UnaryOpTestLauncher<ValueT>(grid, threads)
       .template launch_test<cbrt_kernel<ValueT>>(op1, res1);
 
-  constexpr ValueT op2 = static_cast<ValueT>(64);
-  constexpr ValueT res2 = static_cast<ValueT>(4);
+  const ValueT op2 = static_cast<ValueT>(64);
+  const ValueT res2 = static_cast<ValueT>(4);
   UnaryOpTestLauncher<ValueT>(grid, threads)
       .template launch_test<cbrt_kernel<ValueT>>(op2, res2);
 }

From c32b2633eebb00b80591645fe03fcb9153dc2e37 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Thu, 26 Sep 2024 16:13:08 +0100
Subject: [PATCH 02/26] Add syclcompat::is_floating_point_v

This includes sycl::half and sycl::ext::oneapi::bfloat16
---
 sycl/doc/syclcompat/README.md      | 21 +++++++++++++++++++++
 sycl/include/syclcompat/traits.hpp | 17 +++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
index 6dd8708afeb62..059c0a8aecfe2 100644
--- a/sycl/doc/syclcompat/README.md
+++ b/sycl/doc/syclcompat/README.md
@@ -1690,7 +1690,27 @@ second operand, respectively. These three APIs return a single 32-bit value with
 the accumulated result, which is unsigned if both operands are `uint32_t` and
 signed otherwise.
 
+Various maths functions are defined to operate on any floating point type.
+`syclcompat::is_floating_point_v` extends the standard library's
+`std::is_floating_point_v` to include `sycl::half` and, where available,
+`sycl::ext::oneapi::bfloat16`.
+
 ```cpp
+namespace syclcompat{
+
+// Trait for extended floating point definition
+template <typename T>
+struct is_floating_point : std::is_floating_point<T>{};
+
+template <> struct is_floating_point<sycl::half> : std::true_type {};
+
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+template <> struct is_floating_point<sycl::ext::oneapi::bfloat16> : std::true_type {};
+#endif
+
+template <typename T>
+inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
+
 inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
                                   unsigned int shift); 
 
@@ -1889,6 +1909,7 @@ inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
 template <typename T1, typename T2>
 inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b,
                                       dot_product_acc_t<T1, T2> c);
+} // namespace syclcompat
 ```
 
 `vectorized_binary` computes the `BinaryOperation` for two operands,
diff --git a/sycl/include/syclcompat/traits.hpp b/sycl/include/syclcompat/traits.hpp
index 2f389ccf79484..435b0df98a32d 100644
--- a/sycl/include/syclcompat/traits.hpp
+++ b/sycl/include/syclcompat/traits.hpp
@@ -22,6 +22,10 @@
 
 #pragma once
 
+#include <sycl/feature_test.hpp>
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+#include "sycl/ext/oneapi/bfloat16.hpp"
+#endif
 #include <cstddef>
 #include <sycl/ext/oneapi/properties/properties.hpp>
 #include <sycl/ext/oneapi/properties/property_value.hpp>
@@ -250,4 +254,17 @@ using are_all_props = std::conjunction<
 
 } // namespace experimental::detail
 
+// Trait for extended floating point definition
+template <typename T>
+struct is_floating_point : std::is_floating_point<T>{};
+
+template <> struct is_floating_point<sycl::half> : std::true_type {};
+
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+template <> struct is_floating_point<sycl::ext::oneapi::bfloat16> : std::true_type {};
+#endif
+
+template <typename T>
+inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
+
 } // namespace syclcompat

From 9f28f62d1bb878a6b7d1305197554887a290d1e8 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Fri, 27 Sep 2024 11:54:34 +0100
Subject: [PATCH 03/26] Add bfloat16 to tests and generalize to container types

- Adding bfloat to type lists
- Making test fixtures work with sycl::vec<type,N> and
sycl::marray<type,N>
---
 sycl/test-e2e/syclcompat/common.hpp         | 38 +++++++++++++++++--
 sycl/test-e2e/syclcompat/math/math_fixt.hpp | 41 ++++++++++++++++-----
 2 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/sycl/test-e2e/syclcompat/common.hpp b/sycl/test-e2e/syclcompat/common.hpp
index 368089e89e85a..7e3e5e7b3d70c 100644
--- a/sycl/test-e2e/syclcompat/common.hpp
+++ b/sycl/test-e2e/syclcompat/common.hpp
@@ -22,6 +22,10 @@
 
 #pragma once
 
+#include <sycl/feature_test.hpp>
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+#include "sycl/ext/oneapi/bfloat16.hpp"
+#endif
 #include <sycl/half_type.hpp>
 #include <tuple>
 
@@ -44,8 +48,40 @@ template <typename Tuple, typename Func> void instantiate_all_types(Func &&f) {
     f<T>();                                                                    \
   });
 
+#define INSTANTIATE_ALL_CONTAINER_TYPES(tuple, container, f)                   \
+  instantiate_all_types<tuple>([](auto index) {                                \
+    using T = std::tuple_element_t<decltype(index)::value, tuple>;             \
+    f<container, T>();                                                         \
+  });
+
 using value_type_list =
-    std::tuple<int, unsigned int, short, unsigned short, long, unsigned long,
-               long long, unsigned long long, float, double, sycl::half>;
+    std::tuple<char, signed char, unsigned char, int, unsigned int, short,
+               unsigned short, long, unsigned long, long long,
+               unsigned long long, float, double, sycl::half
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+               ,sycl::ext::oneapi::bfloat16
+#endif
+>;
+
+// Floating point types under test; bfloat16 only when its header is included.
+using fp_type_list = std::tuple<float, double, sycl::half
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+               ,sycl::ext::oneapi::bfloat16
+#endif
+>;
 
-using fp_type_list = std::tuple<float, double, sycl::half>;
+using marray_type_list =
+    std::tuple<char, signed char, short, int, long, long long, unsigned char,
+               unsigned short, unsigned int, unsigned long, unsigned long long,
+               float, double, sycl::half
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+              , sycl::ext::oneapi::bfloat16
+#endif
+>;
+using vec_type_list = std::tuple<int8_t, int16_t, int32_t, int64_t, uint8_t,
+                                 uint16_t, uint32_t, uint64_t, float, double,
+                                 sycl::half
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+              , sycl::ext::oneapi::bfloat16
+#endif
+>;
diff --git a/sycl/test-e2e/syclcompat/math/math_fixt.hpp b/sycl/test-e2e/syclcompat/math/math_fixt.hpp
index 8b395f94faca1..c40cd669f795e 100644
--- a/sycl/test-e2e/syclcompat/math/math_fixt.hpp
+++ b/sycl/test-e2e/syclcompat/math/math_fixt.hpp
@@ -51,8 +51,22 @@ static constexpr bool contained_is_floating_point_v = false;
 template <typename Container>
 static constexpr bool contained_is_floating_point_v<
     Container, std::void_t<typename Container::value_type>> =
-    std::is_floating_point_v<typename Container::value_type> ||
-    std::is_same_v<typename Container::value_type, sycl::half>;
+    syclcompat::is_floating_point_v<typename Container::value_type>;
+
+template <typename... Ts> struct container_common_type;
+
+template <template <typename, int> typename Container, typename T, typename U,
+          int Size>
+struct container_common_type<Container<T, Size>, Container<U, Size>> {
+  using type = Container<std::common_type_t<T, U>, Size>;
+};
+
+template <typename T, typename U> struct container_common_type<T, U> {
+  using type = std::common_type_t<T, U>;
+};
+
+template <typename T, typename U>
+using container_common_type_t = typename container_common_type<T, U>::type;
 
 template <typename ValueT> struct should_skip {
   bool operator()(const sycl::device &dev) const {
@@ -79,17 +93,24 @@ template <typename ValueT> struct should_skip {
 #define CHECK(ResultT, RESULT, EXPECTED)                                       \
   if constexpr (std::is_integral_v<ResultT>) {                                 \
     assert(RESULT == EXPECTED);                                                \
-  } else if constexpr (std::is_floating_point_v<ResultT> ||                    \
-                       std::is_same_v<ResultT, sycl::half>) {                  \
-    if (sycl::isnan(RESULT))                                                   \
-      assert(sycl::isnan(EXPECTED));                                           \
+  } else if constexpr (contained_is_integral_v<ResultT>) {                     \
+    for (size_t i = 0; i < RESULT.size(); i++)                                 \
+      assert(RESULT[i] == EXPECTED[i]);                                        \
+  } else if constexpr (syclcompat::is_floating_point_v<ResultT>) {             \
+    if (syclcompat::detail::isnan(RESULT))                                     \
+      assert(syclcompat::detail::isnan(EXPECTED));                             \
     else                                                                       \
       assert(fabs(RESULT - EXPECTED) < ERROR_TOLERANCE);                       \
   } else if constexpr (contained_is_floating_point_v<ResultT>) {               \
-    for (size_t i = 0; i < RESULT.size(); i++)                                 \
-      assert(fabs(RESULT[i] - EXPECTED[i]) < ERROR_TOLERANCE);                 \
+    for (size_t i = 0; i < RESULT.size(); i++) {                               \
+      if (syclcompat::detail::isnan(RESULT[i])) {                              \
+        assert(syclcompat::detail::isnan(EXPECTED[i]));                        \
+      } else {                                                                 \
+        assert(fabs(RESULT[i] - EXPECTED[i]) < ERROR_TOLERANCE);               \
+      }                                                                        \
+    }                                                                          \
   } else {                                                                     \
-    static_assert(0, "Math_fixt.hpp should not have arrived here.");           \
+    static_assert(0, "math_fixt.hpp should not have arrived here.");           \
   }
 
 class OpTestLauncher {
@@ -107,7 +128,7 @@ class OpTestLauncher {
 
 // Templated ResultT to support both arithmetic and boolean operators
 template <typename ValueT, typename ValueU,
-          typename ResultT = std::common_type_t<ValueT, ValueU>>
+          typename ResultT = container_common_type_t<ValueT, ValueU>>
 class BinaryOpTestLauncher : OpTestLauncher {
 protected:
   ValueT *op1_;

From 83196543be86f971b754ef4f6b919d0fb83974a0 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Fri, 27 Sep 2024 12:39:16 +0100
Subject: [PATCH 04/26] Add some guards & TODOs to math.hpp

No new functionality here
---
 sycl/include/syclcompat/math.hpp | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 785d95f6f2404..7fca551986e37 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -31,12 +31,18 @@
 
 #pragma once
 
+#include <sycl/feature_test.hpp>
+
+// TODO(syclcompat-lib-reviewers): this should not be required
 #ifndef SYCL_EXT_ONEAPI_COMPLEX
 #define SYCL_EXT_ONEAPI_COMPLEX
 #endif
 
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
 #include <sycl/ext/oneapi/experimental/bfloat16_math.hpp>
+#endif
 #include <sycl/ext/oneapi/experimental/complex/complex.hpp>
+#include <syclcompat/traits.hpp>
 
 namespace syclcompat {
 namespace detail {
@@ -46,18 +52,18 @@ namespace complex_namespace = sycl::ext::oneapi::experimental;
 template <typename ValueT>
 using complex_type = detail::complex_namespace::complex<ValueT>;
 
+template <typename T>
+constexpr bool is_int32_type = std::is_same_v<std::decay_t<T>, int32_t> ||
+  std::is_same_v<std::decay_t<T>, uint32_t>;
+
 template <typename ValueT>
 inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) {
   return sycl::clamp(val, min_val, max_val);
 }
-
-template <typename T>
-constexpr bool is_int32_type = std::is_same_v<std::decay_t<T>, int32_t> ||
-                               std::is_same_v<std::decay_t<T>, uint32_t>;
-
 #ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-// TODO: Follow the process to add this to the extension. If added,
-// remove this functionality from the header.
+// TODO(syclcompat-lib-reviewers): Follow the process to add this (& other math
+// fns) to the bfloat16 math function extension. If added, remove this
+// functionality from the header.
 template <>
 inline sycl::ext::oneapi::bfloat16 clamp(sycl::ext::oneapi::bfloat16 val,
                                          sycl::ext::oneapi::bfloat16 min_val,

From ec00197b849e83dae6296e74142a0d11211d150f Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Fri, 27 Sep 2024 13:04:58 +0100
Subject: [PATCH 05/26] Generalize should_skip to variadic types

Mixed type cases aren't caught otherwise
---
 sycl/test-e2e/syclcompat/math/math_fixt.hpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sycl/test-e2e/syclcompat/math/math_fixt.hpp b/sycl/test-e2e/syclcompat/math/math_fixt.hpp
index c40cd669f795e..cacd6ea1fb32c 100644
--- a/sycl/test-e2e/syclcompat/math/math_fixt.hpp
+++ b/sycl/test-e2e/syclcompat/math/math_fixt.hpp
@@ -68,18 +68,18 @@ template <typename T, typename U> struct container_common_type<T, U> {
 template <typename T, typename U>
 using container_common_type_t = typename container_common_type<T, U>::type;
 
-template <typename ValueT> struct should_skip {
+template <typename ...ValueT> struct should_skip {
   bool operator()(const sycl::device &dev) const {
-    if constexpr (std::is_same_v<ValueT, double> ||
-                  contained_is_same_v<ValueT, double>) {
+    if constexpr ((std::is_same_v<ValueT, double> || ...) ||
+                  (contained_is_same_v<ValueT, double> || ...)) {
       if (!dev.has(sycl::aspect::fp64)) {
         std::cout << "  sycl::aspect::fp64 not supported by the SYCL device."
                   << std::endl;
         return true;
       }
     }
-    if constexpr (std::is_same_v<ValueT, sycl::half> ||
-                  contained_is_same_v<ValueT, sycl::half>) {
+    if constexpr ((std::is_same_v<ValueT, sycl::half> || ...) ||
+                  (contained_is_same_v<ValueT, sycl::half> || ...)) {
       if (!dev.has(sycl::aspect::fp16)) {
         std::cout << "  sycl::aspect::fp16 not supported by the SYCL device."
                   << std::endl;
@@ -139,9 +139,9 @@ class BinaryOpTestLauncher : OpTestLauncher {
   BinaryOpTestLauncher(const syclcompat::dim3 &grid,
                        const syclcompat::dim3 &threads,
                        const size_t data_size = 1)
-      : OpTestLauncher{
-            grid, threads, data_size,
-            should_skip<ValueT>()(syclcompat::get_current_device())} {
+      : OpTestLauncher{grid, threads, data_size,
+                       should_skip<ValueT, ValueU, ResultT>()(
+                           syclcompat::get_current_device())} {
     if (skip_)
       return;
     op1_ = syclcompat::malloc<ValueT>(data_size);
@@ -183,7 +183,7 @@ class UnaryOpTestLauncher : OpTestLauncher {
                       const size_t data_size = 1)
       : OpTestLauncher{
             grid, threads, data_size,
-            should_skip<ValueT>()(syclcompat::get_current_device())} {
+            should_skip<ValueT, ResultT>()(syclcompat::get_current_device())} {
     if (skip_)
       return;
     op_ = syclcompat::malloc<ValueT>(data_size);

From f8f62aa8133f8426fb720fcdaf5906fe5d58175d Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Mon, 30 Sep 2024 12:42:26 +0100
Subject: [PATCH 06/26] Specialize std::common_type for bfloat16

---
 sycl/include/syclcompat/math.hpp | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 7fca551986e37..f4b6fb45c6559 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -2248,3 +2248,29 @@ inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c,
 }
 
 } // namespace syclcompat
+
+// Specialize std::common_type for bfloat16.
+// Semantics here match bfloat16.hpp operator overloads (all mixed type math
+// ops return bfloat16).
+// Only the two-argument form is specialized: [meta.trans.other] permits no
+// other program-defined specialization, and common_type<bfloat16> already
+// resolves through common_type<bfloat16, bfloat16>.
+// TODO(syclcompat-lib-reviewers) Move this to bfloat extension
+namespace std {
+
+template <>
+struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+// bfloat16 as the first type of a mixed pair.
+template <typename T> struct common_type<sycl::ext::oneapi::bfloat16, T> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+// bfloat16 as the second type of a mixed pair.
+template <typename T> struct common_type<T, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+} // namespace std

From dad728cb52e4b8a14398a51e42c93020acc4539b Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Mon, 30 Sep 2024 14:15:59 +0100
Subject: [PATCH 07/26] Add bfloat16 support (& more extensive tests) to
 fmin/max_nan

- Added support for bfloat16
- Added tests for `sycl::vec<T,2>` & `sycl::marray<T,2>`
---
 sycl/include/syclcompat/math.hpp           | 36 +++++++++--
 sycl/test-e2e/syclcompat/math/math_ops.cpp | 69 +++++++++++++++++++++-
 2 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index f4b6fb45c6559..d7f5b17256ab4 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -750,15 +750,29 @@ inline std::common_type_t<ValueT, ValueU> fmax_nan(const ValueT a,
                                                    const ValueU b) {
   if (detail::isnan(a) || detail::isnan(b))
     return NAN;
-  return sycl::fmax(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
+                               sycl::ext::oneapi::bfloat16>) {
+    return sycl::ext::oneapi::experimental::fmax(
+        static_cast<std::common_type_t<ValueT, ValueU>>(a),
+        static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  } else {
+    return sycl::fmax(static_cast<std::common_type_t<ValueT, ValueU>>(a),
+                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  }
 }
+
 template <typename ValueT, typename ValueU>
 inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
 fmax_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b) {
   return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
 }
 
+template <typename ValueT, typename ValueU>
+inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
+fmax_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b) {
+  return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
+}
+
 /// Performs 2 elements comparison and returns the smaller one. If either of
 /// inputs is NaN, then return NaN.
 /// \param [in] a The first value
@@ -769,15 +783,29 @@ inline std::common_type_t<ValueT, ValueU> fmin_nan(const ValueT a,
                                                    const ValueU b) {
   if (detail::isnan(a) || detail::isnan(b))
     return NAN;
-  return sycl::fmin(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
+                               sycl::ext::oneapi::bfloat16>) {
+    return sycl::ext::oneapi::experimental::fmin(
+        static_cast<std::common_type_t<ValueT, ValueU>>(a),
+        static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  } else {
+    return sycl::fmin(static_cast<std::common_type_t<ValueT, ValueU>>(a),
+                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  }
 }
+
 template <typename ValueT, typename ValueU>
 inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
 fmin_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b) {
   return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
 }
 
+template <typename ValueT, typename ValueU>
+inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
+fmin_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b) {
+  return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
+}
+
 // pow functions overload.
 inline float pow(const float a, const int b) { return sycl::pown(a, b); }
 inline double pow(const double a, const int b) { return sycl::pown(a, b); }
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
index d30bac3ed4b11..26d7bd1b7e707 100644
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_ops.cpp
@@ -74,7 +74,7 @@ void test_syclcompat_min() {
 
 template <typename ValueT, typename ValueU>
 inline void fmin_nan_kernel(ValueT *a, ValueU *b,
-                            std::common_type_t<ValueT, ValueU> *r) {
+                            container_common_type_t<ValueT, ValueU> *r) {
   *r = syclcompat::fmin_nan(*a, *b);
 }
 
@@ -98,9 +98,35 @@ void test_syclcompat_fmin_nan() {
       .template launch_test<fmin_nan_kernel<ValueT, ValueU>>(op1, op3, op3);
 }
 
+template <template <typename T, int Dim> typename ContainerT, typename ValueT, typename ValueU = ValueT>
+void test_container_syclcompat_fmin_nan(){
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  constexpr syclcompat::dim3 grid{1};
+  constexpr syclcompat::dim3 threads{1};
+
+  using ValueTU = std::common_type_t<ValueT, ValueU>;
+  using ContT = ContainerT<ValueT, 2>;
+  using ContU = ContainerT<ValueU, 2>;
+  using ContTU = ContainerT<ValueTU, 2>;
+
+  const ContT op4 = {static_cast<ValueT>(5), static_cast<ValueT>(10)};
+  const ContU op5 = {static_cast<ValueU>(10), static_cast<ValueU>(5)};
+  const ContU op6 = {sycl::nan(static_cast<unsigned int>(0)), sycl::nan(static_cast<unsigned int>(0))};
+  const ContTU op6_res = {sycl::nan(static_cast<unsigned int>(0)), sycl::nan(static_cast<unsigned int>(0))};
+
+  const ContTU res2{static_cast<ValueTU>(5), static_cast<ValueTU>(5)};
+
+  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
+      .template launch_test<fmin_nan_kernel<ContT, ContU>>(op4, op5, res2);
+
+  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
+      .template launch_test<fmin_nan_kernel<ContT, ContU>>(op4, op6, op6_res);
+}
+
 template <typename ValueT, typename ValueU>
 inline void fmax_nan_kernel(ValueT *a, ValueU *b,
-                            std::common_type_t<ValueT, ValueU> *r) {
+                            container_common_type_t<ValueT, ValueU> *r) {
   *r = syclcompat::fmax_nan(*a, *b);
 }
 
@@ -124,6 +150,32 @@ void test_syclcompat_fmax_nan() {
       .template launch_test<fmax_nan_kernel<ValueT, ValueU>>(op1, op3, op3);
 }
 
+template <template <typename T, int Dim> typename ContainerT, typename ValueT, typename ValueU = ValueT>
+void test_container_syclcompat_fmax_nan(){
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  constexpr syclcompat::dim3 grid{1};
+  constexpr syclcompat::dim3 threads{1};
+
+  using ValueTU = std::common_type_t<ValueT, ValueU>;
+  using ContT = ContainerT<ValueT, 2>;
+  using ContU = ContainerT<ValueU, 2>;
+  using ContTU = ContainerT<ValueTU, 2>;
+
+  const ContT op4 = {static_cast<ValueT>(5), static_cast<ValueT>(10)};
+  const ContU op5 = {static_cast<ValueU>(10), static_cast<ValueU>(5)};
+  const ContU op6 = {sycl::nan(static_cast<unsigned int>(0)), sycl::nan(static_cast<unsigned int>(0))};
+  const ContTU op6_res = {sycl::nan(static_cast<unsigned int>(0)), sycl::nan(static_cast<unsigned int>(0))};
+
+  const ContTU res2{static_cast<ValueTU>(10), static_cast<ValueTU>(10)};
+
+  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
+      .template launch_test<fmax_nan_kernel<ContT, ContU>>(op4, op5, res2);
+
+  BinaryOpTestLauncher<ContT, ContU>(grid, threads)
+      .template launch_test<fmax_nan_kernel<ContT, ContU>>(op4, op6, op6_res);
+}
+
 template <typename ValueT, typename ValueU>
 inline void pow_kernel(ValueT *a, ValueU *b, ValueT *r) {
   *r = syclcompat::pow(*a, *b);
@@ -269,9 +321,22 @@ int main() {
   test_syclcompat_min<long, int>();
 
   INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_fmin_nan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_container_syclcompat_fmin_nan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_container_syclcompat_fmin_nan);
   test_syclcompat_fmin_nan<double, float>();
+  test_container_syclcompat_fmin_nan<sycl::vec, float, double>();
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+  test_container_syclcompat_fmin_nan<sycl::vec, sycl::ext::oneapi::bfloat16, double>();
+#endif
+
   INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_fmax_nan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_container_syclcompat_fmax_nan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_container_syclcompat_fmax_nan);
   test_syclcompat_fmax_nan<double, float>();
+  test_container_syclcompat_fmax_nan<sycl::vec, float, double>();
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+  test_container_syclcompat_fmax_nan<sycl::vec, sycl::ext::oneapi::bfloat16, double>();
+#endif
 
   INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_pow);
   test_syclcompat_pow<float, int>();

From 9681cc7bc4ce475285aec9b294a050dded72f0dc Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Mon, 30 Sep 2024 14:34:06 +0100
Subject: [PATCH 08/26] Define constexpr bool for bfloat16 support

Also tidy the fmin_nan/fmax_nan tests with a ValueTU common-type alias.
---
 sycl/include/syclcompat/math.hpp           | 13 +++++++++++--
 sycl/test-e2e/syclcompat/math/math_ops.cpp | 12 ++++++++----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index d7f5b17256ab4..6a9bfc3291a15 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -56,6 +56,13 @@ template <typename T>
 constexpr bool is_int32_type = std::is_same_v<std::decay_t<T>, int32_t> ||
   std::is_same_v<std::decay_t<T>, uint32_t>;
 
+// Helper constexpr bool to avoid ugly macros where possible
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+  constexpr bool support_bfloat16_math = true;
+#else
+  constexpr bool support_bfloat16_math = false;
+#endif
+
 template <typename ValueT>
 inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) {
   return sycl::clamp(val, min_val, max_val);
@@ -750,7 +757,8 @@ inline std::common_type_t<ValueT, ValueU> fmax_nan(const ValueT a,
                                                    const ValueU b) {
   if (detail::isnan(a) || detail::isnan(b))
     return NAN;
-  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
+  if constexpr (detail::support_bfloat16_math &&
+                std::is_same_v<std::common_type_t<ValueT, ValueU>,
                                sycl::ext::oneapi::bfloat16>) {
     return sycl::ext::oneapi::experimental::fmax(
         static_cast<std::common_type_t<ValueT, ValueU>>(a),
@@ -783,7 +791,8 @@ inline std::common_type_t<ValueT, ValueU> fmin_nan(const ValueT a,
                                                    const ValueU b) {
   if (detail::isnan(a) || detail::isnan(b))
     return NAN;
-  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
+  if constexpr (detail::support_bfloat16_math &&
+                std::is_same_v<std::common_type_t<ValueT, ValueU>,
                                sycl::ext::oneapi::bfloat16>) {
     return sycl::ext::oneapi::experimental::fmin(
         static_cast<std::common_type_t<ValueT, ValueU>>(a),
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
index 26d7bd1b7e707..58df133f5d3bd 100644
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_ops.cpp
@@ -82,14 +82,16 @@ template <typename ValueT, typename ValueU = ValueT>
 void test_syclcompat_fmin_nan() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
+  using ValueTU = std::common_type_t<ValueT, ValueU>;
+
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
   const ValueT op1 = static_cast<ValueT>(5);
   const ValueU op2 = static_cast<ValueU>(10);
   ValueU op3 = sycl::nan(static_cast<unsigned int>(0));
 
-  const std::common_type_t<ValueT, ValueU> res =
-      static_cast<std::common_type_t<ValueT, ValueU>>(5);
+  const ValueTU res =
+      static_cast<ValueTU>(5);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
       .template launch_test<fmin_nan_kernel<ValueT, ValueU>>(op1, op2, res);
@@ -134,14 +136,16 @@ template <typename ValueT, typename ValueU = ValueT>
 void test_syclcompat_fmax_nan() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
+  using ValueTU = std::common_type_t<ValueT, ValueU>;
+
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
   const ValueT op1 = static_cast<ValueT>(5);
   const ValueU op2 = static_cast<ValueU>(10);
   ValueU op3 = sycl::nan(static_cast<unsigned int>(0));
 
-  const std::common_type_t<ValueT, ValueU> res =
-      static_cast<std::common_type_t<ValueT, ValueU>>(10);
+  const ValueTU res =
+      static_cast<ValueTU>(10);
 
   BinaryOpTestLauncher<ValueT, ValueU>(grid, threads)
       .template launch_test<fmax_nan_kernel<ValueT, ValueU>>(op1, op2, res);

From a5e8e10525bef873314e8e08d14fa3d6f201d835 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Mon, 30 Sep 2024 14:37:33 +0100
Subject: [PATCH 09/26] Add marray fmin/fmax_nan to docs

---
 sycl/doc/syclcompat/README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
index 059c0a8aecfe2..25f461e32d18f 100644
--- a/sycl/doc/syclcompat/README.md
+++ b/sycl/doc/syclcompat/README.md
@@ -1779,6 +1779,10 @@ template <typename ValueT, typename ValueU>
 inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
 fmax_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b);
 
+template <typename ValueT, typename ValueU>
+inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
+fmax_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b);
+
 // Performs 2 elements comparison and returns the smaller one. If either of
 // inputs is NaN, then return NaN.
 template <typename ValueT, typename ValueU>
@@ -1788,6 +1792,10 @@ template <typename ValueT, typename ValueU>
 inline sycl::vec<std::common_type_t<ValueT, ValueU>, 2>
 fmin_nan(const sycl::vec<ValueT, 2> a, const sycl::vec<ValueU, 2> b);
 
+template <typename ValueT, typename ValueU>
+inline sycl::marray<std::common_type_t<ValueT, ValueU>, 2>
+fmin_nan(const sycl::marray<ValueT, 2> a, const sycl::marray<ValueU, 2> b);
+
 inline float pow(const float a, const int b) { return sycl::pown(a, b); }
 inline double pow(const double a, const int b) { return sycl::pown(a, b); }
 

From 4497d82309b6bb3c912aed5e4c662c3dbf2d4b1b Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Mon, 30 Sep 2024 14:49:01 +0100
Subject: [PATCH 10/26] Extend isnan support to bfloat16

Also template test_isnan so that it runs for sycl::vec and sycl::marray
over every type in fp_type_list.
---
 sycl/include/syclcompat/math.hpp           | 12 ++++++------
 sycl/test-e2e/syclcompat/math/math_ops.cpp | 17 +++++++++++------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 6a9bfc3291a15..0afb170dbd19a 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -231,13 +231,13 @@ inline constexpr RetT extend_vbinary4(AT a, BT b, RetT c,
 }
 
 template <typename ValueT> inline bool isnan(const ValueT a) {
-  return sycl::isnan(a);
-}
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-inline bool isnan(const sycl::ext::oneapi::bfloat16 a) {
-  return sycl::ext::oneapi::experimental::isnan(a);
+  if constexpr (detail::support_bfloat16_math &&
+                std::is_same_v<ValueT, sycl::ext::oneapi::bfloat16>) {
+    return sycl::ext::oneapi::experimental::isnan(a);
+  } else {
+    return sycl::isnan(a);
+  }
 }
-#endif
 
 // FIXME(syclcompat-lib-reviewers): move bfe outside detail once perf is
 // improved & semantics understood
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
index 58df133f5d3bd..eac8bce92a1b6 100644
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_ops.cpp
@@ -265,22 +265,25 @@ template <typename ValueT> void test_syclcompat_cbrt() {
       .template launch_test<cbrt_kernel<ValueT>>(op2, res2);
 }
 
-void isnan_kernel(sycl::float2 *a, sycl::float2 *r) {
+template <typename T>
+void isnan_kernel(T *a, T *r) {
   *r = syclcompat::isnan(*a);
 }
 
+template <template <typename, int> typename ContainerT, typename ValueT>
 void test_isnan() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
+  using ContT = ContainerT<ValueT, 2>;
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
-  sycl::float2 op1 = {sycl::nan(static_cast<unsigned int>(0)), 1.0f};
+  ContT op1 = {sycl::nan(static_cast<unsigned int>(0)), 1.0f};
   // bool2 does not exist,1.0 and 0.0 floats are used for true
   // and false instead.
-  sycl::float2 expect = {1.0, 0.0};
+  ContT expect = {1.0, 0.0};
 
-  UnaryOpTestLauncher<sycl::float2>(grid, threads)
-      .template launch_test<isnan_kernel>(op1, expect);
+  UnaryOpTestLauncher<ContT>(grid, threads)
+      .template launch_test<isnan_kernel<ContT>>(op1, expect);
 }
 
 // Hardcoded limits to avoid a "TernaryOpTestLauncher"
@@ -349,7 +352,9 @@ int main() {
   INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_relu);
   INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_cbrt);
 
-  test_isnan();
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_isnan);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_isnan);
+
   INSTANTIATE_ALL_TYPES(value_type_list, test_clamp);
 
   return 0;

From 1fe59806229b4d020f11cba30fab473b16345a0a Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Mon, 30 Sep 2024 15:01:49 +0100
Subject: [PATCH 11/26] Formatting

---
 sycl/include/syclcompat/math.hpp | 45 ++++++++++++++++----------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 0afb170dbd19a..45ee4a36f59da 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -58,9 +58,9 @@ constexpr bool is_int32_type = std::is_same_v<std::decay_t<T>, int32_t> ||
 
 // Helper constexpr bool to avoid ugly macros where possible
 #ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-  constexpr bool support_bfloat16_math = true;
+constexpr bool support_bfloat16_math = true;
 #else
-  constexpr bool support_bfloat16_math = false;
+constexpr bool support_bfloat16_math = false;
 #endif
 
 template <typename ValueT>
@@ -2291,23 +2291,24 @@ inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c,
 // ops return bfloat16)
 // TODO(syclcompat-lib-reviewers) Move this to bfloat extension
 namespace std {
-    template <>
-    struct common_type<sycl::ext::oneapi::bfloat16> {
-        using type = sycl::ext::oneapi::bfloat16;
-    };
-
-    template <>
-    struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
-        using type = sycl::ext::oneapi::bfloat16;
-    };
-
-    template <typename T>
-    struct common_type<sycl::ext::oneapi::bfloat16, T> {
-        using type = sycl::ext::oneapi::bfloat16; //std::common_type_t<float, T>;  // sycl::ext::oneapi::bfloat16 promotes to float
-    };
-
-    template <typename T>
-    struct common_type<T, sycl::ext::oneapi::bfloat16> {
-        using type = sycl::ext::oneapi::bfloat16; //std::common_type_t<T, float>;  // sycl::ext::oneapi::bfloat16 promotes to float
-    };
-}
+template <> struct common_type<sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <>
+struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <typename T> struct common_type<sycl::ext::oneapi::bfloat16, T> {
+  using type = sycl::ext::oneapi::bfloat16; // std::common_type_t<float, T>;  //
+                                            // sycl::ext::oneapi::bfloat16
+                                            // promotes to float
+};
+
+template <typename T> struct common_type<T, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16; // std::common_type_t<T, float>;  //
+                                            // sycl::ext::oneapi::bfloat16
+                                            // promotes to float
+};
+} // namespace std

From 847d45dd190089c8d680093f92aefcf90d8c4cb7 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Mon, 30 Sep 2024 20:29:39 +0100
Subject: [PATCH 12/26] Fix bug in cmul_add tests

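The marray overloads were never tested: the marray section of the test
called cmul_add with the sycl::vec operands (d1..d3, f1..f3) instead of
m_d1..m_d3 and m_f1..m_f3. Also add a check() overload for sycl::marray.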
---
 sycl/test-e2e/syclcompat/math/math_complex.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/syclcompat/math/math_complex.cpp b/sycl/test-e2e/syclcompat/math/math_complex.cpp
index d8bedb7082f97..27e2bf8af8f71 100644
--- a/sycl/test-e2e/syclcompat/math/math_complex.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_complex.cpp
@@ -50,6 +50,15 @@ template <typename T> bool check(T x, float *e) {
   return false;
 }
 
+template <typename T> bool check(sycl::marray<T, 2> x, float *e) {
+  float precision = ERROR_TOLERANCE;
+  if ((x[0] - e[0] < precision) && (x[0] - e[0] > -precision) &&
+      (x[1] - e[1] < precision) && (x[1] - e[1] > -precision)) {
+    return true;
+  }
+  return false;
+}
+
 template <> bool check<float>(float x, float *e) {
   float precision = ERROR_TOLERANCE;
   if ((x - e[0] < precision) && (x - e[0] > -precision)) {
@@ -206,10 +215,10 @@ void kernel_mul_add(int *result) {
   m_f2 = sycl::marray<float, 2>(-3.6, 4.5);
   m_f3 = sycl::marray<float, 2>(1.0, -1.0);
 
-  auto a3 = syclcompat::cmul_add(d1, d2, d3);
+  auto a3 = syclcompat::cmul_add(m_d1, m_d2, m_d3);
   r = r && check(a3, expect);
 
-  auto a4 = syclcompat::cmul_add(f1, f2, f3);
+  auto a4 = syclcompat::cmul_add(m_f1, m_f2, m_f3);
   r = r && check(a4, expect + 2);
 
   *result = r;

From 55e95ad93b8fd7398f7d53d98cbc1fada3e407c0 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Tue, 1 Oct 2024 10:11:24 +0100
Subject: [PATCH 13/26] Add cmul_add<bfloat> & draft test

The bfloat16 specializations compute in
sycl::ext::oneapi::experimental::complex<float> and convert the result
back to bfloat16 (no native bfloat16 support).
---
 sycl/include/syclcompat/math.hpp              | 22 +++++
 .../test-e2e/syclcompat/math/math_complex.cpp | 81 ++++++++++++++-----
 2 files changed, 85 insertions(+), 18 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 45ee4a36f59da..dbc58615ff8c8 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -929,6 +929,28 @@ inline sycl::marray<ValueT, 2> cmul_add(const sycl::marray<ValueT, 2> a,
   t = t * u + v;
   return sycl::marray<ValueT, 2>{t.real(), t.imag()};
 }
+template <>
+inline sycl::vec<sycl::ext::oneapi::bfloat16, 2>
+cmul_add(const sycl::vec<sycl::ext::oneapi::bfloat16, 2> a,
+         const sycl::vec<sycl::ext::oneapi::bfloat16, 2> b,
+         const sycl::vec<sycl::ext::oneapi::bfloat16, 2> c) {
+  sycl::ext::oneapi::experimental::complex<float> t(a[0], a[1]);
+  sycl::ext::oneapi::experimental::complex<float> u(b[0], b[1]);
+  sycl::ext::oneapi::experimental::complex<float> v(c[0], c[1]);
+  t = t * u + v;
+  return sycl::vec<sycl::ext::oneapi::bfloat16, 2>{t.real(), t.imag()};
+}
+template <>
+inline sycl::marray<sycl::ext::oneapi::bfloat16, 2>
+cmul_add(const sycl::marray<sycl::ext::oneapi::bfloat16, 2> a,
+         const sycl::marray<sycl::ext::oneapi::bfloat16, 2> b,
+         const sycl::marray<sycl::ext::oneapi::bfloat16, 2> c) {
+  sycl::ext::oneapi::experimental::complex<float> t(a[0], a[1]);
+  sycl::ext::oneapi::experimental::complex<float> u(b[0], b[1]);
+  sycl::ext::oneapi::experimental::complex<float> v(c[0], c[1]);
+  t = t * u + v;
+  return sycl::marray<sycl::ext::oneapi::bfloat16, 2>{t.real(), t.imag()};
+}
 
 /// A sycl::abs wrapper functors.
 struct abs {
diff --git a/sycl/test-e2e/syclcompat/math/math_complex.cpp b/sycl/test-e2e/syclcompat/math/math_complex.cpp
index 27e2bf8af8f71..8207d1222bda5 100644
--- a/sycl/test-e2e/syclcompat/math/math_complex.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_complex.cpp
@@ -184,19 +184,62 @@ void kernel_mul(int *result) {
   *result = r;
 }
 
+template <typename T>
+std::array<T,2> complex_mul(std::array<T, 2> a, std::array<T, 2> b){
+  std::array<T, 2> result;
+  result[0] = (a[0] * b[0]) - (a[1] * b[1]);
+  result[1] = (a[0] * b[1]) + (a[1] * b[0]);
+  return result;
+}
+
+template <typename T>
+std::array<T,2> complex_add(std::array<T, 2> a, std::array<T, 2> b){
+  return {a[0] + b[0], a[1] + b[1]};
+}
+
+template <typename T> void mul_add_groundtruth() {
+
+  using complex_t = std::complex<double>;
+  using arr_t = std::array<T, 2>;
+
+  arr_t d1 = arr_t({static_cast<T>(5.4), static_cast<T>(-6.3)});
+  arr_t d2 = arr_t({static_cast<T>(-7.2), static_cast<T>(8.1)});
+  arr_t d3 = arr_t({static_cast<T>(1.0), static_cast<T>(-1.0)});
+
+  arr_t f1 = arr_t({static_cast<T>(1.8), static_cast<T>(-2.7)});
+  arr_t f2 = arr_t({static_cast<T>(-3.6), static_cast<T>(4.5)});
+  arr_t f3 = arr_t({static_cast<T>(1.0), static_cast<T>(-1.0)});
+
+  arr_t ra1 = complex_add(complex_mul(d1, d2), d3);
+  arr_t ra2 = complex_add(complex_mul(f1, f2), f3);
+
+  T expect[4] = {13.150000, 88.100000, 6.670001, 16.820000};
+
+  // complex_t r1 = d1 * d2 + d3;
+  // complex_t r2 = f1 * f2 + f3;
+
+  std::cout << "r1: " << static_cast<T>(ra1[0]) << ", "
+            << static_cast<T>(ra1[1]) << std::endl;
+  std::cout << "Expect 1: " << expect[0] << ", " << expect[1] << std::endl;
+  std::cout << "r2: " << static_cast<T>(ra2[0]) << ", "
+            << static_cast<T>(ra2[1]) << std::endl;
+  std::cout << "Expect 2: " << expect[2] << ", " << expect[3] << std::endl;
+}
+
+template <typename T>
 void kernel_mul_add(int *result) {
-  sycl::double2 d1, d2, d3;
-  sycl::float2 f1, f2, f3;
-  sycl::marray<double, 2> m_d1, m_d2, m_d3;
-  sycl::marray<float, 2> m_f1, m_f2, m_f3;
+  sycl::vec<T, 2> d1, d2, d3;
+  sycl::vec<T, 2> f1, f2, f3;
+  sycl::marray<T, 2> m_d1, m_d2, m_d3;
+  sycl::marray<T, 2> m_f1, m_f2, m_f3;
 
-  d1 = sycl::double2(5.4, -6.3);
-  d2 = sycl::double2(-7.2, 8.1);
-  d3 = sycl::double2(1.0, -1.0);
+  d1 = sycl::vec<T, 2>(5.4, -6.3);
+  d2 = sycl::vec<T, 2>(-7.2, 8.1);
+  d3 = sycl::vec<T, 2>(1.0, -1.0);
 
-  f1 = sycl::float2(1.8, -2.7);
-  f2 = sycl::float2(-3.6, 4.5);
-  f3 = sycl::float2(1.0, -1.0);
+  f1 = sycl::vec<T, 2>(1.8, -2.7);
+  f2 = sycl::vec<T, 2>(-3.6, 4.5);
+  f3 = sycl::vec<T, 2>(1.0, -1.0);
 
   bool r = true;
   float expect[4] = {13.150000, 88.100000, 6.670001, 16.820000};
@@ -207,13 +250,13 @@ void kernel_mul_add(int *result) {
   auto a2 = syclcompat::cmul_add(f1, f2, f3);
   r = r && check(a2, expect + 2);
 
-  m_d1 = sycl::marray<double, 2>(5.4, -6.3);
-  m_d2 = sycl::marray<double, 2>(-7.2, 8.1);
-  m_d3 = sycl::marray<double, 2>(1.0, -1.0);
+  m_d1 = sycl::marray<T, 2>(5.4, -6.3);
+  m_d2 = sycl::marray<T, 2>(-7.2, 8.1);
+  m_d3 = sycl::marray<T, 2>(1.0, -1.0);
 
-  m_f1 = sycl::marray<float, 2>(1.8, -2.7);
-  m_f2 = sycl::marray<float, 2>(-3.6, 4.5);
-  m_f3 = sycl::marray<float, 2>(1.0, -1.0);
+  m_f1 = sycl::marray<T, 2>(1.8, -2.7);
+  m_f2 = sycl::marray<T, 2>(-3.6, 4.5);
+  m_f3 = sycl::marray<T, 2>(1.0, -1.0);
 
   auto a3 = syclcompat::cmul_add(m_d1, m_d2, m_d3);
   r = r && check(a3, expect);
@@ -241,9 +284,11 @@ void test_conj() {
   ComplexLauncher<kernel_conj>().launch();
 }
 
+template <typename T>
 void test_mul_add() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  ComplexLauncher<kernel_mul_add>().launch();
+  mul_add_groundtruth<T>();
+  ComplexLauncher<kernel_mul_add<T>>().launch();
 }
 
 int main() {
@@ -251,7 +296,7 @@ int main() {
   test_mul();
   test_div();
   test_conj();
-  test_mul_add();
+  INSTANTIATE_ALL_TYPES(fp_type_list, test_mul_add);
 
   return 0;
 }

From 5b0e0b563fd32b849bb8add4aa171dd76dd19721 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Tue, 1 Oct 2024 19:52:18 +0100
Subject: [PATCH 14/26] Enable relu for bfloat

---
 sycl/include/syclcompat/math.hpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index dbc58615ff8c8..be31fab6b372e 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -838,24 +838,20 @@ pow(const ValueT a, const ValueU b) {
 /// \param [in] a The input value
 /// \returns the relu saturation result
 template <typename ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
-                        ValueT>
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>, ValueT>
 relu(const ValueT a) {
   if (!detail::isnan(a) && a < ValueT(0))
     return ValueT(0);
   return a;
 }
 template <class ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>,
                         sycl::vec<ValueT, 2>>
 relu(const sycl::vec<ValueT, 2> a) {
   return {relu(a[0]), relu(a[1])};
 }
 template <class ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>,
                         sycl::marray<ValueT, 2>>
 relu(const sycl::marray<ValueT, 2> a) {
   return {relu(a[0]), relu(a[1])};

From 51860b35cdb738119a2fa0288cd497c55b281b1b Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Tue, 1 Oct 2024 19:54:11 +0100
Subject: [PATCH 15/26] Add bfloat16-container support to clamp

i.e. sycl::marray<bfloat16, N>, sycl::vec<bfloat16, N>
---
 sycl/include/syclcompat/math.hpp           | 22 ++++++++++++++++++++++
 sycl/test-e2e/syclcompat/math/math_ops.cpp | 20 ++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index be31fab6b372e..250742cab8a3c 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -81,6 +81,28 @@ inline sycl::ext::oneapi::bfloat16 clamp(sycl::ext::oneapi::bfloat16 val,
     return max_val;
   return val;
 }
+
+template <typename T, int Size>
+inline std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
+                        sycl::vec<T, Size>>
+clamp(sycl::vec<T, Size> val, sycl::vec<T, Size> min_val,
+      sycl::vec<T, Size> max_val) {
+  return [&val, &min_val, &max_val]<int... I>(std::integer_sequence<int, I...>) {
+    return sycl::vec<T, Size>{
+        clamp<sycl::ext::oneapi::bfloat16>(val[I], min_val[I], max_val[I])...};
+  }(std::make_integer_sequence<int, Size>{});
+}
+
+template <typename T, std::size_t Size>
+inline std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
+                        sycl::marray<T, Size>>
+clamp(sycl::marray<T, Size> val, sycl::marray<T, Size> min_val,
+      sycl::marray<T, Size> max_val) {
+  return [&val, &min_val, &max_val]<std::size_t... I>(std::index_sequence<I...>) {
+    return sycl::marray<T, Size>{
+        clamp<sycl::ext::oneapi::bfloat16>(val[I], min_val[I], max_val[I])...};
+  }(std::make_index_sequence<Size>{});
+}
 #endif
 
 template <typename VecT, class BinaryOperation, class = void>
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
index eac8bce92a1b6..f68f2937ee7db 100644
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_ops.cpp
@@ -317,6 +317,24 @@ template <typename ValueT> void test_clamp() {
       .template launch_test<clamp_kernel<ValueT>>(op3, expect3);
 }
 
+template <template <typename T, int Dim> typename ContainerT, typename ValueT> void test_container_clamp() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  constexpr syclcompat::dim3 grid{1};
+  constexpr syclcompat::dim3 threads{1};
+  ValueT op1 = static_cast<ValueT>(7);
+  ValueT expect1 = static_cast<ValueT>(7);
+
+  ValueT op2 = static_cast<ValueT>(MAX_CLAMP + 1);
+  ValueT expect2 = static_cast<ValueT>(MAX_CLAMP);
+
+  using ContT = ContainerT<ValueT, 2>;
+  const ContT op4{op1, op2};
+  const ContT expect4{expect1, expect2};
+  UnaryOpTestLauncher<ContT>(grid, threads)
+      .template launch_test<clamp_kernel<ContT>>(op4, expect4);
+}
+
 int main() {
   INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_max);
   INSTANTIATE_ALL_TYPES(value_type_list, test_syclcompat_min);
@@ -356,6 +374,8 @@ int main() {
   INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_isnan);
 
   INSTANTIATE_ALL_TYPES(value_type_list, test_clamp);
+  INSTANTIATE_ALL_CONTAINER_TYPES(vec_type_list, sycl::vec, test_container_clamp);
+  INSTANTIATE_ALL_CONTAINER_TYPES(marray_type_list, sycl::marray, test_container_clamp);
 
   return 0;
 }

From d16dd560cf6cc20cbd3c9455805ce4871fc603eb Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Tue, 1 Oct 2024 20:13:42 +0100
Subject: [PATCH 16/26] Add max & min bfloat16 support & tests

Also reimplement `fmax_nan` & `fmin_nan` in terms of improved max/min
---
 sycl/include/syclcompat/math.hpp           | 59 ++++++++++------------
 sycl/test-e2e/syclcompat/math/math_ops.cpp | 11 +++-
 2 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 250742cab8a3c..1c912f2786c18 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -732,7 +732,7 @@ cbrt(ValueT val) {
 // For floating-point types, `float` or `double` arguments are acceptable.
 // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
 // `std::int64_t` type arguments are acceptable.
-// sycl::half supported as well.
+// sycl::half supported as well, and sycl::ext::oneapi::bfloat16 if available.
 template <typename ValueT, typename ValueU>
 inline std::enable_if_t<std::is_integral_v<ValueT> &&
                             std::is_integral_v<ValueU>,
@@ -741,15 +741,23 @@ min(ValueT a, ValueU b) {
   return sycl::min(static_cast<std::common_type_t<ValueT, ValueU>>(a),
                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
 }
+
 template <typename ValueT, typename ValueU>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> &&
-                            std::is_floating_point_v<ValueU>,
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT> &&
+                            syclcompat::is_floating_point_v<ValueU>,
                         std::common_type_t<ValueT, ValueU>>
 min(ValueT a, ValueU b) {
-  return sycl::fmin(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  if constexpr (detail::support_bfloat16_math &&
+                std::is_same_v<std::common_type_t<ValueT, ValueU>,
+                               sycl::ext::oneapi::bfloat16>) {
+    return sycl::ext::oneapi::experimental::fmin(
+        static_cast<std::common_type_t<ValueT, ValueU>>(a),
+        static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  } else {
+    return sycl::fmin(static_cast<std::common_type_t<ValueT, ValueU>>(a),
+                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  }
 }
-inline sycl::half min(sycl::half a, sycl::half b) { return sycl::fmin(a, b); }
 
 template <typename ValueT, typename ValueU>
 inline std::enable_if_t<std::is_integral_v<ValueT> &&
@@ -760,14 +768,21 @@ max(ValueT a, ValueU b) {
                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
 }
 template <typename ValueT, typename ValueU>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> &&
-                            std::is_floating_point_v<ValueU>,
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT> &&
+                            syclcompat::is_floating_point_v<ValueU>,
                         std::common_type_t<ValueT, ValueU>>
 max(ValueT a, ValueU b) {
-  return sycl::fmax(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                    static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  if constexpr (detail::support_bfloat16_math &&
+                std::is_same_v<std::common_type_t<ValueT, ValueU>,
+                               sycl::ext::oneapi::bfloat16>) {
+    return sycl::ext::oneapi::experimental::fmax(
+        static_cast<std::common_type_t<ValueT, ValueU>>(a),
+        static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  } else {
+    return sycl::fmax(static_cast<std::common_type_t<ValueT, ValueU>>(a),
+                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
+  }
 }
-inline sycl::half max(sycl::half a, sycl::half b) { return sycl::fmax(a, b); }
 
 /// Performs 2 elements comparison and returns the bigger one. If either of
 /// inputs is NaN, then return NaN.
@@ -779,16 +794,7 @@ inline std::common_type_t<ValueT, ValueU> fmax_nan(const ValueT a,
                                                    const ValueU b) {
   if (detail::isnan(a) || detail::isnan(b))
     return NAN;
-  if constexpr (detail::support_bfloat16_math &&
-                std::is_same_v<std::common_type_t<ValueT, ValueU>,
-                               sycl::ext::oneapi::bfloat16>) {
-    return sycl::ext::oneapi::experimental::fmax(
-        static_cast<std::common_type_t<ValueT, ValueU>>(a),
-        static_cast<std::common_type_t<ValueT, ValueU>>(b));
-  } else {
-    return sycl::fmax(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
-  }
+  return syclcompat::max(a, b);
 }
 
 template <typename ValueT, typename ValueU>
@@ -813,16 +819,7 @@ inline std::common_type_t<ValueT, ValueU> fmin_nan(const ValueT a,
                                                    const ValueU b) {
   if (detail::isnan(a) || detail::isnan(b))
     return NAN;
-  if constexpr (detail::support_bfloat16_math &&
-                std::is_same_v<std::common_type_t<ValueT, ValueU>,
-                               sycl::ext::oneapi::bfloat16>) {
-    return sycl::ext::oneapi::experimental::fmin(
-        static_cast<std::common_type_t<ValueT, ValueU>>(a),
-        static_cast<std::common_type_t<ValueT, ValueU>>(b));
-  } else {
-    return sycl::fmin(static_cast<std::common_type_t<ValueT, ValueU>>(a),
-                      static_cast<std::common_type_t<ValueT, ValueU>>(b));
-  }
+  return syclcompat::min(a,b);
 }
 
 template <typename ValueT, typename ValueU>
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
index f68f2937ee7db..088230ed968d9 100644
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_ops.cpp
@@ -34,7 +34,7 @@
 template <typename ValueT, typename ValueU>
 inline void max_kernel(ValueT *a, ValueU *b,
                        std::common_type_t<ValueT, ValueU> *r) {
-  *r = syclcompat::max(*a, *b);
+  *r = syclcompat::max<ValueT, ValueU>(*a, *b);
 }
 
 template <typename ValueT, typename ValueU = ValueT>
@@ -54,7 +54,7 @@ void test_syclcompat_max() {
 template <typename ValueT, typename ValueU>
 inline void min_kernel(ValueT *a, ValueU *b,
                        std::common_type_t<ValueT, ValueU> *r) {
-  *r = syclcompat::min(*a, *b);
+  *r = syclcompat::min<ValueT,ValueU>(*a, *b);
 }
 
 template <typename ValueT, typename ValueU = ValueT>
@@ -342,8 +342,15 @@ int main() {
   // Basic testing of deduction to avoid combinatorial explosion
   test_syclcompat_max<double, float>();
   test_syclcompat_max<long, int>();
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+  test_syclcompat_max<sycl::ext::oneapi::bfloat16, float>();
+#endif
+
   test_syclcompat_min<double, float>();
   test_syclcompat_min<long, int>();
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+  test_syclcompat_min<sycl::ext::oneapi::bfloat16, float>();
+#endif
 
   INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_fmin_nan);
   INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_container_syclcompat_fmin_nan);

From 05b0748c4c2741aeae859dc2a121078c8a36d735 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 2 Oct 2024 09:54:55 +0100
Subject: [PATCH 17/26] Generalize compare_mask & unordered_compare_mask

- compare_mask & unordered_compare_mask support sycl::marray
- sycl::marray tests for all `compare` APIs
---
 sycl/doc/syclcompat/README.md                 | 11 ++---
 sycl/include/syclcompat/math.hpp              | 12 ++---
 .../test-e2e/syclcompat/math/math_compare.cpp | 48 ++++++++++++-------
 3 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
index 25f461e32d18f..3d26f96e4fe06 100644
--- a/sycl/doc/syclcompat/README.md
+++ b/sycl/doc/syclcompat/README.md
@@ -1856,14 +1856,13 @@ unordered_compare_both(const ValueT a, const ValueT b,
                        const BinaryOperation binary_op);
 
 template <typename ValueT, class BinaryOperation>
-inline unsigned compare_mask(const sycl::vec<ValueT, 2> a,
-                             const sycl::vec<ValueT, 2> b,
-                             const BinaryOperation binary_op);
+inline std::enable_if_t<ValueT::size() == 2, unsigned>
+compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op);
 
 template <typename ValueT, class BinaryOperation>
-inline unsigned unordered_compare_mask(const sycl::vec<ValueT, 2> a,
-                                       const sycl::vec<ValueT, 2> b,
-                                       const BinaryOperation binary_op);
+inline std::enable_if_t<ValueT::size() == 2, unsigned>
+unordered_compare_mask(const ValueT a, const ValueT b,
+                       const BinaryOperation binary_op);
 
 template <typename S, typename T> inline T vectorized_max(T a, T b);
 
diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 1c912f2786c18..83ba826596ea0 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -32,6 +32,7 @@
 #pragma once
 
 #include <sycl/feature_test.hpp>
+#include <type_traits>
 
 // TODO(syclcompat-lib-reviewers): this should not be required
 #ifndef SYCL_EXT_ONEAPI_COMPLEX
@@ -578,9 +579,8 @@ unordered_compare_both(const ValueT a, const ValueT b,
 /// \param [in] binary_op functor that implements the binary operation
 /// \returns the comparison result
 template <typename ValueT, class BinaryOperation>
-inline unsigned compare_mask(const sycl::vec<ValueT, 2> a,
-                             const sycl::vec<ValueT, 2> b,
-                             const BinaryOperation binary_op) {
+inline std::enable_if_t<ValueT::size() == 2, unsigned>
+compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op) {
   // Since compare returns 0 or 1, -compare will be 0x00000000 or 0xFFFFFFFF
   return ((-compare(a[0], b[0], binary_op)) << 16) |
          ((-compare(a[1], b[1], binary_op)) & 0xFFFF);
@@ -594,9 +594,9 @@ inline unsigned compare_mask(const sycl::vec<ValueT, 2> a,
 /// \param [in] binary_op functor that implements the binary operation
 /// \returns the comparison result
 template <typename ValueT, class BinaryOperation>
-inline unsigned unordered_compare_mask(const sycl::vec<ValueT, 2> a,
-                                       const sycl::vec<ValueT, 2> b,
-                                       const BinaryOperation binary_op) {
+inline std::enable_if_t<ValueT::size() == 2, unsigned>
+unordered_compare_mask(const ValueT a, const ValueT b,
+                       const BinaryOperation binary_op) {
   return ((-unordered_compare(a[0], b[0], binary_op)) << 16) |
          ((-unordered_compare(a[1], b[1], binary_op)) & 0xFFFF);
 }
diff --git a/sycl/test-e2e/syclcompat/math/math_compare.cpp b/sycl/test-e2e/syclcompat/math/math_compare.cpp
index 11afc7420dc6d..0f77160a564e7 100644
--- a/sycl/test-e2e/syclcompat/math/math_compare.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_compare.cpp
@@ -96,9 +96,10 @@ void compare_not_equal_vec_kernel(Container *a, Container *b, Container *r) {
   *r = syclcompat::compare(*a, *b, std::not_equal_to<>());
 }
 
-template <typename ValueT> void test_compare_vec() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_compare_vec() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
@@ -177,9 +178,10 @@ void unordered_compare_not_equal_vec_kernel(Container *a, Container *b,
   *r = syclcompat::unordered_compare(*a, *b, std::not_equal_to<>());
 }
 
-template <typename ValueT> void test_unordered_compare_vec() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_unordered_compare_vec() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
@@ -207,9 +209,10 @@ void compare_both_kernel(Container *a, Container *b, bool *r) {
   *r = syclcompat::compare_both(*a, *b, std::equal_to<>());
 }
 
-template <typename ValueT> void test_compare_both() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_compare_both() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
@@ -236,9 +239,10 @@ void unordered_compare_both_kernel(Container *a, Container *b, bool *r) {
   *r = syclcompat::unordered_compare_both(*a, *b, std::equal_to<>());
 }
 
-template <typename ValueT> void test_unordered_compare_both() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_unordered_compare_both() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
@@ -266,9 +270,10 @@ void compare_mask_kernel(Container *a, Container *b, unsigned *r) {
   *r = syclcompat::compare_mask(*a, *b, std::equal_to<>());
 }
 
-template <typename ValueT> void test_compare_mask() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_compare_mask() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
@@ -314,9 +319,10 @@ void unordered_compare_mask_kernel(Container *a, Container *b, unsigned *r) {
   *r = syclcompat::unordered_compare_mask(*a, *b, std::equal_to<>());
 }
 
-template <typename ValueT> void test_unordered_compare_mask() {
+template <template <typename T, int Dim> typename ContainerT,
+typename ValueT> void test_unordered_compare_mask() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  using Container = sycl::vec<ValueT, 2>;
+  using Container = ContainerT<ValueT, 2>;
 
   constexpr syclcompat::dim3 grid{1};
   constexpr syclcompat::dim3 threads{1};
@@ -360,12 +366,18 @@ template <typename ValueT> void test_unordered_compare_mask() {
 int main() {
   INSTANTIATE_ALL_TYPES(fp_type_list, test_compare);
   INSTANTIATE_ALL_TYPES(fp_type_list, test_unordered_compare);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_compare_vec);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_unordered_compare_vec);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_compare_both);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_unordered_compare_both);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_compare_mask);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_unordered_compare_mask);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_compare_vec);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_compare_vec);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_unordered_compare_vec);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_unordered_compare_vec);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_compare_both);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_compare_both);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_unordered_compare_both);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_unordered_compare_both);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_compare_mask);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_compare_mask);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_unordered_compare_mask);
+  INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_unordered_compare_mask);
 
   return 0;
 }

From fd9ebf4a225209e584ab05b59a7c46b318f8324f Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 2 Oct 2024 12:05:06 +0100
Subject: [PATCH 18/26] Tidy a comment

---
 sycl/include/syclcompat/math.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 83ba826596ea0..d4ea488e4e8a8 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -843,10 +843,10 @@ inline typename std::enable_if_t<std::is_floating_point_v<ValueT>, ValueT>
 pow(const ValueT a, const ValueU b) {
   return sycl::pow(a, static_cast<ValueT>(b));
 }
-
-// TODO: calling pow with non-floating point values is currently defaulting to
-// double, which fails on devices without aspect::fp64. This has to be properly
-// documented, and maybe changed to support all devices.
+// TODO(syclcompat-lib-reviewers): calling pow with non-floating point values
+// is currently defaulting to double, which fails on devices without
+// aspect::fp64. This has to be properly documented, and maybe changed to
+// support all devices.
 template <typename ValueT, typename ValueU>
 inline typename std::enable_if_t<!std::is_floating_point_v<ValueT>, double>
 pow(const ValueT a, const ValueU b) {

From e5f92317b1739d628f4132f8be663ead98cea698 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 2 Oct 2024 12:14:28 +0100
Subject: [PATCH 19/26] Add bfloat16 support to cbrt

---
 sycl/doc/syclcompat/README.md    |  8 ++------
 sycl/include/syclcompat/math.hpp | 10 ++++++----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
index 3d26f96e4fe06..c486828f5ac59 100644
--- a/sycl/doc/syclcompat/README.md
+++ b/sycl/doc/syclcompat/README.md
@@ -1736,12 +1736,8 @@ inline std::enable_if_t<ValueT::size() == 2, ValueT> isnan(const ValueT a);
 
 // cbrt function wrapper.
 template <typename ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
-                        ValueT>
-cbrt(ValueT val) {
-  return sycl::cbrt(static_cast<ValueT>(val));
-}
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>, ValueT>
+cbrt(ValueT val);
 
 // For floating-point types, `float` or `double` arguments are acceptable.
 // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index d4ea488e4e8a8..d7804eda34107 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -721,11 +721,13 @@ inline std::enable_if_t<ValueT::size() == 2, ValueT> isnan(const ValueT a) {
 
 /// cbrt function wrapper.
 template <typename ValueT>
-inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
-                            std::is_same_v<sycl::half, ValueT>,
-                        ValueT>
+inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>, ValueT>
 cbrt(ValueT val) {
-  return sycl::cbrt(static_cast<ValueT>(val));
+  if constexpr (std::is_same_v<sycl::ext::oneapi::bfloat16, ValueT>) {
+    return static_cast<ValueT>(sycl::cbrt(static_cast<float>(val)));
+  } else {
+    return sycl::cbrt(static_cast<ValueT>(val));
+  }
 }
 
 // min/max function overloads.

From f95dc85e05aa2f1b064af5b011de79dbd262b593 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 2 Oct 2024 12:18:05 +0100
Subject: [PATCH 20/26] Revert "Add cmul_add<bfloat> & draft test"

This reverts commit 55e95ad93b8fd7398f7d53d98cbc1fada3e407c0.
---
 sycl/include/syclcompat/math.hpp              | 22 -----
 .../test-e2e/syclcompat/math/math_complex.cpp | 81 +++++--------------
 2 files changed, 18 insertions(+), 85 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index d7804eda34107..e80989684503c 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -946,28 +946,6 @@ inline sycl::marray<ValueT, 2> cmul_add(const sycl::marray<ValueT, 2> a,
   t = t * u + v;
   return sycl::marray<ValueT, 2>{t.real(), t.imag()};
 }
-template <>
-inline sycl::vec<sycl::ext::oneapi::bfloat16, 2>
-cmul_add(const sycl::vec<sycl::ext::oneapi::bfloat16, 2> a,
-         const sycl::vec<sycl::ext::oneapi::bfloat16, 2> b,
-         const sycl::vec<sycl::ext::oneapi::bfloat16, 2> c) {
-  sycl::ext::oneapi::experimental::complex<float> t(a[0], a[1]);
-  sycl::ext::oneapi::experimental::complex<float> u(b[0], b[1]);
-  sycl::ext::oneapi::experimental::complex<float> v(c[0], c[1]);
-  t = t * u + v;
-  return sycl::vec<sycl::ext::oneapi::bfloat16, 2>{t.real(), t.imag()};
-}
-template <>
-inline sycl::marray<sycl::ext::oneapi::bfloat16, 2>
-cmul_add(const sycl::marray<sycl::ext::oneapi::bfloat16, 2> a,
-         const sycl::marray<sycl::ext::oneapi::bfloat16, 2> b,
-         const sycl::marray<sycl::ext::oneapi::bfloat16, 2> c) {
-  sycl::ext::oneapi::experimental::complex<float> t(a[0], a[1]);
-  sycl::ext::oneapi::experimental::complex<float> u(b[0], b[1]);
-  sycl::ext::oneapi::experimental::complex<float> v(c[0], c[1]);
-  t = t * u + v;
-  return sycl::marray<sycl::ext::oneapi::bfloat16, 2>{t.real(), t.imag()};
-}
 
 /// A sycl::abs wrapper functors.
 struct abs {
diff --git a/sycl/test-e2e/syclcompat/math/math_complex.cpp b/sycl/test-e2e/syclcompat/math/math_complex.cpp
index 8207d1222bda5..27e2bf8af8f71 100644
--- a/sycl/test-e2e/syclcompat/math/math_complex.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_complex.cpp
@@ -184,62 +184,19 @@ void kernel_mul(int *result) {
   *result = r;
 }
 
-template <typename T>
-std::array<T,2> complex_mul(std::array<T, 2> a, std::array<T, 2> b){
-  std::array<T, 2> result;
-  result[0] = (a[0] * b[0]) - (a[1] * b[1]);
-  result[1] = (a[0] * b[1]) + (a[1] * b[0]);
-  return result;
-}
-
-template <typename T>
-std::array<T,2> complex_add(std::array<T, 2> a, std::array<T, 2> b){
-  return {a[0] + b[0], a[1] + b[1]};
-}
-
-template <typename T> void mul_add_groundtruth() {
-
-  using complex_t = std::complex<double>;
-  using arr_t = std::array<T, 2>;
-
-  arr_t d1 = arr_t({static_cast<T>(5.4), static_cast<T>(-6.3)});
-  arr_t d2 = arr_t({static_cast<T>(-7.2), static_cast<T>(8.1)});
-  arr_t d3 = arr_t({static_cast<T>(1.0), static_cast<T>(-1.0)});
-
-  arr_t f1 = arr_t({static_cast<T>(1.8), static_cast<T>(-2.7)});
-  arr_t f2 = arr_t({static_cast<T>(-3.6), static_cast<T>(4.5)});
-  arr_t f3 = arr_t({static_cast<T>(1.0), static_cast<T>(-1.0)});
-
-  arr_t ra1 = complex_add(complex_mul(d1, d2), d3);
-  arr_t ra2 = complex_add(complex_mul(f1, f2), f3);
-
-  T expect[4] = {13.150000, 88.100000, 6.670001, 16.820000};
-
-  // complex_t r1 = d1 * d2 + d3;
-  // complex_t r2 = f1 * f2 + f3;
-
-  std::cout << "r1: " << static_cast<T>(ra1[0]) << ", "
-            << static_cast<T>(ra1[1]) << std::endl;
-  std::cout << "Expect 1: " << expect[0] << ", " << expect[1] << std::endl;
-  std::cout << "r2: " << static_cast<T>(ra2[0]) << ", "
-            << static_cast<T>(ra2[1]) << std::endl;
-  std::cout << "Expect 2: " << expect[2] << ", " << expect[3] << std::endl;
-}
-
-template <typename T>
 void kernel_mul_add(int *result) {
-  sycl::vec<T, 2> d1, d2, d3;
-  sycl::vec<T, 2> f1, f2, f3;
-  sycl::marray<T, 2> m_d1, m_d2, m_d3;
-  sycl::marray<T, 2> m_f1, m_f2, m_f3;
+  sycl::double2 d1, d2, d3;
+  sycl::float2 f1, f2, f3;
+  sycl::marray<double, 2> m_d1, m_d2, m_d3;
+  sycl::marray<float, 2> m_f1, m_f2, m_f3;
 
-  d1 = sycl::vec<T, 2>(5.4, -6.3);
-  d2 = sycl::vec<T, 2>(-7.2, 8.1);
-  d3 = sycl::vec<T, 2>(1.0, -1.0);
+  d1 = sycl::double2(5.4, -6.3);
+  d2 = sycl::double2(-7.2, 8.1);
+  d3 = sycl::double2(1.0, -1.0);
 
-  f1 = sycl::vec<T, 2>(1.8, -2.7);
-  f2 = sycl::vec<T, 2>(-3.6, 4.5);
-  f3 = sycl::vec<T, 2>(1.0, -1.0);
+  f1 = sycl::float2(1.8, -2.7);
+  f2 = sycl::float2(-3.6, 4.5);
+  f3 = sycl::float2(1.0, -1.0);
 
   bool r = true;
   float expect[4] = {13.150000, 88.100000, 6.670001, 16.820000};
@@ -250,13 +207,13 @@ void kernel_mul_add(int *result) {
   auto a2 = syclcompat::cmul_add(f1, f2, f3);
   r = r && check(a2, expect + 2);
 
-  m_d1 = sycl::marray<T, 2>(5.4, -6.3);
-  m_d2 = sycl::marray<T, 2>(-7.2, 8.1);
-  m_d3 = sycl::marray<T, 2>(1.0, -1.0);
+  m_d1 = sycl::marray<double, 2>(5.4, -6.3);
+  m_d2 = sycl::marray<double, 2>(-7.2, 8.1);
+  m_d3 = sycl::marray<double, 2>(1.0, -1.0);
 
-  m_f1 = sycl::marray<T, 2>(1.8, -2.7);
-  m_f2 = sycl::marray<T, 2>(-3.6, 4.5);
-  m_f3 = sycl::marray<T, 2>(1.0, -1.0);
+  m_f1 = sycl::marray<float, 2>(1.8, -2.7);
+  m_f2 = sycl::marray<float, 2>(-3.6, 4.5);
+  m_f3 = sycl::marray<float, 2>(1.0, -1.0);
 
   auto a3 = syclcompat::cmul_add(m_d1, m_d2, m_d3);
   r = r && check(a3, expect);
@@ -284,11 +241,9 @@ void test_conj() {
   ComplexLauncher<kernel_conj>().launch();
 }
 
-template <typename T>
 void test_mul_add() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
-  mul_add_groundtruth<T>();
-  ComplexLauncher<kernel_mul_add<T>>().launch();
+  ComplexLauncher<kernel_mul_add>().launch();
 }
 
 int main() {
@@ -296,7 +251,7 @@ int main() {
   test_mul();
   test_div();
   test_conj();
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_mul_add);
+  test_mul_add();
 
   return 0;
 }

From 4da9833fe8adf9294b53363bddd9882ebae02b54 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Tue, 15 Oct 2024 11:59:15 +0100
Subject: [PATCH 21/26] Move std::common_type specialization to traits.hpp

and remove stale comments
---
 sycl/include/syclcompat/math.hpp   | 27 ---------------------------
 sycl/include/syclcompat/traits.hpp | 23 +++++++++++++++++++++++
 2 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index e80989684503c..6858812e10efe 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -2302,30 +2302,3 @@ inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c,
 }
 
 } // namespace syclcompat
-
-// Specialize std::common_type for bfloat16
-// Semantics here match bfloat16.hpp operator overloads (all mixed type math
-// ops return bfloat16)
-// TODO(syclcompat-lib-reviewers) Move this to bfloat extension
-namespace std {
-template <> struct common_type<sycl::ext::oneapi::bfloat16> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-
-template <>
-struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
-  using type = sycl::ext::oneapi::bfloat16;
-};
-
-template <typename T> struct common_type<sycl::ext::oneapi::bfloat16, T> {
-  using type = sycl::ext::oneapi::bfloat16; // std::common_type_t<float, T>;  //
-                                            // sycl::ext::oneapi::bfloat16
-                                            // promotes to float
-};
-
-template <typename T> struct common_type<T, sycl::ext::oneapi::bfloat16> {
-  using type = sycl::ext::oneapi::bfloat16; // std::common_type_t<T, float>;  //
-                                            // sycl::ext::oneapi::bfloat16
-                                            // promotes to float
-};
-} // namespace std
diff --git a/sycl/include/syclcompat/traits.hpp b/sycl/include/syclcompat/traits.hpp
index 435b0df98a32d..ae267af021fa7 100644
--- a/sycl/include/syclcompat/traits.hpp
+++ b/sycl/include/syclcompat/traits.hpp
@@ -268,3 +268,26 @@ template <typename T>
 inline constexpr bool is_floating_point_v = is_floating_point<T>::value;
 
 } // namespace syclcompat
+
+// Specialize std::common_type for bfloat16
+// Semantics here match bfloat16.hpp operator overloads (all mixed type math
+// ops return bfloat16)
+// TODO(syclcompat-lib-reviewers): Move this to the bfloat16 extension
+namespace std {
+template <> struct common_type<sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <>
+struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <typename T> struct common_type<sycl::ext::oneapi::bfloat16, T> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <typename T> struct common_type<T, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+} // namespace std

From ac2b4e97bc1f5a8baa135ae94503900b342ddf5c Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 16 Oct 2024 12:38:02 +0100
Subject: [PATCH 22/26] Document std::common_type_t<bfloat16,...>

---
 sycl/doc/syclcompat/README.md | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
index ccce6fbf54e6d..ba645c707a9ba 100644
--- a/sycl/doc/syclcompat/README.md
+++ b/sycl/doc/syclcompat/README.md
@@ -1728,7 +1728,31 @@ signed otherwise.
 Various maths functions are defined operate on any floating point types.
 `syclcompat::is_floating_point_v` extends the standard library's
 `std::is_floating_point_v` to include `sycl::half` and, where available,
-`sycl::ext::oneapi::bfloat16`.
+`sycl::ext::oneapi::bfloat16`. The current version of SYCLcompat also provides
+a specialization of `std::common_type` for `sycl::ext::oneapi::bfloat16`,
+though this will be moved to the `sycl_ext_oneapi_bfloat16` extension in
+future.
+
+```cpp
+namespace std {
+template <> struct common_type<sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <>
+struct common_type<sycl::ext::oneapi::bfloat16, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <typename T> struct common_type<sycl::ext::oneapi::bfloat16, T> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+
+template <typename T> struct common_type<T, sycl::ext::oneapi::bfloat16> {
+  using type = sycl::ext::oneapi::bfloat16;
+};
+} // namespace std
+```
 
 ```cpp
 namespace syclcompat{

From b0303bbd566fcbcb206c4ee3590ae00c54d0cd9e Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 16 Oct 2024 12:39:05 +0100
Subject: [PATCH 23/26] Revert unneeded cbrt<bfloat16> support

---
 sycl/doc/syclcompat/README.md              |  4 +++-
 sycl/include/syclcompat/math.hpp           | 10 ++++------
 sycl/test-e2e/syclcompat/common.hpp        | 10 ++++++++--
 sycl/test-e2e/syclcompat/math/math_ops.cpp |  2 +-
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
index ba645c707a9ba..c3a263d2d072a 100644
--- a/sycl/doc/syclcompat/README.md
+++ b/sycl/doc/syclcompat/README.md
@@ -1795,7 +1795,9 @@ inline std::enable_if_t<ValueT::size() == 2, ValueT> isnan(const ValueT a);
 
 // cbrt function wrapper.
 template <typename ValueT>
-inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>, ValueT>
+inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
+                            std::is_same_v<ValueT, sycl::half>,
+                        ValueT>
 cbrt(ValueT val);
 
 // For floating-point types, `float` or `double` arguments are acceptable.
diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 6858812e10efe..f65ad54222846 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -721,13 +721,11 @@ inline std::enable_if_t<ValueT::size() == 2, ValueT> isnan(const ValueT a) {
 
 /// cbrt function wrapper.
 template <typename ValueT>
-inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT>, ValueT>
+inline std::enable_if_t<std::is_floating_point_v<ValueT> ||
+                            std::is_same_v<ValueT, sycl::half>,
+                        ValueT>
 cbrt(ValueT val) {
-  if constexpr (std::is_same_v<sycl::ext::oneapi::bfloat16, ValueT>) {
-    return static_cast<ValueT>(sycl::cbrt(static_cast<float>(val)));
-  } else {
-    return sycl::cbrt(static_cast<ValueT>(val));
-  }
+  return sycl::cbrt(static_cast<ValueT>(val));
 }
 
 // min/max function overloads.
diff --git a/sycl/test-e2e/syclcompat/common.hpp b/sycl/test-e2e/syclcompat/common.hpp
index 7e3e5e7b3d70c..6defae338976c 100644
--- a/sycl/test-e2e/syclcompat/common.hpp
+++ b/sycl/test-e2e/syclcompat/common.hpp
@@ -63,8 +63,14 @@ using value_type_list =
 #endif
 >;
 
-using fp_type_list =
-    std::tuple<float, double, sycl::half, sycl::ext::oneapi::bfloat16>;
+using fp_type_list_no_bfloat16 = std::tuple<float, double, sycl::half>;
+
+using fp_type_list = std::tuple<float, double, sycl::half
+
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+                ,sycl::ext::oneapi::bfloat16
+#endif
+>;
 
 using marray_type_list =
     std::tuple<char, signed char, short, int, long, long long, unsigned char,
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
index 088230ed968d9..d52d9c60d8ded 100644
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_ops.cpp
@@ -375,7 +375,7 @@ int main() {
   test_syclcompat_pow<double, int>();
 
   INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_relu);
-  INSTANTIATE_ALL_TYPES(fp_type_list, test_syclcompat_cbrt);
+  INSTANTIATE_ALL_TYPES(fp_type_list_no_bfloat16, test_syclcompat_cbrt);
 
   INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::vec, test_isnan);
   INSTANTIATE_ALL_CONTAINER_TYPES(fp_type_list, sycl::marray, test_isnan);

From fc1176442e748ae2103894d13418b6aff9864237 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 16 Oct 2024 12:39:20 +0100
Subject: [PATCH 24/26] Review fix includes

---
 sycl/include/syclcompat/traits.hpp  | 2 +-
 sycl/test-e2e/syclcompat/common.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/include/syclcompat/traits.hpp b/sycl/include/syclcompat/traits.hpp
index ae267af021fa7..7ed4d765251bc 100644
--- a/sycl/include/syclcompat/traits.hpp
+++ b/sycl/include/syclcompat/traits.hpp
@@ -24,7 +24,7 @@
 
 #include <sycl/feature_test.hpp>
 #ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-#include "sycl/ext/oneapi/bfloat16.hpp"
+#include <sycl/ext/oneapi/bfloat16.hpp>
 #endif
 #include <cstddef>
 #include <sycl/ext/oneapi/properties/properties.hpp>
diff --git a/sycl/test-e2e/syclcompat/common.hpp b/sycl/test-e2e/syclcompat/common.hpp
index 6defae338976c..ff840c98209bd 100644
--- a/sycl/test-e2e/syclcompat/common.hpp
+++ b/sycl/test-e2e/syclcompat/common.hpp
@@ -24,7 +24,7 @@
 
 #include <sycl/feature_test.hpp>
 #ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-#include "sycl/ext/oneapi/bfloat16.hpp"
+#include <sycl/ext/oneapi/bfloat16.hpp>
 #endif
 #include <sycl/half_type.hpp>
 #include <tuple>

From f21f62e3975efd06bacf0a0a575160d5a21d3b8a Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Wed, 16 Oct 2024 12:40:39 +0100
Subject: [PATCH 25/26] Assert bfloat16 math support in `isnan`, `max`, and
 `min`

---
 sycl/include/syclcompat/math.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index f65ad54222846..a3ee2b2085788 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -254,8 +254,8 @@ inline constexpr RetT extend_vbinary4(AT a, BT b, RetT c,
 }
 
 template <typename ValueT> inline bool isnan(const ValueT a) {
-  if constexpr (detail::support_bfloat16_math &&
-                std::is_same_v<ValueT, sycl::ext::oneapi::bfloat16>) {
+  if constexpr (std::is_same_v<ValueT, sycl::ext::oneapi::bfloat16>) {
+    static_assert(detail::support_bfloat16_math);
     return sycl::ext::oneapi::experimental::isnan(a);
   } else {
     return sycl::isnan(a);
@@ -747,9 +747,9 @@ inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT> &&
                             syclcompat::is_floating_point_v<ValueU>,
                         std::common_type_t<ValueT, ValueU>>
 min(ValueT a, ValueU b) {
-  if constexpr (detail::support_bfloat16_math &&
-                std::is_same_v<std::common_type_t<ValueT, ValueU>,
+  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
                                sycl::ext::oneapi::bfloat16>) {
+    static_assert(detail::support_bfloat16_math);
     return sycl::ext::oneapi::experimental::fmin(
         static_cast<std::common_type_t<ValueT, ValueU>>(a),
         static_cast<std::common_type_t<ValueT, ValueU>>(b));
@@ -772,9 +772,9 @@ inline std::enable_if_t<syclcompat::is_floating_point_v<ValueT> &&
                             syclcompat::is_floating_point_v<ValueU>,
                         std::common_type_t<ValueT, ValueU>>
 max(ValueT a, ValueU b) {
-  if constexpr (detail::support_bfloat16_math &&
-                std::is_same_v<std::common_type_t<ValueT, ValueU>,
+  if constexpr (std::is_same_v<std::common_type_t<ValueT, ValueU>,
                                sycl::ext::oneapi::bfloat16>) {
+    static_assert(detail::support_bfloat16_math);
     return sycl::ext::oneapi::experimental::fmax(
         static_cast<std::common_type_t<ValueT, ValueU>>(a),
         static_cast<std::common_type_t<ValueT, ValueU>>(b));

From 20012b36a950c5b93ed70264d580127a30ceb053 Mon Sep 17 00:00:00 2001
From: Joe Todd <joe.todd@codeplay.com>
Date: Thu, 17 Oct 2024 10:49:10 +0100
Subject: [PATCH 26/26] Fix incorrect local memory usage in tests

---
 .../syclcompat/launch/launch_policy_lmem.cpp  | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp b/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
index a22d54474d9ed..41f9a8cbee747 100644
--- a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
+++ b/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp
@@ -58,14 +58,19 @@ void dynamic_local_mem_typed_kernel(T *data, char *local_mem) {
   constexpr size_t num_elements = memsize / sizeof(T);
   T *typed_local_mem = reinterpret_cast<T *>(local_mem);
 
-  const int id =
-      sycl::ext::oneapi::this_work_item::get_nd_item<3>().get_global_linear_id();
-  if (id < num_elements) {
-    typed_local_mem[id] = static_cast<T>(id);
-  }
-  syclcompat::wg_barrier();
-  if (id < num_elements) {
-    data[id] = typed_local_mem[num_elements - id - 1];
+  const int local_id =
+      sycl::ext::oneapi::this_work_item::get_nd_item<3>().get_local_linear_id();
+  const int group_id =
+      sycl::ext::oneapi::this_work_item::get_nd_item<3>().get_group_linear_id();
+  // Only operate in first work-group
+  if (group_id == 0) {
+    if (local_id < num_elements) {
+      typed_local_mem[local_id] = static_cast<T>(local_id);
+    }
+    syclcompat::wg_barrier();
+    if (local_id < num_elements) {
+      data[local_id] = typed_local_mem[num_elements - local_id - 1];
+    }
   }
 };