From f6bb8a34faf22d72d85bb2c4167cd4805ecba750 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Wed, 20 Jan 2021 09:45:52 -0800
Subject: [PATCH 01/34] [SYCL] Localize variables declared in inline asms.

Signed-off-by: rdeodhar <rajiv.deodhar@intel.com>
---
 SYCL/InlineAsm/asm_if.cpp     | 4 +++-
 SYCL/InlineAsm/asm_loop.cpp   | 4 +++-
 SYCL/InlineAsm/asm_switch.cpp | 4 +++-
 3 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/SYCL/InlineAsm/asm_if.cpp b/SYCL/InlineAsm/asm_if.cpp
index 6b2ba16d65..2212dd83e0 100644
--- a/SYCL/InlineAsm/asm_if.cpp
+++ b/SYCL/InlineAsm/asm_if.cpp
@@ -21,9 +21,11 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
     ](cl::sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] {
           int Output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
-          asm volatile(".decl P1 v_type=P num_elts=1\n"
+          asm volatile("{\n"
+                       ".decl P1 v_type=P num_elts=1\n"
                        "cmp.eq (M1_NM, 8) P1 %1(0,0)<0;1,0> 0x0:b\n"
                        "(P1) sel (M1_NM, 8) %0(0,0)<1> 0x7:d 0x8:d"
+                       "}\n"
                        : "=rw"(Output)
                        : "rw"(switchField));
 
diff --git a/SYCL/InlineAsm/asm_loop.cpp b/SYCL/InlineAsm/asm_loop.cpp
index 67c2b82f1e..b3647929eb 100644
--- a/SYCL/InlineAsm/asm_loop.cpp
+++ b/SYCL/InlineAsm/asm_loop.cpp
@@ -29,7 +29,8 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
         cl::sycl::range<1>{this->getOutputBufferSize()}, [=
     ](cl::sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] {
 #if defined(__SYCL_DEVICE_ONLY__)
-          asm volatile(".decl P1 v_type=P num_elts=8\n"
+          asm volatile("{\n"
+                       ".decl P1 v_type=P num_elts=8\n"
                        ".decl P2 v_type=P num_elts=8\n"
                        ".decl temp v_type=G type=d num_elts=8 align=dword\n"
                        "mov (M1, 8) %0(0, 0)<1> 0x0:d\n"
@@ -42,6 +43,7 @@ struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
                        "cmp.lt (M1, 8) P2 temp(0,0)<0;8,1> %1(0,0)<0;8,1>\n"
                        "(P2) goto (M1, 8) label1\n"
                        "label0:"
+                       "}\n"
                        : "+rw"(C[wiID])
                        : "rw"(A[wiID]), "rw"(B[wiID]));
 #else
diff --git a/SYCL/InlineAsm/asm_switch.cpp b/SYCL/InlineAsm/asm_switch.cpp
index 5f8a732680..8163e60948 100644
--- a/SYCL/InlineAsm/asm_switch.cpp
+++ b/SYCL/InlineAsm/asm_switch.cpp
@@ -21,7 +21,8 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
     ](cl::sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] {
           int Output = 0;
 #if defined(__SYCL_DEVICE_ONLY__)
-          asm volatile(".decl P1 v_type=P num_elts=1\n"
+          asm volatile("{\n"
+                       ".decl P1 v_type=P num_elts=1\n"
                        ".decl P2 v_type=P num_elts=1\n"
                        ".decl P3 v_type=P num_elts=1\n"
                        "cmp.ne (M1_NM, 8) P1 %1(0,0)<0;1,0> 0x0:d\n"
@@ -37,6 +38,7 @@ template <typename T = DataType> struct KernelFunctor : WithOutputBuffer<T> {
                        "(P3) goto (M1, 1) label2\n"
                        "mov (M1, 8) %0(0,0)<1> 0x7:d\n"
                        "label2:"
+                       "}\n"
                        : "=rw"(Output)
                        : "rw"(switchField));
 

From bb523e8bd0648c7f048385a5d735e297c13e6938 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Mon, 25 Jan 2021 15:43:49 -0800
Subject: [PATCH 02/34] [SYCL] Test for disabling range rounding.

Signed-off-by: rdeodhar <rajiv.deodhar@intel.com>
---
 .../parallel_for_disable_range_roundup.cpp    | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100755 SYCL/Basic/parallel_for_disable_range_roundup.cpp

diff --git a/SYCL/Basic/parallel_for_disable_range_roundup.cpp b/SYCL/Basic/parallel_for_disable_range_roundup.cpp
new file mode 100755
index 0000000000..d8d168a5f4
--- /dev/null
+++ b/SYCL/Basic/parallel_for_disable_range_roundup.cpp
@@ -0,0 +1,62 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %t.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-DISABLED
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -sycl-std=2017 %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %t.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-DISABLED
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -sycl-std=2020 %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %t.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-ENABLED
+
+#include <CL/sycl.hpp>
+
+using namespace sycl;
+
+range<1> Range1 = {0};
+
+void check(const char *msg, size_t v, size_t ref) {
+  std::cout << msg << v << std::endl;
+  assert(v == ref);
+}
+
+int try_rounding_off(size_t size) {
+  range<1> Size{size};
+  int Counter = 0;
+  {
+    buffer<range<1>, 1> BufRange(&Range1, 1);
+    buffer<int, 1> BufCounter(&Counter, 1);
+    queue myQueue;
+
+    std::cout << "Run parallel_for" << std::endl;
+    myQueue.submit([&](handler &cgh) {
+      auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
+      auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
+      cgh.parallel_for<class PF_init_item1>(Size, [=](item<1> ITEM) {
+        AccCounter[0].fetch_add(1);
+        AccRange[0] = ITEM.get_range(0);
+      });
+    });
+    myQueue.wait();
+  }
+  check("Size seen by user = ", Range1.get(0), size);
+  check("Counter = ", Counter, size);
+  return 0;
+}
+
+int main() {
+  int x;
+
+  x = 10;
+  try_rounding_off(x);
+
+  return 0;
+}
+
+// CHECK-DISABLED:  Run parallel_for
+// CHECK-DISABLED-NOT: parallel_for range adjusted from 10
+// CHECK-DISABLED:  Size seen by user = 10
+// CHECK-DISABLED-NEXT:  Counter = 10
+
+// CHECK-ENABLED:  Run parallel_for
+// CHECK-ENABLED-NEXT: parallel_for range adjusted from 10
+// CHECK-ENABLED-NEXT:  Size seen by user = 10
+// CHECK-ENABLED-NEXT:  Counter = 10

From d69e8a1374bb38d605b4e86c96999ec0702afe09 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 4 Aug 2022 13:37:57 -0700
Subject: [PATCH 03/34] [SYCL] Adjustments to tests to account for moving
 bfloat16 support out of experimental status.

Signed-off-by: Rajiv Deodhar <rajiv.deodhar@intel.com>
---
 SYCL/BFloat16/bfloat16_builtins.cpp       |  2 +-
 SYCL/BFloat16/bfloat16_type.hpp           | 42 +++++++++++------------
 SYCL/Matrix/element_wise_all_ops_cuda.cpp |  2 +-
 SYCL/Matrix/element_wise_wi_marray.cpp    |  2 +-
 SYCL/Matrix/joint_matrix_tensorcore.cpp   |  2 +-
 5 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp
index 9ccf988492..253bd875ef 100644
--- a/SYCL/BFloat16/bfloat16_builtins.cpp
+++ b/SYCL/BFloat16/bfloat16_builtins.cpp
@@ -12,7 +12,7 @@
 #include <vector>
 
 using namespace sycl;
-using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi;
 
 constexpr int N = 60; // divisible by all tested array sizes
 constexpr float bf16_eps = 0.00390625;
diff --git a/SYCL/BFloat16/bfloat16_type.hpp b/SYCL/BFloat16/bfloat16_type.hpp
index 17742fb5dc..fa06d81281 100644
--- a/SYCL/BFloat16/bfloat16_type.hpp
+++ b/SYCL/BFloat16/bfloat16_type.hpp
@@ -1,5 +1,5 @@
 #include <iostream>
-#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
 #include <sycl/sycl.hpp>
 
 #include <cmath>
@@ -21,7 +21,7 @@ void verify_conv_implicit(queue &q, buffer<float, 1> &a, range<1> &r,
   q.submit([&](handler &cgh) {
     auto A = a.get_access<access::mode::read_write>(cgh);
     cgh.parallel_for<class calc_conv>(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
       A[index] = AVal;
     });
   });
@@ -35,8 +35,8 @@ void verify_conv_explicit(queue &q, buffer<float, 1> &a, range<1> &r,
     auto A = a.get_access<access::mode::read_write>(cgh);
     cgh.parallel_for<class calc_conv_impl>(r, [=](id<1> index) {
       uint16_t AVal =
-          sycl::ext::oneapi::experimental::bfloat16::from_float(A[index]);
-      A[index] = sycl::ext::oneapi::experimental::bfloat16::to_float(AVal);
+          sycl::ext::oneapi::bfloat16::from_float(A[index]);
+      A[index] = sycl::ext::oneapi::bfloat16::to_float(AVal);
     });
   });
 
@@ -52,9 +52,9 @@ void verify_add(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access<access::mode::read>(cgh);
     auto C = c.get_access<access::mode::write>(cgh);
     cgh.parallel_for<class calc_add_expl>(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = AVal + BVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 CVal = AVal + BVal;
       C[index] = CVal;
     });
   });
@@ -71,9 +71,9 @@ void verify_sub(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access<access::mode::read>(cgh);
     auto C = c.get_access<access::mode::write>(cgh);
     cgh.parallel_for<class calc_sub>(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = AVal - BVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 CVal = AVal - BVal;
       C[index] = CVal;
     });
   });
@@ -88,8 +88,8 @@ void verify_minus(queue &q, buffer<float, 1> &a, range<1> &r, const float ref) {
     auto A = a.get_access<access::mode::read>(cgh);
     auto C = c.get_access<access::mode::write>(cgh);
     cgh.parallel_for<class calc_minus>(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = -AVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 CVal = -AVal;
       C[index] = CVal;
     });
   });
@@ -106,9 +106,9 @@ void verify_mul(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access<access::mode::read>(cgh);
     auto C = c.get_access<access::mode::write>(cgh);
     cgh.parallel_for<class calc_mul>(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = AVal * BVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 CVal = AVal * BVal;
       C[index] = CVal;
     });
   });
@@ -125,9 +125,9 @@ void verify_div(queue &q, buffer<float, 1> &a, buffer<float, 1> &b, range<1> &r,
     auto B = b.get_access<access::mode::read>(cgh);
     auto C = c.get_access<access::mode::write>(cgh);
     cgh.parallel_for<class calc_div>(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
-      sycl::ext::oneapi::experimental::bfloat16 CVal = AVal / BVal;
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 CVal = AVal / BVal;
       C[index] = CVal;
     });
   });
@@ -144,12 +144,12 @@ void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
     auto B = b.get_access<access::mode::read>(cgh);
     auto C = c.get_access<access::mode::write>(cgh);
     cgh.parallel_for<class logic>(r, [=](id<1> index) {
-      sycl::ext::oneapi::experimental::bfloat16 AVal{A[index]};
-      sycl::ext::oneapi::experimental::bfloat16 BVal{B[index]};
+      sycl::ext::oneapi::bfloat16 AVal{A[index]};
+      sycl::ext::oneapi::bfloat16 BVal{B[index]};
       if (AVal) {
         if (AVal > BVal || AVal >= BVal || AVal < BVal || AVal <= BVal ||
             !BVal) {
-          sycl::ext::oneapi::experimental::bfloat16 CVal =
+          sycl::ext::oneapi::bfloat16 CVal =
               AVal != BVal ? AVal : BVal;
           CVal--;
           CVal++;
diff --git a/SYCL/Matrix/element_wise_all_ops_cuda.cpp b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
index c73da53888..27cf26e935 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda.cpp
@@ -14,7 +14,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using sycl::ext::oneapi::experimental::bfloat16;
+using sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 32
 constexpr size_t nWGperDim = 2;
diff --git a/SYCL/Matrix/element_wise_wi_marray.cpp b/SYCL/Matrix/element_wise_wi_marray.cpp
index 5b82ebf9db..99ee6d00d3 100644
--- a/SYCL/Matrix/element_wise_wi_marray.cpp
+++ b/SYCL/Matrix/element_wise_wi_marray.cpp
@@ -13,7 +13,7 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
-using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
 #define SG_SZ 32
diff --git a/SYCL/Matrix/joint_matrix_tensorcore.cpp b/SYCL/Matrix/joint_matrix_tensorcore.cpp
index 664944bacd..1500e9af7e 100644
--- a/SYCL/Matrix/joint_matrix_tensorcore.cpp
+++ b/SYCL/Matrix/joint_matrix_tensorcore.cpp
@@ -10,7 +10,7 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
-using namespace sycl::ext::oneapi::experimental;
+using namespace sycl::ext::oneapi;
 using namespace sycl::ext::oneapi::experimental::matrix;
 constexpr float bf16_eps = 0.00390625;
 

From 8da975e17fd91dc5867f0446d1abd707508aeb20 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 4 Aug 2022 13:55:45 -0700
Subject: [PATCH 04/34] Formatting change.

---
 SYCL/BFloat16/bfloat16_type.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type.hpp b/SYCL/BFloat16/bfloat16_type.hpp
index fa06d81281..a7827887d3 100644
--- a/SYCL/BFloat16/bfloat16_type.hpp
+++ b/SYCL/BFloat16/bfloat16_type.hpp
@@ -34,8 +34,7 @@ void verify_conv_explicit(queue &q, buffer<float, 1> &a, range<1> &r,
   q.submit([&](handler &cgh) {
     auto A = a.get_access<access::mode::read_write>(cgh);
     cgh.parallel_for<class calc_conv_impl>(r, [=](id<1> index) {
-      uint16_t AVal =
-          sycl::ext::oneapi::bfloat16::from_float(A[index]);
+      uint16_t AVal = sycl::ext::oneapi::bfloat16::from_float(A[index]);
       A[index] = sycl::ext::oneapi::bfloat16::to_float(AVal);
     });
   });
@@ -149,8 +148,7 @@ void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
       if (AVal) {
         if (AVal > BVal || AVal >= BVal || AVal < BVal || AVal <= BVal ||
             !BVal) {
-          sycl::ext::oneapi::bfloat16 CVal =
-              AVal != BVal ? AVal : BVal;
+          sycl::ext::oneapi::bfloat16 CVal = AVal != BVal ? AVal : BVal;
           CVal--;
           CVal++;
           if (AVal == BVal) {

From 29452c0d053b3f2b8727bad487709dab7df59850 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 8 Sep 2022 17:36:19 -0700
Subject: [PATCH 05/34] Corrections for bfloat16 moved out of experimental.

---
 SYCL/BFloat16/bfloat16_type.hpp                 | 4 ++--
 SYCL/BFloat16/bfloat_hw.cpp                     | 2 +-
 SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp        | 2 +-
 SYCL/ESIMD/api/replicate_smoke.cpp              | 2 +-
 SYCL/ESIMD/api/simd_copy_to_from.cpp            | 2 +-
 SYCL/ESIMD/api/simd_subscript_operator.cpp      | 2 +-
 SYCL/ESIMD/api/simd_view_subscript_operator.cpp | 2 +-
 SYCL/ESIMD/api/svm_gather_scatter.cpp           | 2 +-
 SYCL/ESIMD/api/unary_ops_heavy.cpp              | 2 +-
 SYCL/ESIMD/esimd_test_utils.hpp                 | 2 +-
 10 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type.hpp b/SYCL/BFloat16/bfloat16_type.hpp
index a7827887d3..c779a26b30 100644
--- a/SYCL/BFloat16/bfloat16_type.hpp
+++ b/SYCL/BFloat16/bfloat16_type.hpp
@@ -34,8 +34,8 @@ void verify_conv_explicit(queue &q, buffer<float, 1> &a, range<1> &r,
   q.submit([&](handler &cgh) {
     auto A = a.get_access<access::mode::read_write>(cgh);
     cgh.parallel_for<class calc_conv_impl>(r, [=](id<1> index) {
-      uint16_t AVal = sycl::ext::oneapi::bfloat16::from_float(A[index]);
-      A[index] = sycl::ext::oneapi::bfloat16::to_float(AVal);
+      sycl::ext::oneapi::bfloat16 AVal = A[index];
+      A[index] = float(AVal);
     });
   });
 
diff --git a/SYCL/BFloat16/bfloat_hw.cpp b/SYCL/BFloat16/bfloat_hw.cpp
index 29d63c7fa9..bb1a2a1e1f 100644
--- a/SYCL/BFloat16/bfloat_hw.cpp
+++ b/SYCL/BFloat16/bfloat_hw.cpp
@@ -16,7 +16,7 @@ using get_uint_type_of_size = typename std::conditional_t<
         std::conditional_t<Size == 4, uint32_t,
                            std::conditional_t<Size == 8, uint64_t, void>>>>;
 
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 using Bfloat16StorageT = get_uint_type_of_size<sizeof(bfloat16)>;
 
 bool test(float Val, Bfloat16StorageT Bits) {
diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp
index da908391f8..c34c98becc 100644
--- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp
+++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp
@@ -30,7 +30,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template <class T1, class T2, int VL, class OpClass, class Ops> class TestID;
 
diff --git a/SYCL/ESIMD/api/replicate_smoke.cpp b/SYCL/ESIMD/api/replicate_smoke.cpp
index c9985edc93..abc6121211 100644
--- a/SYCL/ESIMD/api/replicate_smoke.cpp
+++ b/SYCL/ESIMD/api/replicate_smoke.cpp
@@ -20,7 +20,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template <class T> struct char_to_int {
   using type = typename std::conditional<
diff --git a/SYCL/ESIMD/api/simd_copy_to_from.cpp b/SYCL/ESIMD/api/simd_copy_to_from.cpp
index 2113772cd8..ee25baaefa 100644
--- a/SYCL/ESIMD/api/simd_copy_to_from.cpp
+++ b/SYCL/ESIMD/api/simd_copy_to_from.cpp
@@ -38,7 +38,7 @@
 using namespace sycl;
 using namespace sycl::ext::intel;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template <typename T, int N, typename Flags>
 bool testUSM(queue &Q, T *Src, T *Dst, unsigned Off, Flags) {
diff --git a/SYCL/ESIMD/api/simd_subscript_operator.cpp b/SYCL/ESIMD/api/simd_subscript_operator.cpp
index 481fab3b04..19e16734ba 100644
--- a/SYCL/ESIMD/api/simd_subscript_operator.cpp
+++ b/SYCL/ESIMD/api/simd_subscript_operator.cpp
@@ -24,7 +24,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template <class T> bool test(queue &q) {
   std::cout << "Testing " << typeid(T).name() << "...\n";
diff --git a/SYCL/ESIMD/api/simd_view_subscript_operator.cpp b/SYCL/ESIMD/api/simd_view_subscript_operator.cpp
index b9923fa25b..85ea650e1c 100644
--- a/SYCL/ESIMD/api/simd_view_subscript_operator.cpp
+++ b/SYCL/ESIMD/api/simd_view_subscript_operator.cpp
@@ -25,7 +25,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template <class T> class TestID;
 
diff --git a/SYCL/ESIMD/api/svm_gather_scatter.cpp b/SYCL/ESIMD/api/svm_gather_scatter.cpp
index 87f37fe094..6cedc76792 100644
--- a/SYCL/ESIMD/api/svm_gather_scatter.cpp
+++ b/SYCL/ESIMD/api/svm_gather_scatter.cpp
@@ -25,7 +25,7 @@
 using namespace sycl;
 using namespace sycl::ext::intel;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template <typename T, int N> bool test(queue &Q) {
   std::cout << "  Running " << typeid(T).name() << " test, N=" << N << "...\n";
diff --git a/SYCL/ESIMD/api/unary_ops_heavy.cpp b/SYCL/ESIMD/api/unary_ops_heavy.cpp
index bc86616e61..f7a774e0b6 100644
--- a/SYCL/ESIMD/api/unary_ops_heavy.cpp
+++ b/SYCL/ESIMD/api/unary_ops_heavy.cpp
@@ -30,7 +30,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template <class T, int VL, class Ops> class TestID;
 
diff --git a/SYCL/ESIMD/esimd_test_utils.hpp b/SYCL/ESIMD/esimd_test_utils.hpp
index 453cb83b9a..b78debfee2 100644
--- a/SYCL/ESIMD/esimd_test_utils.hpp
+++ b/SYCL/ESIMD/esimd_test_utils.hpp
@@ -557,7 +557,7 @@ TID(uint32_t)
 TID(int64_t)
 TID(uint64_t)
 TID(half)
-TID(sycl::ext::oneapi::experimental::bfloat16)
+TID(sycl::ext::oneapi::bfloat16)
 TID(float)
 TID(double)
 

From 63b05d164e78bcf8b62017f7783482686c1f4b54 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 14 Sep 2022 16:20:22 -0700
Subject: [PATCH 06/34] Correct test to run on multiple root devices.

---
 SYCL/Basic/buffer/buffer_migrate.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/SYCL/Basic/buffer/buffer_migrate.cpp b/SYCL/Basic/buffer/buffer_migrate.cpp
index 5016c802bd..66ea2caf09 100644
--- a/SYCL/Basic/buffer/buffer_migrate.cpp
+++ b/SYCL/Basic/buffer/buffer_migrate.cpp
@@ -36,6 +36,9 @@ int main() {
           range<1>(1), [=](id<1> ID) { Accessor[ID] |= (1 << Index); });
     });
     Q.wait();
+    // Access buffer on host to ensure updated value is read from and sent back
+    // to device
+    Buffer.get_host_access();
     ++Index;
   }
 

From cb629ff50650c97d657c213ca8a8fbc73ca876bc Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 15 Sep 2022 18:10:46 -0700
Subject: [PATCH 07/34] Removed test correction since test is now disabled.

---
 SYCL/Basic/buffer/buffer_migrate.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/SYCL/Basic/buffer/buffer_migrate.cpp b/SYCL/Basic/buffer/buffer_migrate.cpp
index 65231c6b52..715106a97b 100644
--- a/SYCL/Basic/buffer/buffer_migrate.cpp
+++ b/SYCL/Basic/buffer/buffer_migrate.cpp
@@ -38,9 +38,6 @@ int main() {
           range<1>(1), [=](id<1> ID) { Accessor[ID] |= (1 << Index); });
     });
     Q.wait();
-    // Access buffer on host to ensure updated value is read from and sent back
-    // to device
-    Buffer.get_host_access();
     ++Index;
   }
 

From df36f2dc2ad2724d2f89f96e9ec328ee90da9a02 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 20 Sep 2022 16:23:16 -0700
Subject: [PATCH 08/34] Modifications and additions to bfloat16 tests.

---
 SYCL/BFloat16/bfloat16_conversions.cpp | 50 ++++++++++++++++++++++++++
 SYCL/BFloat16/bfloat16_type.cpp        | 11 +++---
 SYCL/BFloat16/bfloat16_type.hpp        | 15 +++++++-
 3 files changed, 69 insertions(+), 7 deletions(-)
 create mode 100755 SYCL/BFloat16/bfloat16_conversions.cpp

diff --git a/SYCL/BFloat16/bfloat16_conversions.cpp b/SYCL/BFloat16/bfloat16_conversions.cpp
new file mode 100755
index 0000000000..7ec68692c7
--- /dev/null
+++ b/SYCL/BFloat16/bfloat16_conversions.cpp
@@ -0,0 +1,50 @@
+// UNSUPPORTED: hip
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// Currently the feature is supported only on CPU and GPU, natively or by
+// software emulation.
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+//==----------- bfloat16_conversions.cpp - SYCL bfloat16 type test ----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+template <typename T> T calculate(T a, T b) {
+  sycl::ext::oneapi::bfloat16 x = a;
+  sycl::ext::oneapi::bfloat16 y = b;
+  sycl::ext::oneapi::bfloat16 z = x + y;
+  T result = z;
+  return result;
+}
+
+template <typename T> int test() {
+  T a{5.6f};
+  T b{-1.1f};
+  T result = calculate(a, b);
+  if (result == 4.5f)
+    return 0;
+  return 1;
+}
+
+int main() {
+  int result;
+  result = test<sycl::half>();
+  result |= test<float>();
+  if (result)
+    std::cout << "FAIL\n";
+  else
+    std::cout << "PASS\n";
+
+  return result;
+}
+
diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index 25bb8ac15c..a741b78573 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -1,10 +1,9 @@
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// TODO currently the feature isn't supported on most of the devices
-//      need to enable the test when the aspect and device_if feature are
-//      introduced
-// RUNx: %CPU_RUN_PLACEHOLDER %t.out
-// RUNx: %GPU_RUN_PLACEHOLDER %t.out
+// Currently the feature is supported only on CPU and GPU, natively or by
+// software emulation.
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUNx: %ACC_RUN_PLACEHOLDER %t.out
 
 //==----------- bfloat16_type.cpp - SYCL bfloat16 type test ----------------==//
diff --git a/SYCL/BFloat16/bfloat16_type.hpp b/SYCL/BFloat16/bfloat16_type.hpp
index c779a26b30..9e1f39dbea 100644
--- a/SYCL/BFloat16/bfloat16_type.hpp
+++ b/SYCL/BFloat16/bfloat16_type.hpp
@@ -167,7 +167,7 @@ void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
 }
 
 int run_tests() {
-  device dev{default_selector()};
+  device dev{default_selector_v};
 
   // TODO: replace is_gpu check with extension check when the appropriate part
   // of implementation ready (aspect)
@@ -189,18 +189,31 @@ int run_tests() {
   queue q{dev};
 
   verify_conv_implicit(q, a, r, 5.0);
+  std::cout << "PASS verify_conv_implicit\n";
   verify_conv_explicit(q, a, r, 5.0);
+  std::cout << "PASS verify_conv_explicit\n";
   verify_add(q, a, b, r, 7.0);
+  std::cout << "PASS verify_add\n";
   verify_sub(q, a, b, r, 3.0);
+  std::cout << "PASS verify_sub\n";
   verify_mul(q, a, b, r, 10.0);
+  std::cout << "PASS verify_mul\n";
   verify_div(q, a, b, r, 2.5);
+  std::cout << "PASS verify_div\n";
   verify_logic(q, a, b, r, 7.0);
+  std::cout << "PASS verify_logic\n";
   verify_add(q, a, b_neg, r, 3.0);
+  std::cout << "PASS verify_add\n";
   verify_sub(q, a, b_neg, r, 7.0);
+  std::cout << "PASS verify_sub\n";
   verify_minus(q, a, r, -5.0);
+  std::cout << "PASS verify_minus\n";
   verify_mul(q, a, b_neg, r, -10.0);
+  std::cout << "PASS verify_mul\n";
   verify_div(q, a, b_neg, r, -2.5);
+  std::cout << "PASS verify_div\n";
   verify_logic(q, a, b_neg, r, 3.0);
+  std::cout << "PASS verify_logic\n";
 
   return 0;
 }

From 5bf23846e0d98d573844df8a389c97976cc80e7b Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 20 Sep 2022 16:42:35 -0700
Subject: [PATCH 09/34] Formatting change.

---
 SYCL/BFloat16/bfloat16_conversions.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_conversions.cpp b/SYCL/BFloat16/bfloat16_conversions.cpp
index 7ec68692c7..c220179565 100755
--- a/SYCL/BFloat16/bfloat16_conversions.cpp
+++ b/SYCL/BFloat16/bfloat16_conversions.cpp
@@ -6,13 +6,13 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUNx: %ACC_RUN_PLACEHOLDER %t.out
 
-//==----------- bfloat16_conversions.cpp - SYCL bfloat16 type test ----------==//
+//==---------- bfloat16_conversions.cpp - SYCL bfloat16 type test ---------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-//===----------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
 
 #include <iostream>
 #include <sycl/sycl.hpp>

From e9683a15a8597cb7733dc2005b9a8f7547ad4136 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 20 Sep 2022 16:52:58 -0700
Subject: [PATCH 10/34] Formatting change.

---
 SYCL/BFloat16/bfloat16_conversions.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/SYCL/BFloat16/bfloat16_conversions.cpp b/SYCL/BFloat16/bfloat16_conversions.cpp
index c220179565..fff438c3ac 100755
--- a/SYCL/BFloat16/bfloat16_conversions.cpp
+++ b/SYCL/BFloat16/bfloat16_conversions.cpp
@@ -47,4 +47,3 @@ int main() {
 
   return result;
 }
-

From c066b3a5499113e1586ea46599bad8c43a701435 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Fri, 23 Sep 2022 09:14:27 -0700
Subject: [PATCH 11/34] Enable bfloat16 test on GPU only.

---
 SYCL/BFloat16/bfloat16_type.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index a741b78573..c146710dbd 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -1,8 +1,8 @@
 // UNSUPPORTED: hip
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// Currently the feature is supported only on CPU and GPU, natively or by
-// software emulation.
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// Currently the feature is supported only on GPU, natively or by software
+// emulation.
+// RUNx: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUNx: %ACC_RUN_PLACEHOLDER %t.out
 

From b0c0848be3e004b08a71b9df077be2575c161f0f Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 27 Sep 2022 13:23:42 -0700
Subject: [PATCH 12/34] Reenable bfloat16 test on CPU.

---
 SYCL/BFloat16/bfloat16_type.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index c146710dbd..5322709117 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -1,8 +1,7 @@
 // UNSUPPORTED: hip
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// Currently the feature is supported only on GPU, natively or by software
-// emulation.
-// RUNx: %CPU_RUN_PLACEHOLDER %t.out
+// TODO currently the feature isn't supported on FPGA.
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUNx: %ACC_RUN_PLACEHOLDER %t.out
 

From 49a522a33b7d0382e033c87370452ac65fa5620b Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 3 Oct 2022 18:02:11 -0700
Subject: [PATCH 13/34] Corrected test to run on host and device.

---
 SYCL/BFloat16/bfloat16_conversions.cpp | 30 +++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_conversions.cpp b/SYCL/BFloat16/bfloat16_conversions.cpp
index fff438c3ac..add8eae6b1 100755
--- a/SYCL/BFloat16/bfloat16_conversions.cpp
+++ b/SYCL/BFloat16/bfloat16_conversions.cpp
@@ -20,17 +20,35 @@
 using namespace sycl;
 
 template <typename T> T calculate(T a, T b) {
-  sycl::ext::oneapi::bfloat16 x = a;
+  sycl::ext::oneapi::bfloat16 x = -a;
   sycl::ext::oneapi::bfloat16 y = b;
   sycl::ext::oneapi::bfloat16 z = x + y;
   T result = z;
   return result;
 }
 
-template <typename T> int test() {
-  T a{5.6f};
+template <typename T> int test_device() {
+  T data[3] = {-7.0f, 8.1f, 0.0f};
+
+  queue deviceQueue;
+  buffer<T, 1> buf{data, 3};
+  deviceQueue.submit([&](handler &cgh) {
+    accessor numbers{buf, cgh, read_write};
+    cgh.single_task([=]() { numbers[2] = calculate(numbers[0], numbers[1]); });
+  });
+
+  host_accessor hostOutAcc{buf, read_only};
+  std::cout << "Device Result = " << hostOutAcc[2] << std::endl;
+  if (hostOutAcc[2] == 15.125f)
+    return 0;
+  return 1;
+}
+
+template <typename T> int test_host() {
+  T a{-5.6f};
   T b{-1.1f};
   T result = calculate(a, b);
+  std::cout << "Host Result = " << result << std::endl;
   if (result == 4.5f)
     return 0;
   return 1;
@@ -38,8 +56,10 @@ template <typename T> int test() {
 
 int main() {
   int result;
-  result = test<sycl::half>();
-  result |= test<float>();
+  result = test_host<sycl::half>();
+  result |= test_host<float>();
+  result |= test_device<sycl::half>();
+  result |= test_device<float>();
   if (result)
     std::cout << "FAIL\n";
   else

From 7e84648fdde701f38c95e339ed487ef6f2083c29 Mon Sep 17 00:00:00 2001
From: rdeodhar <rajiv.deodhar@intel.com>
Date: Tue, 4 Oct 2022 16:11:07 -0700
Subject: [PATCH 14/34] Update SYCL/BFloat16/bfloat16_type.cpp

Reenable execution on a CUDA>

Co-authored-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/BFloat16/bfloat16_type.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index 5322709117..d252d50d75 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -1,4 +1,7 @@
 // UNSUPPORTED: hip
+// RUN: %if cuda %{%clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out %}
+// TODO enable the below when CI supports >=sm_80
+// RUNx: %if cuda %{%GPU_RUN_PLACEHOLDER %t.out %}
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // TODO currently the feature isn't supported on FPGA.
 // RUN: %CPU_RUN_PLACEHOLDER %t.out

From 8f492d6464b4a7c45959a301fad463c1fbad2143 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 24 Oct 2022 23:17:20 -0700
Subject: [PATCH 15/34] Added test for use of fallback bfloat16 library.

---
 SYCL/BFloat16/bfloat16_example.cpp | 65 ++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100755 SYCL/BFloat16/bfloat16_example.cpp

diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
new file mode 100755
index 0000000000..a4386dfe3b
--- /dev/null
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -0,0 +1,65 @@
+///
+/// Check if bfloat16 example works using fallback libraries
+///
+
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using sycl::ext::oneapi::bfloat16;
+
+float foo(float a, float b) {
+  // Convert from float to bfloat16.
+  bfloat16 A{a};
+  bfloat16 B{b};
+
+  // Convert A and B from bfloat16 to float, do addition on floating-point
+  // numbers, then convert the result to bfloat16 and store it in C.
+  bfloat16 C = A + B;
+
+  // Return the result converted from bfloat16 to float.
+  return C;
+}
+
+int main(int argc, char *argv[]) {
+  float data[3] = {7.0, 8.1, 0.0};
+
+  std::cout << "CPU Result = " << foo(7.0f, 8.1f) << std::endl;
+
+  queue deviceQueue;
+  buffer<float, 1> buf{data, 3};
+
+  deviceQueue.submit([&](handler &cgh) {
+    accessor numbers{buf, cgh, read_write};
+    cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
+  });
+
+  host_accessor hostOutAcc{buf, read_only};
+  std::cout << "GPU Result = " << hostOutAcc[2] << std::endl;
+
+  return 0;
+}
+

From 4805061c10ba612135fd8e24c1804e8a34702829 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 24 Oct 2022 23:25:52 -0700
Subject: [PATCH 16/34] Formatting change

---
 SYCL/BFloat16/bfloat16_example.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
index a4386dfe3b..d413cf99af 100755
--- a/SYCL/BFloat16/bfloat16_example.cpp
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -62,4 +62,3 @@ int main(int argc, char *argv[]) {
 
   return 0;
 }
-

From 33c7d2b1870d006094571dcfc1bde60407b99769 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 25 Oct 2022 10:41:30 -0700
Subject: [PATCH 17/34] Enhanced the bfloat16 fallback test.

---
 SYCL/BFloat16/bfloat16_example.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
index d413cf99af..967ca8c10d 100755
--- a/SYCL/BFloat16/bfloat16_example.cpp
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -26,6 +26,13 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+
 #include <sycl/sycl.hpp>
 
 using namespace sycl;
@@ -47,7 +54,12 @@ float foo(float a, float b) {
 int main(int argc, char *argv[]) {
   float data[3] = {7.0, 8.1, 0.0};
 
-  std::cout << "CPU Result = " << foo(7.0f, 8.1f) << std::endl;
+  float result_host = foo(7.0f, 8.1f);
+  std::cout << "CPU Result = " << result_host << std::endl;
+  if (std::abs(15.1f - result_host) > 0.1) {
+    std::cout << "Test failed. Expected CPU Result ~= 15.1" << std::endl;
+    return 1;
+  }
 
   queue deviceQueue;
   buffer<float, 1> buf{data, 3};
@@ -58,7 +70,12 @@ int main(int argc, char *argv[]) {
   });
 
   host_accessor hostOutAcc{buf, read_only};
-  std::cout << "GPU Result = " << hostOutAcc[2] << std::endl;
+  float result_device = hostOutAcc[2];
+  std::cout << "GPU Result = " << result_device << std::endl;
+  if (std::abs(result_host - result_device) > 0.1) {
+    std::cout << "Test failed. CPU Result !~= GPU result" << std::endl;
+    return 1;
+  }
 
   return 0;
 }

From 21dcd205d9633b9c181badb0c9fc737b4dc4eca2 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 25 Oct 2022 11:04:30 -0700
Subject: [PATCH 18/34] Formatting change.

---
 SYCL/BFloat16/bfloat16_example.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
index 967ca8c10d..cc08dd8cb0 100755
--- a/SYCL/BFloat16/bfloat16_example.cpp
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -32,7 +32,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 
-
 #include <sycl/sycl.hpp>
 
 using namespace sycl;

From cbfe491ee21f9d581c1bbefb589a15fae3e84062 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 25 Oct 2022 13:53:10 -0700
Subject: [PATCH 19/34] Adjusted bfloat16 aspect.

---
 SYCL/Basic/aspects.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SYCL/Basic/aspects.cpp b/SYCL/Basic/aspects.cpp
index 428e6898c5..037de63324 100644
--- a/SYCL/Basic/aspects.cpp
+++ b/SYCL/Basic/aspects.cpp
@@ -54,8 +54,8 @@ int main() {
     if (plt.has(aspect::fp64)) {
       std::cout << "  fp64" << std::endl;
     }
-    if (plt.has(aspect::ext_oneapi_bfloat16)) {
-      std::cout << " ext_oneapi_bfloat16" << std::endl;
+    if (plt.has(aspect::ext_oneapi_bfloat16_math_functions)) {
+      std::cout << " ext_oneapi_bfloat16_math_functions" << std::endl;
     }
     if (plt.has(aspect::int64_base_atomics)) {
       std::cout << "  base atomic operations" << std::endl;

From 3721b9719dfc786fb62d5d183fc08afab5d67c4f Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 25 Oct 2022 22:39:24 -0700
Subject: [PATCH 20/34] Changes to test to set up required environment.

---
 SYCL/BFloat16/bfloat16_example.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
index cc08dd8cb0..1d2718d745 100755
--- a/SYCL/BFloat16/bfloat16_example.cpp
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -2,6 +2,10 @@
 /// Check if bfloat16 example works using fallback libraries
 ///
 
+// REQUIRES: opencl-aot, ocloc, cpu, gpu
+// UNSUPPORTED: cuda
+// CUDA is not compatible with SPIR.
+
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out

From 44c6ccb8adada5fcec2b2b0703f941b29b8199d0 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 26 Oct 2022 09:46:40 -0700
Subject: [PATCH 21/34] Adjustment to test.

---
 SYCL/BFloat16/bfloat16_example.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
index 1d2718d745..ac47ce7e4f 100755
--- a/SYCL/BFloat16/bfloat16_example.cpp
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -15,11 +15,9 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out

From c3155bf7b4765534e77b7127cc2a3c71cbbdd992 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 26 Oct 2022 11:59:51 -0700
Subject: [PATCH 22/34] Test adjustment.

---
 SYCL/BFloat16/bfloat16_example.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
index ac47ce7e4f..53e95f06d3 100755
--- a/SYCL/BFloat16/bfloat16_example.cpp
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -14,19 +14,14 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
-
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out

From f2edee8ea73df468179bc9c2ea9cc0378421d22c Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 26 Oct 2022 13:40:20 -0700
Subject: [PATCH 23/34] Reenabled some tests with specific GPU requirement.

---
 SYCL/BFloat16/bfloat16_example.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
index 53e95f06d3..7af0092877 100755
--- a/SYCL/BFloat16/bfloat16_example.cpp
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -2,7 +2,7 @@
 /// Check if bfloat16 example works using fallback libraries
 ///
 
-// REQUIRES: opencl-aot, ocloc, cpu, gpu
+// REQUIRES: opencl-aot, ocloc, cpu, gpu-intel-gen9
 // UNSUPPORTED: cuda
 // CUDA is not compatible with SPIR.
 
@@ -14,14 +14,20 @@
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out

From 2c6f4cefd58d638682bf159cba634a4d01c1f3d0 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 7 Nov 2022 09:30:29 -0800
Subject: [PATCH 24/34] Changes for bfloat16 moved out of experimental.

---
 SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
index 483b15b567..456b040b36 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_32x64.cpp
@@ -18,7 +18,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 16
 
@@ -144,13 +144,13 @@ int main() {
     for (int j = 0; j < MATRIX_K; j++) {
       // bfloat16 is created using unsigned short since conversion from float to
       // bfloat16 is not supported on the host side yet
-      A[i][j] = bfloat16::from_bits(make_bf16(1.0f * (i + j)));
+      A[i][j] = make_bf16(1.0f * (i + j));
       Aref[i][j] = make_bf16(1.0f * (i + j));
     }
   }
   for (int i = 0; i < MATRIX_K / 2; i++) {
     for (int j = 0; j < MATRIX_N * 2; j++) {
-      B[i][j] = bfloat16::from_bits((make_bf16(2.0f * i + 3.0f * j)));
+      B[i][j] = make_bf16(2.0f * i + 3.0f * j);
       Bref[i][j] = make_bf16(2.0f * i + 3.0f * j);
     }
   }

From 43768d840b8b4a223d69d5a9309a8d1d6c8744a3 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 7 Nov 2022 21:09:47 -0800
Subject: [PATCH 25/34] Aspect bfloat16 has been removed.

---
 SYCL/BFloat16/bfloat16_type_cuda.cpp | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type_cuda.cpp b/SYCL/BFloat16/bfloat16_type_cuda.cpp
index 30d1f122a2..81c4a08f12 100644
--- a/SYCL/BFloat16/bfloat16_type_cuda.cpp
+++ b/SYCL/BFloat16/bfloat16_type_cuda.cpp
@@ -12,13 +12,4 @@
 
 #include "bfloat16_type.hpp"
 
-int main() {
-  bool has_bfloat16_aspect = false;
-  for (const auto &plt : sycl::platform::get_platforms()) {
-    if (plt.has(aspect::ext_oneapi_bfloat16))
-      has_bfloat16_aspect = true;
-  }
-
-  if (has_bfloat16_aspect)
-    return run_tests();
-}
+int main() { return run_tests(); }

From 5ba6d7426436cafce7e1e19bc9b6e9c80c121028 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 7 Nov 2022 21:22:40 -0800
Subject: [PATCH 26/34] Aspect bfloat16 has been replaced by
 bfloat16_math_functions.

---
 SYCL/BFloat16/bfloat16_builtins.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SYCL/BFloat16/bfloat16_builtins.cpp b/SYCL/BFloat16/bfloat16_builtins.cpp
index 253bd875ef..262550c5ed 100644
--- a/SYCL/BFloat16/bfloat16_builtins.cpp
+++ b/SYCL/BFloat16/bfloat16_builtins.cpp
@@ -222,7 +222,7 @@ bool check(float a, float b) {
 int main() {
   queue q;
 
-  if (q.get_device().has(aspect::ext_oneapi_bfloat16)) {
+  if (q.get_device().has(aspect::ext_oneapi_bfloat16_math_functions)) {
     std::vector<float> a(N), b(N), c(N);
     int err = 0;
 

From ccc85dffd0587560abad505d5928aab43e723dea Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Mon, 7 Nov 2022 23:49:25 -0800
Subject: [PATCH 27/34] Check aspect fp16 before using sycl::half.

---
 SYCL/BFloat16/bfloat16_conversions.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_conversions.cpp b/SYCL/BFloat16/bfloat16_conversions.cpp
index add8eae6b1..78442f2ffc 100755
--- a/SYCL/BFloat16/bfloat16_conversions.cpp
+++ b/SYCL/BFloat16/bfloat16_conversions.cpp
@@ -27,12 +27,11 @@ template <typename T> T calculate(T a, T b) {
   return result;
 }
 
-template <typename T> int test_device() {
+template <typename T> int test_device(queue Q) {
   T data[3] = {-7.0f, 8.1f, 0.0f};
 
-  queue deviceQueue;
   buffer<T, 1> buf{data, 3};
-  deviceQueue.submit([&](handler &cgh) {
+  Q.submit([&](handler &cgh) {
     accessor numbers{buf, cgh, read_write};
     cgh.single_task([=]() { numbers[2] = calculate(numbers[0], numbers[1]); });
   });
@@ -55,11 +54,13 @@ template <typename T> int test_host() {
 }
 
 int main() {
+  queue Q;
   int result;
   result = test_host<sycl::half>();
   result |= test_host<float>();
-  result |= test_device<sycl::half>();
-  result |= test_device<float>();
+  if (Q.get_device().has(aspect::fp16))
+    result |= test_device<sycl::half>(Q);
+  result |= test_device<float>(Q);
   if (result)
     std::cout << "FAIL\n";
   else

From 3b60cb1ddc667919e1ea83118c62ff6635a540d5 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 8 Nov 2022 08:47:34 -0800
Subject: [PATCH 28/34] Account for lack of fp16 support on some devices.

---
 SYCL/BFloat16/bfloat16_conversions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SYCL/BFloat16/bfloat16_conversions.cpp b/SYCL/BFloat16/bfloat16_conversions.cpp
index 78442f2ffc..ed5d7ae583 100755
--- a/SYCL/BFloat16/bfloat16_conversions.cpp
+++ b/SYCL/BFloat16/bfloat16_conversions.cpp
@@ -1,5 +1,5 @@
 // UNSUPPORTED: hip
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out
 // Currently the feature is supported only on CPU and GPU, natively or by
 // software emulation.
 // RUN: %CPU_RUN_PLACEHOLDER %t.out

From 1b0d6f248bf333ad34276da380bad64c33d5d86b Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 8 Nov 2022 18:10:08 -0800
Subject: [PATCH 29/34] Reduce expected precision of bfloat16 calculations.

---
 SYCL/BFloat16/bfloat16_type.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type.hpp b/SYCL/BFloat16/bfloat16_type.hpp
index 9e1f39dbea..6c376437f4 100644
--- a/SYCL/BFloat16/bfloat16_type.hpp
+++ b/SYCL/BFloat16/bfloat16_type.hpp
@@ -11,8 +11,7 @@ constexpr size_t N = 100;
 template <typename T> void assert_close(const T &C, const float ref) {
   for (size_t i = 0; i < N; i++) {
     auto diff = C[i] - ref;
-    assert(std::fabs(static_cast<float>(diff)) <
-           std::numeric_limits<float>::epsilon());
+    assert(std::fabs(static_cast<float>(diff)) < 0.1);
   }
 }
 

From 8d9f34fa97332b79e1f8b760bc1f1a2aa74a7232 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Wed, 9 Nov 2022 06:00:06 -0800
Subject: [PATCH 30/34] Adjustments for bfloat16 header.

---
 SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp | 2 +-
 SYCL/Matrix/joint_matrix_bfloat16.cpp            | 2 +-
 SYCL/Matrix/joint_matrix_bfloat16_use.cpp        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp b/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
index 71a7f9fa66..5222b8d025 100644
--- a/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
+++ b/SYCL/Matrix/element_wise_all_ops_cuda_legacy.cpp
@@ -14,7 +14,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using sycl::ext::oneapi::experimental::bfloat16;
+using sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 32
 constexpr size_t nWGperDim = 2;
diff --git a/SYCL/Matrix/joint_matrix_bfloat16.cpp b/SYCL/Matrix/joint_matrix_bfloat16.cpp
index 4d35b5f5a5..e665617156 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16.cpp
@@ -16,7 +16,7 @@
 
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 16
 
diff --git a/SYCL/Matrix/joint_matrix_bfloat16_use.cpp b/SYCL/Matrix/joint_matrix_bfloat16_use.cpp
index 5c110336e3..aa6412195e 100644
--- a/SYCL/Matrix/joint_matrix_bfloat16_use.cpp
+++ b/SYCL/Matrix/joint_matrix_bfloat16_use.cpp
@@ -17,7 +17,7 @@
 #include <sycl/sycl.hpp>
 
 using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::experimental::bfloat16;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 #define SG_SZ 16
 

From f1975c510b4af8df48ab210e2f11068ed0f50e5e Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 15 Nov 2022 10:34:38 -0800
Subject: [PATCH 31/34] Test adjustment.

---
 SYCL/BFloat16/bfloat16_type.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SYCL/BFloat16/bfloat16_type.cpp b/SYCL/BFloat16/bfloat16_type.cpp
index d252d50d75..28f1bf621b 100644
--- a/SYCL/BFloat16/bfloat16_type.cpp
+++ b/SYCL/BFloat16/bfloat16_type.cpp
@@ -2,7 +2,7 @@
 // RUN: %if cuda %{%clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_80 %s -o %t.out %}
 // TODO enable the below when CI supports >=sm_80
 // RUNx: %if cuda %{%GPU_RUN_PLACEHOLDER %t.out %}
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl %s -o %t.out
 // TODO currently the feature isn't supported on FPGA.
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out

From 80c3468b01f6fc47cafa5507788ebe0f195f6c24 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 17 Nov 2022 21:03:58 -0800
Subject: [PATCH 32/34] Replace double constants with float.

---
 SYCL/BFloat16/bfloat16_type.hpp | 36 ++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_type.hpp b/SYCL/BFloat16/bfloat16_type.hpp
index 6c376437f4..570755bf67 100644
--- a/SYCL/BFloat16/bfloat16_type.hpp
+++ b/SYCL/BFloat16/bfloat16_type.hpp
@@ -152,8 +152,8 @@ void verify_logic(queue &q, buffer<float, 1> &a, buffer<float, 1> &b,
           CVal++;
           if (AVal == BVal) {
             CVal -= AVal;
-            CVal *= 3.0;
-            CVal /= 2.0;
+            CVal *= 3.0f;
+            CVal /= 2.0f;
           } else
             CVal += BVal;
           C[index] = CVal;
@@ -176,9 +176,9 @@ int run_tests() {
     return 0;
   }
 
-  std::vector<float> vec_a(N, 5.0);
-  std::vector<float> vec_b(N, 2.0);
-  std::vector<float> vec_b_neg(N, -2.0);
+  std::vector<float> vec_a(N, 5.0f);
+  std::vector<float> vec_b(N, 2.0f);
+  std::vector<float> vec_b_neg(N, -2.0f);
 
   range<1> r(N);
   buffer<float, 1> a{vec_a.data(), r};
@@ -187,31 +187,31 @@ int run_tests() {
 
   queue q{dev};
 
-  verify_conv_implicit(q, a, r, 5.0);
+  verify_conv_implicit(q, a, r, 5.0f);
   std::cout << "PASS verify_conv_implicit\n";
-  verify_conv_explicit(q, a, r, 5.0);
+  verify_conv_explicit(q, a, r, 5.0f);
   std::cout << "PASS verify_conv_explicit\n";
-  verify_add(q, a, b, r, 7.0);
+  verify_add(q, a, b, r, 7.0f);
   std::cout << "PASS verify_add\n";
-  verify_sub(q, a, b, r, 3.0);
+  verify_sub(q, a, b, r, 3.0f);
   std::cout << "PASS verify_sub\n";
-  verify_mul(q, a, b, r, 10.0);
+  verify_mul(q, a, b, r, 10.0f);
   std::cout << "PASS verify_mul\n";
-  verify_div(q, a, b, r, 2.5);
+  verify_div(q, a, b, r, 2.5f);
   std::cout << "PASS verify_div\n";
-  verify_logic(q, a, b, r, 7.0);
+  verify_logic(q, a, b, r, 7.0f);
   std::cout << "PASS verify_logic\n";
-  verify_add(q, a, b_neg, r, 3.0);
+  verify_add(q, a, b_neg, r, 3.0f);
   std::cout << "PASS verify_add\n";
-  verify_sub(q, a, b_neg, r, 7.0);
+  verify_sub(q, a, b_neg, r, 7.0f);
   std::cout << "PASS verify_sub\n";
-  verify_minus(q, a, r, -5.0);
+  verify_minus(q, a, r, -5.0f);
   std::cout << "PASS verify_minus\n";
-  verify_mul(q, a, b_neg, r, -10.0);
+  verify_mul(q, a, b_neg, r, -10.0f);
   std::cout << "PASS verify_mul\n";
-  verify_div(q, a, b_neg, r, -2.5);
+  verify_div(q, a, b_neg, r, -2.5f);
   std::cout << "PASS verify_div\n";
-  verify_logic(q, a, b_neg, r, 3.0);
+  verify_logic(q, a, b_neg, r, 3.0f);
   std::cout << "PASS verify_logic\n";
 
   return 0;

From db0a1bce18bb96ef9273139078621b03acc4530e Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Thu, 17 Nov 2022 21:51:30 -0800
Subject: [PATCH 33/34] Replace double constants with float.

---
 SYCL/BFloat16/bfloat16_example.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/SYCL/BFloat16/bfloat16_example.cpp b/SYCL/BFloat16/bfloat16_example.cpp
index 7af0092877..fab3795679 100755
--- a/SYCL/BFloat16/bfloat16_example.cpp
+++ b/SYCL/BFloat16/bfloat16_example.cpp
@@ -54,11 +54,11 @@ float foo(float a, float b) {
 }
 
 int main(int argc, char *argv[]) {
-  float data[3] = {7.0, 8.1, 0.0};
+  float data[3] = {7.0f, 8.1f, 0.0f};
 
   float result_host = foo(7.0f, 8.1f);
   std::cout << "CPU Result = " << result_host << std::endl;
-  if (std::abs(15.1f - result_host) > 0.1) {
+  if (std::abs(15.1f - result_host) > 0.1f) {
     std::cout << "Test failed. Expected CPU Result ~= 15.1" << std::endl;
     return 1;
   }
@@ -74,7 +74,7 @@ int main(int argc, char *argv[]) {
   host_accessor hostOutAcc{buf, read_only};
   float result_device = hostOutAcc[2];
   std::cout << "GPU Result = " << result_device << std::endl;
-  if (std::abs(result_host - result_device) > 0.1) {
+  if (std::abs(result_host - result_device) > 0.1f) {
     std::cout << "Test failed. CPU Result !~= GPU result" << std::endl;
     return 1;
   }

From 8511e529ab83c8a77c921eb103ecc986c9cef049 Mon Sep 17 00:00:00 2001
From: Rajiv Deodhar <rajiv.deodhar@intel.com>
Date: Tue, 22 Nov 2022 00:41:52 -0800
Subject: [PATCH 34/34] Fix test to return 0 from main.

---
 SYCL/KernelAndProgram/kernel-bundle-merge-options.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/SYCL/KernelAndProgram/kernel-bundle-merge-options.hpp b/SYCL/KernelAndProgram/kernel-bundle-merge-options.hpp
index a3661cab56..7d0c7dcc5f 100644
--- a/SYCL/KernelAndProgram/kernel-bundle-merge-options.hpp
+++ b/SYCL/KernelAndProgram/kernel-bundle-merge-options.hpp
@@ -35,4 +35,6 @@ int main() {
   } catch (...) {
     // Ignore all exceptions
   }
+
+  return 0;
 }