From ce18b546c80067008bb8cc2b3b2de91c4462e3a4 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov@intel.com>
Date: Thu, 4 Jul 2024 08:34:30 -0700
Subject: [PATCH 1/6] [SYCL][NFC] Drop Gen9 detection from E2E tests

Gen9 HW is not officially supported anymore by our product, we don't
have such machines in our CI and therefore it doesn't make sense to keep
those legacy LIT features and their usage.
---
 sycl/test-e2e/BFloat16/bfloat16_example.cpp   |   8 +-
 .../test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp |   2 +-
 sycl/test-e2e/ESIMD/histogram_raw_send.cpp    | 297 ------------------
 sycl/test-e2e/ESIMD/lit.local.cfg             |   3 -
 sycl/test-e2e/ESIMD/vadd_raw_send.cpp         | 186 -----------
 .../Plugin/level_zero_device_free_mem.cpp     |   2 +-
 sycl/test-e2e/Properties/cache_config.cpp     |   5 -
 sycl/test-e2e/README.md                       |   1 -
 sycl/test-e2e/lit.cfg.py                      |   2 -
 9 files changed, 6 insertions(+), 500 deletions(-)
 delete mode 100644 sycl/test-e2e/ESIMD/histogram_raw_send.cpp
 delete mode 100644 sycl/test-e2e/ESIMD/vadd_raw_send.cpp

diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
index d3136a293bbf9..35cc249e48def 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
@@ -2,7 +2,7 @@
 /// Check if bfloat16 example works using fallback libraries
 ///
 
-// REQUIRES: opencl-aot, ocloc, gpu-intel-gen9
+// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12
 // UNSUPPORTED: cuda
 // CUDA is not compatible with SPIR.
 // UNSUPPORTED: accelerator
@@ -13,16 +13,16 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
 // RUN: %{run} %t.out
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12" %s -o %t.out
 // RUN: %{run} %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
 // RUN: %if gpu %{ %{run} %t.out %}
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12" %s -o %t.out
 // RUN: %{run} %t.out
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12" %s -o %t.out
 // RUN: %{run} %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
diff --git a/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp b/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp
index 052ef69cc0d2c..164641059fe98 100644
--- a/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp
+++ b/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-gen9 || gpu-intel-pvc
+// REQUIRES: gpu-intel-pvc
 
 // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
 // RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out
diff --git a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp b/sycl/test-e2e/ESIMD/histogram_raw_send.cpp
deleted file mode 100644
index 26648af91393b..0000000000000
--- a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp
+++ /dev/null
@@ -1,297 +0,0 @@
-//==-histogram_raw_send.cpp  - DPC++ ESIMD on-device test-==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------===//
-// REQUIRES: gpu-intel-gen9
-// UNSUPPORTED: gpu-intel-dg1,gpu-intel-dg2,gpu-intel-pvc
-// UNSUPPORTED: ze_debug
-// RUN: %{build} -o %t1.out
-// RUN: %{run} %t1.out
-// RUN: %{build} -DUSE_CONSTEXPR_API -o %t2.out
-// RUN: %{run} %t2.out
-// RUN: %{build} -DUSE_SUPPORTED_API -o %t3.out
-// RUN: %{run} %t3.out
-
-// The test checks raw send functionality with atomic write implementation
-// on SKL. It does not work on DG1 due to send instruction incompatibility.
-
-#include "esimd_test_utils.hpp"
-
-#include <sycl/accessor_image.hpp>
-
-#include <array>
-
-using namespace sycl;
-
-#define NUM_BINS 256
-#define IMG_WIDTH 1024
-#define IMG_HEIGHT 1024
-//
-// each parallel_for handles 64x32 bytes
-//
-#define BLOCK_WIDTH 32
-#define BLOCK_HEIGHT 64
-
-void histogram_CPU(unsigned int width, unsigned int height, uint8_t *srcY,
-                   unsigned int *cpuHistogram) {
-  int i;
-  for (i = 0; i < width * height; i++) {
-    cpuHistogram[srcY[i]] += 1;
-  }
-}
-
-void writeHist(unsigned int *hist) {
-  int total = 0;
-
-  std::cerr << "\nHistogram: \n";
-  for (int i = 0; i < NUM_BINS; i += 8) {
-    std::cerr << "\n  [" << i << " - " << i + 7 << "]:";
-    for (int j = 0; j < 8; j++) {
-      std::cerr << "\t" << hist[i + j];
-      total += hist[i + j];
-    }
-  }
-  std::cerr << "\nTotal = " << total << " \n";
-}
-
-int checkHistogram(unsigned int *refHistogram, unsigned int *hist) {
-
-  for (int i = 0; i < NUM_BINS; i++) {
-    if (refHistogram[i] != hist[i]) {
-      return 0;
-    }
-  }
-  return 1;
-}
-
-using namespace sycl::ext::intel;
-using namespace sycl::ext::intel::esimd;
-
-template <atomic_op Op, typename T, int n>
-ESIMD_INLINE void atomic_write(T *bins, simd<unsigned, n> offset,
-                               simd<T, n> src0) {
-  simd<T, n> oldDst;
-  simd<uintptr_t, n> vAddr(reinterpret_cast<uintptr_t>(bins));
-  simd<uintptr_t, n> vOffset = offset;
-  vAddr += vOffset;
-
-  uint32_t exDesc = 0x4C;
-  uint32_t desc = 0x414A7FF;
-  constexpr uint8_t execSize = 0x83;
-  constexpr uint8_t sfid = 0x1;
-  constexpr uint8_t numDst = 0x1;
-  constexpr uint8_t numSrc0 = 0x2;
-  constexpr uint8_t numSrc1 = 0x1;
-  constexpr uint8_t isEOT = 0;
-  constexpr uint8_t isSendc = 0;
-
-#ifdef USE_CONSTEXPR_API
-  experimental::esimd::raw_sends<execSize, sfid, numSrc0, numSrc1, numDst,
-                                 isEOT, isSendc>(oldDst, vAddr, src0, exDesc,
-                                                 desc);
-#elif defined(USE_SUPPORTED_API)
-  esimd::raw_sends<execSize, sfid, numSrc0, numSrc1, numDst,
-                   raw_send_eot::not_eot, raw_send_sendc::not_sendc>(
-      oldDst, vAddr, src0, exDesc, desc);
-
-#else
-  experimental::esimd::raw_sends(oldDst, vAddr, src0, exDesc, desc, execSize,
-                                 sfid, numSrc0, numSrc1, numDst, isEOT,
-                                 isSendc);
-#endif
-}
-
-int main(int argc, char *argv[]) {
-
-  const char *input_file = nullptr;
-  unsigned int width = IMG_WIDTH * sizeof(unsigned int);
-  unsigned int height = IMG_HEIGHT;
-
-  if (argc == 2) {
-    input_file = argv[1];
-  } else {
-    std::cerr << "Usage: Histogram.exe input_file" << std::endl;
-    std::cerr << "No input file specificed. Use default random value ...."
-              << std::endl;
-  }
-
-  // ------------------------------------------------------------------------
-  // Read in image luma plane
-
-  // Allocate Input Buffer
-  queue q = esimd_test::createQueue();
-  esimd_test::printTestLabel(q);
-
-  esimd_test::shared_vector<uint8_t> srcY_vec(
-      width * height, esimd_test::shared_allocator<uint8_t>{q});
-  esimd_test::shared_vector<unsigned int> bins_vec(
-      NUM_BINS, esimd_test::shared_allocator<unsigned int>{q});
-  uint8_t *srcY = srcY_vec.data();
-  ;
-  unsigned int *bins = bins_vec.data();
-
-  uint range_width = width / BLOCK_WIDTH;
-  uint range_height = height / BLOCK_HEIGHT;
-
-  // Initializes input.
-  unsigned int input_size = width * height;
-  std::cerr << "Processing inputs\n";
-
-  if (input_file != nullptr) {
-    FILE *f = fopen(input_file, "rb");
-    if (f == NULL) {
-      std::cerr << "Error opening file " << input_file;
-      std::exit(1);
-    }
-
-    unsigned int cnt = fread(srcY, sizeof(unsigned char), input_size, f);
-    if (cnt != input_size) {
-      std::cerr << "Error reading input from " << input_file;
-      std::exit(1);
-    }
-  } else {
-    srand(2009);
-    for (int i = 0; i < input_size; ++i) {
-      srcY[i] = rand() % 256;
-    }
-  }
-
-  for (int i = 0; i < NUM_BINS; i++) {
-    bins[i] = 0;
-  }
-
-  // ------------------------------------------------------------------------
-  // CPU Execution:
-
-  unsigned int cpuHistogram[NUM_BINS];
-  memset(cpuHistogram, 0, sizeof(cpuHistogram));
-  histogram_CPU(width, height, srcY, cpuHistogram);
-
-  sycl::image<2> Img(srcY, image_channel_order::rgba,
-                     image_channel_type::unsigned_int32,
-                     range<2>{width / sizeof(uint4), height});
-
-  // Start Timer
-  esimd_test::Timer timer;
-  double start;
-
-  double kernel_times = 0;
-  unsigned num_iters = 10;
-  const bool profiling =
-      q.has_property<sycl::property::queue::enable_profiling>();
-  try {
-    // num_iters + 1, iteration#0 is for warmup
-    for (int iter = 0; iter <= num_iters; ++iter) {
-      double etime = 0;
-      for (int b = 0; b < NUM_BINS; b++)
-        bins[b] = 0;
-      // create ranges
-      // We need that many task groups
-      auto GlobalRange = range<1>(range_width * range_height);
-      // We need that many tasks in each group
-      auto LocalRange = range<1>(1);
-      nd_range<1> Range(GlobalRange, LocalRange);
-
-      auto e = q.submit([&](handler &cgh) {
-        auto readAcc = Img.get_access<uint4, sycl::access::mode::read>(cgh);
-
-        cgh.parallel_for<class Hist>(
-            Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
-              // Get thread origin offsets
-              uint tid = ndi.get_group(0);
-              uint h_pos = (tid % range_width) * BLOCK_WIDTH;
-              uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
-
-              // Declare a 8x32 uchar matrix to store the input block pixel
-              // value
-              simd<unsigned char, 8 * 32> in;
-
-              // Declare a vector to store the local histogram
-              simd<unsigned int, NUM_BINS> histogram(0);
-
-              // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
-              for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
-                // Perform 2D media block read to load 8x32 pixel block
-                in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
-                                                            v_pos);
-
-            // Accumulate local histogram for each pixel value
-#pragma unroll
-                for (int i = 0; i < 8; i++) {
-#pragma unroll
-                  for (int j = 0; j < 32; j++) {
-                    histogram.select<1, 1>(in[i * 32 + j]) += 1;
-                  }
-                }
-
-                // Update starting offset for the next work block
-                v_pos += 8;
-              }
-
-              // Declare a vector to store the offset for atomic write operation
-              simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
-              offset *= sizeof(unsigned int);
-
-          // Update global sum by atomically adding each local histogram
-#pragma unroll
-              for (int i = 0; i < NUM_BINS; i += 8) {
-                // Declare a vector to store the source for atomic write
-                // operation
-                simd<unsigned int, 8> src;
-                src = histogram.select<8, 1>(i);
-
-#ifdef __SYCL_DEVICE_ONLY__
-                // flat_atomic<atomic_op::add, unsigned int,
-                // 8>(bins, offset, src, 1);
-                atomic_write<atomic_op::add, unsigned int, 8>(bins, offset,
-                                                              src);
-                offset += 8 * sizeof(unsigned int);
-#else
-                simd<unsigned int, 8> vals;
-                vals.copy_from(bins + i);
-                vals = vals + src;
-                vals.copy_to(bins + i);
-#endif
-              }
-            });
-      });
-      e.wait();
-      if (profiling) {
-        etime = esimd_test::report_time("kernel time", e, e);
-        if (iter > 0)
-          kernel_times += etime;
-      }
-      if (iter == 0)
-        start = timer.Elapsed();
-    }
-
-    // SYCL will enqueue and run the kernel. Recall that the buffer's data is
-    // given back to the host at the end of scope.
-    // make sure data is given back to the host at the end of this scope
-  } catch (sycl::exception const &e) {
-    std::cout << "SYCL exception caught: " << e.what() << '\n';
-    return 1;
-  }
-
-  // End timer.
-  double end = timer.Elapsed();
-
-  esimd_test::display_timing_stats(profiling ? &kernel_times : nullptr,
-                                   num_iters, (end - start) * 1000);
-
-  writeHist(bins);
-  writeHist(cpuHistogram);
-  // Checking Histogram
-  if (checkHistogram(cpuHistogram, bins)) {
-    std::cerr << "PASSED\n";
-    return 0;
-  } else {
-    std::cerr << "FAILED\n";
-    return 1;
-  }
-
-  return 0;
-}
diff --git a/sycl/test-e2e/ESIMD/lit.local.cfg b/sycl/test-e2e/ESIMD/lit.local.cfg
index 239c2e46e691b..2b47b5c784c27 100644
--- a/sycl/test-e2e/ESIMD/lit.local.cfg
+++ b/sycl/test-e2e/ESIMD/lit.local.cfg
@@ -3,9 +3,6 @@ import platform
 config.unsupported_features += ['cuda', 'hip']
 config.required_features += ['gpu']
 
-if 'gpu-intel-gen9' in config.available_features and platform.system() == 'Windows':
-  config.unsupported = True
-
 # We need this to fix failures when run on OCL.
 # The current DG2 postcommit job only runs L0 anyway,
 # so there's no difference in coverage.
diff --git a/sycl/test-e2e/ESIMD/vadd_raw_send.cpp b/sycl/test-e2e/ESIMD/vadd_raw_send.cpp
deleted file mode 100644
index e4ae7c044d21b..0000000000000
--- a/sycl/test-e2e/ESIMD/vadd_raw_send.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-//==---------------- vadd_raw_send.cpp  - DPC++ ESIMD on-device test--------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-gen9
-// UNSUPPORTED: gpu-intel-dg1,gpu-intel-dg2,gpu-intel-pvc
-// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -o %t1.out
-// RUN: %{run} %t1.out
-// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -DUSE_CONSTEXPR_API -o %t2.out
-// RUN: %{run} %t2.out
-// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -DUSE_SUPPORTED_API -o %t3.out
-// RUN: %{run} %t3.out
-// The test checks raw send functionality with block read/write implementation
-// on SKL. It does not work on DG1 due to send instruction incompatibility.
-
-#include "esimd_test_utils.hpp"
-
-using namespace sycl;
-
-using namespace sycl::ext::intel;
-using namespace sycl::ext::intel::esimd;
-
-template <typename T, int N, typename AccessorTy>
-ESIMD_INLINE simd<T, N> dwaligned_block_read(AccessorTy acc,
-                                             unsigned int offset) {
-  simd<unsigned int, 8> src0;
-  simd<T, N> oldDst;
-
-  src0.select<1, 1>(2) = offset;
-  uint32_t exDesc = 0xA;
-  SurfaceIndex desc = esimd::get_surface_index(acc);
-  desc += 0x2284300;
-  constexpr uint8_t execSize = 0x84;
-  constexpr uint8_t sfid = 0x0;
-  constexpr uint8_t numSrc0 = 0x1;
-  constexpr uint8_t numDst = 0x2;
-#ifdef USE_CONSTEXPR_API
-  return experimental::esimd::raw_send<execSize, sfid, numSrc0, numDst>(
-      oldDst, src0, exDesc, desc);
-#elif defined(USE_SUPPORTED_API)
-  return esimd::raw_send<execSize, sfid, numSrc0, numDst>(oldDst, src0, exDesc,
-                                                          desc);
-#else
-  return experimental::esimd::raw_send(oldDst, src0, exDesc, desc, execSize,
-                                       sfid, numSrc0, numDst);
-#endif
-}
-
-template <typename T, int N, typename AccessorTy>
-ESIMD_INLINE void block_write1(AccessorTy acc, unsigned int offset,
-                               simd<T, N> data) {
-  simd<unsigned int, 8> src0;
-
-  src0.template select<1, 1>(2) = offset >> 4;
-  uint32_t exDesc = 0x4A;
-  SurfaceIndex desc = esimd::get_surface_index(acc);
-  desc += 0x20A0200;
-  constexpr uint8_t execSize = 0x83;
-  constexpr uint8_t sfid = 0x0;
-  constexpr uint8_t numSrc0 = 0x1;
-  constexpr uint8_t numSrc1 = 0x1;
-#ifdef USE_CONSTEXPR_API
-  return experimental::esimd::raw_sends<execSize, sfid, numSrc0, numSrc1>(
-      src0, data, exDesc, desc);
-#elif defined(USE_SUPPORTED_API)
-  return esimd::raw_sends<execSize, sfid, numSrc0, numSrc1>(src0, data, exDesc,
-                                                            desc);
-#else
-  return experimental::esimd::raw_sends(src0, data, exDesc, desc, execSize,
-                                        sfid, numSrc0, numSrc1);
-#endif
-}
-
-template <typename T, int N, typename AccessorTy>
-ESIMD_INLINE void block_write2(AccessorTy acc, unsigned int offset,
-                               simd<T, N> data) {
-  simd<uint32_t, 16> src0;
-  auto src0_ref1 =
-      src0.template select<8, 1>(0).template bit_cast_view<unsigned int>();
-  auto src0_ref2 = src0.template select<8, 1>(8);
-
-  src0_ref1.template select<1, 1>(2) = offset >> 4;
-  src0_ref2 = data.template bit_cast_view<uint32_t>();
-  uint32_t exDesc = 0xA;
-  SurfaceIndex desc = esimd::get_surface_index(acc);
-  desc += 0x40A0200;
-  constexpr uint8_t execSize = 0x83;
-  constexpr uint8_t sfid = 0x0;
-  constexpr uint8_t numSrc0 = 0x2;
-#ifdef USE_CONSTEXPR_API
-  return experimental::esimd::raw_send<execSize, sfid, numSrc0>(src0, exDesc,
-                                                                desc);
-#elif defined(USE_SUPPORTED_API)
-  return esimd::raw_send<execSize, sfid, numSrc0>(src0, exDesc, desc);
-#else
-  return experimental::esimd::raw_send(src0, exDesc, desc, execSize, sfid,
-                                       numSrc0);
-#endif
-}
-
-template <typename T> int test(queue q) {
-  constexpr unsigned Size = 1024 * 128;
-  constexpr unsigned VL = sizeof(T) == 4 ? 16 : 32;
-  T *A = new T[Size];
-  T *B = new T[Size];
-  T *C = new T[Size];
-
-  for (unsigned i = 0; i < Size; ++i) {
-    A[i] = B[i] = i;
-    C[i] = 0;
-  }
-
-  try {
-    buffer<T, 1> bufa(A, range<1>(Size));
-    buffer<T, 1> bufb(B, range<1>(Size));
-    buffer<T, 1> bufc(C, range<1>(Size));
-
-    // We need that many workgroups
-    range<1> GlobalRange{Size / VL};
-
-    // We need that many threads in each group
-    range<1> LocalRange{1};
-
-    auto e = q.submit([&](handler &cgh) {
-      auto PA = bufa.template get_access<access::mode::read>(cgh);
-      auto PB = bufb.template get_access<access::mode::read>(cgh);
-      auto PC = bufc.template get_access<access::mode::write>(cgh);
-      cgh.parallel_for(
-          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
-            unsigned int offset = i * VL * sizeof(T);
-            simd<T, VL> va = dwaligned_block_read<T, VL>(PA, offset);
-            simd<T, VL> vb = dwaligned_block_read<T, VL>(PB, offset);
-            simd<T, VL> vc = va + vb;
-            constexpr int SIZE = VL / 2;
-            block_write1(PC, offset, vc.template select<SIZE, 1>(0).read());
-            offset += SIZE * sizeof(T);
-            block_write2(PC, offset, vc.template select<SIZE, 1>(SIZE).read());
-          });
-    });
-    e.wait();
-  } catch (sycl::exception const &e) {
-    std::cout << "SYCL exception caught: " << e.what() << '\n';
-
-    delete[] A;
-    delete[] B;
-    delete[] C;
-    return 1;
-  }
-
-  int err_cnt = 0;
-
-  for (unsigned i = 0; i < Size; ++i) {
-    if (A[i] + B[i] != C[i]) {
-      if (++err_cnt < 10) {
-        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
-                  << " + " << B[i] << "\n";
-      }
-    }
-  }
-
-  delete[] A;
-  delete[] B;
-  delete[] C;
-
-  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
-  return err_cnt;
-}
-
-int main(void) {
-
-  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
-
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
-  int err_cnt = 0;
-
-  err_cnt += test<float>(q);
-  if (dev.has(sycl::aspect::fp16)) {
-    err_cnt += test<sycl::half>(q);
-  }
-  return err_cnt > 0 ? 1 : 0;
-}
diff --git a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp b/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp
index 560f427e88b13..d9423cfc82a35 100644
--- a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp
+++ b/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp
@@ -5,7 +5,7 @@
 // so requiring DG2. There may be more devices in our CI supporting this aspect.
 // REQUIRES: gpu-intel-dg2
 // REQUIRES: level_zero, level_zero_dev_kit
-// UNSUPPORTED: gpu-intel-gen9, gpu-intel-gen12
+// UNSUPPORTED: gpu-intel-gen12
 // The query of free memory is not supported on integrated devices
 //
 // RUN: %{build} %level_zero_options -o %t.out
diff --git a/sycl/test-e2e/Properties/cache_config.cpp b/sycl/test-e2e/Properties/cache_config.cpp
index 72d5d68d3837e..0cda3e97a5d1f 100644
--- a/sycl/test-e2e/Properties/cache_config.cpp
+++ b/sycl/test-e2e/Properties/cache_config.cpp
@@ -1,9 +1,4 @@
-
 // REQUIRES: gpu, level_zero
-// TODO: There is a bug on Windows Gen 9 with reductions
-// which is not related to tested feature. Enable back when
-// bug is fixed on Windows Gen9
-// UNSUPPORTED: gpu-intel-gen9 && windows
 
 // RUN: %{build} -o %t.out
 // RUN: env UR_L0_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s
diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md
index f054c51874ccf..a300cf95b8ade 100644
--- a/sycl/test-e2e/README.md
+++ b/sycl/test-e2e/README.md
@@ -221,7 +221,6 @@ unavailable.
  * **ocloc**, **opencl-aot** - Specific AOT tool availability;
  * **level_zero_dev_kit** - Level_Zero headers and libraries availability;
  * **cuda_dev_kit** - CUDA SDK headers and libraries availability;
- * **gpu-intel-gen9**  - Intel GPU Gen9  availability;
  * **gpu-intel-gen11** - Intel GPU Gen11 availability;
  * **gpu-intel-gen12** - Intel GPU Gen12 availability;
  * **gpu-intel-dg1** - Intel GPU DG1 availability;
diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index 9a1676da29bb7..701633587272c 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -136,8 +136,6 @@
 config.substitutions.append(("%sycl_include", config.sycl_include))
 
 # Intel GPU FAMILY availability
-if lit_config.params.get("gpu-intel-gen9", False):
-    config.available_features.add("gpu-intel-gen9")
 if lit_config.params.get("gpu-intel-gen11", False):
     config.available_features.add("gpu-intel-gen11")
 if lit_config.params.get("gpu-intel-gen12", False):

From 5d244cb674cc455063e67e05a8089b358d22f1d3 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov@intel.com>
Date: Wed, 10 Jul 2024 08:35:08 -0700
Subject: [PATCH 2/6] An attempt to fix bfloat16 example test

---
 sycl/test-e2e/BFloat16/bfloat16_example.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
index 35cc249e48def..9c39a733c89d8 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
@@ -19,10 +19,10 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
 // RUN: %if gpu %{ %{run} %t.out %}
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12" %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
 // RUN: %{run} %t.out
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12" %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
 // RUN: %{run} %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out

From c1283bff693c00cb4a0d63497d0923a01339d719 Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov@intel.com>
Date: Thu, 11 Jul 2024 03:40:51 -0700
Subject: [PATCH 3/6] Fix missed gen12 -> gen12lp

---
 sycl/test-e2e/BFloat16/bfloat16_example.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
index 9c39a733c89d8..c507b33643041 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
@@ -13,7 +13,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
 // RUN: %{run} %t.out
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12" %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out
 // RUN: %{run} %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out

From 1326e5d1e08b5f2bc377990503bd4a294788e5dc Mon Sep 17 00:00:00 2001
From: "Larsen, Steffen" <steffen.larsen@intel.com>
Date: Fri, 20 Sep 2024 01:36:22 -0700
Subject: [PATCH 4/6] Split bfloat16_example

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
---
 sycl/test-e2e/BFloat16/bfloat16_example.cpp   | 66 ++-----------------
 sycl/test-e2e/BFloat16/bfloat16_example.hpp   | 47 +++++++++++++
 .../BFloat16/bfloat16_example_cpu.cpp         | 18 +++++
 .../BFloat16/bfloat16_example_gpu.cpp         | 18 +++++
 4 files changed, 88 insertions(+), 61 deletions(-)
 create mode 100644 sycl/test-e2e/BFloat16/bfloat16_example.hpp
 create mode 100644 sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp
 create mode 100644 sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp

diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
index 5c8604cb547e3..b588ce609633a 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
@@ -1,80 +1,24 @@
 ///
-/// Check if bfloat16 example works using fallback libraries
+/// Check if bfloat16 example works using fallback libraries AOT compiled for
+/// both GPU and CPU.
 ///
 
 // REQUIRES: opencl-aot, ocloc, gpu-intel-gen12
 
-// CUDA is not compatible with SPIR.
-// UNSUPPORTED: cuda
-
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: %{run} %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
 // RUN: %{run} %t.out
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out
-// RUN: %{run} %t.out
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
-// RUN: %if gpu %{ %{run} %t.out %}
-
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
 // RUN: %{run} %t.out
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
 // RUN: %{run} %t.out
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
-// RUN: %if cpu %{ %{run} %t.out %}
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
-// RUN: %if cpu %{ %{run} %t.out %}
-
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/bfloat16.hpp>
-
-using namespace sycl;
-using sycl::ext::oneapi::bfloat16;
-
-float foo(float a, float b) {
-  // Convert from float to bfloat16.
-  bfloat16 A{a};
-  bfloat16 B{b};
-
-  // Convert A and B from bfloat16 to float, do addition on floating-point
-  // numbers, then convert the result to bfloat16 and store it in C.
-  bfloat16 C = A + B;
-
-  // Return the result converted from bfloat16 to float.
-  return C;
-}
-
-int main(int argc, char *argv[]) {
-  float data[3] = {7.0f, 8.1f, 0.0f};
-
-  float result_host = foo(7.0f, 8.1f);
-  std::cout << "CPU Result = " << result_host << std::endl;
-  if (std::abs(15.1f - result_host) > 0.1f) {
-    std::cout << "Test failed. Expected CPU Result ~= 15.1" << std::endl;
-    return 1;
-  }
-
-  queue deviceQueue;
-  buffer<float, 1> buf{data, 3};
-
-  deviceQueue.submit([&](handler &cgh) {
-    accessor numbers{buf, cgh, read_write};
-    cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
-  });
-
-  host_accessor hostOutAcc{buf, read_only};
-  float result_device = hostOutAcc[2];
-  std::cout << "GPU Result = " << result_device << std::endl;
-  if (std::abs(result_host - result_device) > 0.1f) {
-    std::cout << "Test failed. CPU Result !~= GPU result" << std::endl;
-    return 1;
-  }
+#include "bfloat16_example.hpp"
 
-  return 0;
+int main() {
+  return runTest();
 }
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.hpp b/sycl/test-e2e/BFloat16/bfloat16_example.hpp
new file mode 100644
index 0000000000000..85c9172ed174f
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_example.hpp
@@ -0,0 +1,47 @@
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
+using namespace sycl;
+using sycl::ext::oneapi::bfloat16;
+
+float foo(float a, float b) {
+  // Convert from float to bfloat16.
+  bfloat16 A{a};
+  bfloat16 B{b};
+
+  // Convert A and B from bfloat16 to float, do addition on floating-point
+  // numbers, then convert the result to bfloat16 and store it in C.
+  bfloat16 C = A + B;
+
+  // Return the result converted from bfloat16 to float.
+  return C;
+}
+
+int runTest() {
+  float data[3] = {7.0f, 8.1f, 0.0f};
+
+  float result_host = foo(7.0f, 8.1f);
+  std::cout << "Host Result = " << result_host << std::endl;
+  if (std::abs(15.1f - result_host) > 0.1f) {
+    std::cout << "Test failed. Expected Host Result ~= 15.1" << std::endl;
+    return 1;
+  }
+
+  queue deviceQueue;
+  buffer<float, 1> buf{data, 3};
+
+  deviceQueue.submit([&](handler &cgh) {
+    accessor numbers{buf, cgh, read_write};
+    cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
+  });
+
+  host_accessor hostOutAcc{buf, read_only};
+  float result_device = hostOutAcc[2];
+  std::cout << "Device Result = " << result_device << std::endl;
+  if (std::abs(result_host - result_device) > 0.1f) {
+    std::cout << "Test failed. Host Result !~= Device result" << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp
new file mode 100644
index 0000000000000..218862b78aa81
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp
@@ -0,0 +1,18 @@
+///
+/// Check if bfloat16 example works using fallback libraries AOT compiled for
+/// CPU.
+///
+
+// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, cpu
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
+// RUN: %{run} %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
+// RUN: %{run} %t.out
+
+#include "bfloat16_example.hpp"
+
+int main() {
+  return runTest();
+}
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp
new file mode 100644
index 0000000000000..a9c547dc3120f
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp
@@ -0,0 +1,18 @@
+///
+/// Check if bfloat16 example works using fallback libraries AOT compiled for
+/// GPU.
+///
+
+// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, gpu
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out
+// RUN: %{run} %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
+// RUN: %{run} %t.out
+
+#include "bfloat16_example.hpp"
+
+int main() {
+  return runTest();
+}

From 625c4a7c75dc434c2b66308e804f2c5a26ed703c Mon Sep 17 00:00:00 2001
From: "Larsen, Steffen" <steffen.larsen@intel.com>
Date: Fri, 20 Sep 2024 03:29:17 -0700
Subject: [PATCH 5/6] Change restrictions on bfloat16_example

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
---
 sycl/test-e2e/BFloat16/bfloat16_example.cpp   | 14 +------------
 .../BFloat16/bfloat16_example_aot.cpp         | 21 +++++++++++++++++++
 ...e_cpu.cpp => bfloat16_example_aot_cpu.cpp} | 10 ++++-----
 ...e_gpu.cpp => bfloat16_example_aot_gpu.cpp} |  6 +++---
 4 files changed, 30 insertions(+), 21 deletions(-)
 create mode 100644 sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp
 rename sycl/test-e2e/BFloat16/{bfloat16_example_cpu.cpp => bfloat16_example_aot_cpu.cpp} (54%)
 rename sycl/test-e2e/BFloat16/{bfloat16_example_gpu.cpp => bfloat16_example_aot_gpu.cpp} (72%)

diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
index b588ce609633a..3746e19c950d9 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
@@ -1,22 +1,10 @@
 ///
-/// Check if bfloat16 example works using fallback libraries AOT compiled for
-/// both GPU and CPU.
+/// Checks a simple case of bfloat16, also employed for AOT library fallback.
 ///
 
-// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12
-
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: %{run} %t.out
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
-// RUN: %{run} %t.out
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
-// RUN: %{run} %t.out
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
-// RUN: %{run} %t.out
-
 #include "bfloat16_example.hpp"
 
 int main() {
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp
new file mode 100644
index 0000000000000..8337716c3191e
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp
@@ -0,0 +1,21 @@
+///
+/// Check if bfloat16 example works using fallback libraries AOT compiled for
+/// both GPU and CPU.
+///
+
+// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
+// RUN: %{run} %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
+// RUN: %{run} %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
+// RUN: %{run} %t.out
+
+#include "bfloat16_example.hpp"
+
+int main() {
+  return runTest();
+}
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp
similarity index 54%
rename from sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp
rename to sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp
index 218862b78aa81..10d21f1eecfc2 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp
@@ -3,13 +3,13 @@
 /// CPU.
 ///
 
-// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, cpu
+// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
-// RUN: %{run} %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out
+// RUN: if cpu %{ %{run} %t.out %}
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
-// RUN: %{run} %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out
+// RUN: if cpu %{ %{run} %t.out %}
 
 #include "bfloat16_example.hpp"
 
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp
similarity index 72%
rename from sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp
rename to sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp
index a9c547dc3120f..8262d3f655b37 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp
@@ -3,13 +3,13 @@
 /// GPU.
 ///
 
-// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, gpu
+// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-gpu
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out
-// RUN: %{run} %t.out
+// RUN: %if gpu %{%{run} %t.out %}
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
-// RUN: %{run} %t.out
+// RUN: %if gpu %{%{run} %t.out %}
 
 #include "bfloat16_example.hpp"
 

From 4c8a40066ebeed8fa5de7de28d7ae8e84beac220 Mon Sep 17 00:00:00 2001
From: "Larsen, Steffen" <steffen.larsen@intel.com>
Date: Fri, 20 Sep 2024 05:18:23 -0700
Subject: [PATCH 6/6] Redisable for CUDA and fix if statements

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
---
 sycl/test-e2e/BFloat16/bfloat16_example.cpp         | 3 +++
 sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
index 3746e19c950d9..84ea36fd363a5 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
@@ -2,6 +2,9 @@
 /// Checks a simple case of bfloat16, also employed for AOT library fallback.
 ///
 
+// CUDA is not compatible with SPIR.
+// UNSUPPORTED: cuda
+
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp
index 10d21f1eecfc2..2f6d893768c4e 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp
@@ -6,10 +6,10 @@
 // REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out
-// RUN: if cpu %{ %{run} %t.out %}
+// RUN: %if cpu %{ %{run} %t.out %}
 
 // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out
-// RUN: if cpu %{ %{run} %t.out %}
+// RUN: %if cpu %{ %{run} %t.out %}
 
 #include "bfloat16_example.hpp"