From ce18b546c80067008bb8cc2b3b2de91c4462e3a4 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 4 Jul 2024 08:34:30 -0700 Subject: [PATCH 1/6] [SYCL][NFC] Drop Gen9 detection from E2E tests Gen9 HW is not officially supported anymore by our product, we don't have such machines in our CI and therefore it doesn't make sense to keep those legacy LIT features and their usage. --- sycl/test-e2e/BFloat16/bfloat16_example.cpp | 8 +- .../test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp | 2 +- sycl/test-e2e/ESIMD/histogram_raw_send.cpp | 297 ------------------ sycl/test-e2e/ESIMD/lit.local.cfg | 3 - sycl/test-e2e/ESIMD/vadd_raw_send.cpp | 186 ----------- .../Plugin/level_zero_device_free_mem.cpp | 2 +- sycl/test-e2e/Properties/cache_config.cpp | 5 - sycl/test-e2e/README.md | 1 - sycl/test-e2e/lit.cfg.py | 2 - 9 files changed, 6 insertions(+), 500 deletions(-) delete mode 100644 sycl/test-e2e/ESIMD/histogram_raw_send.cpp delete mode 100644 sycl/test-e2e/ESIMD/vadd_raw_send.cpp diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp index d3136a293bbf9..35cc249e48def 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp @@ -2,7 +2,7 @@ /// Check if bfloat16 example works using fallback libraries /// -// REQUIRES: opencl-aot, ocloc, gpu-intel-gen9 +// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12 // UNSUPPORTED: cuda // CUDA is not compatible with SPIR. // UNSUPPORTED: accelerator @@ -13,16 +13,16 @@ // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out // RUN: %{run} %t.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12" %s -o %t.out // RUN: %{run} %t.out // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out // RUN: %if gpu %{ %{run} %t.out %} -// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12" %s -o %t.out // RUN: %{run} %t.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12" %s -o %t.out // RUN: %{run} %t.out // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out diff --git a/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp b/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp index 052ef69cc0d2c..164641059fe98 100644 --- a/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp +++ b/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-gen9 || gpu-intel-pvc +// REQUIRES: gpu-intel-pvc // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%} // RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out diff --git a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp b/sycl/test-e2e/ESIMD/histogram_raw_send.cpp deleted file mode 100644 index 26648af91393b..0000000000000 --- a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp +++ /dev/null @@ -1,297 +0,0 @@ -//==-histogram_raw_send.cpp - DPC++ ESIMD on-device test-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------===// -// REQUIRES: gpu-intel-gen9 -// UNSUPPORTED: gpu-intel-dg1,gpu-intel-dg2,gpu-intel-pvc -// UNSUPPORTED: ze_debug -// RUN: %{build} -o %t1.out -// RUN: %{run} %t1.out -// RUN: %{build} -DUSE_CONSTEXPR_API -o %t2.out -// RUN: %{run} %t2.out -// RUN: %{build} -DUSE_SUPPORTED_API -o %t3.out -// RUN: %{run} %t3.out - -// The test checks raw send functionality with atomic write implementation -// on SKL. It does not work on DG1 due to send instruction incompatibility. - -#include "esimd_test_utils.hpp" - -#include - -#include - -using namespace sycl; - -#define NUM_BINS 256 -#define IMG_WIDTH 1024 -#define IMG_HEIGHT 1024 -// -// each parallel_for handles 64x32 bytes -// -#define BLOCK_WIDTH 32 -#define BLOCK_HEIGHT 64 - -void histogram_CPU(unsigned int width, unsigned int height, uint8_t *srcY, - unsigned int *cpuHistogram) { - int i; - for (i = 0; i < width * height; i++) { - cpuHistogram[srcY[i]] += 1; - } -} - -void writeHist(unsigned int *hist) { - int total = 0; - - std::cerr << "\nHistogram: \n"; - for (int i = 0; i < NUM_BINS; i += 8) { - std::cerr << "\n [" << i << " - " << i + 7 << "]:"; - for (int j = 0; j < 8; j++) { - std::cerr << "\t" << hist[i + j]; - total += hist[i + j]; - } - } - std::cerr << "\nTotal = " << total << " \n"; -} - -int checkHistogram(unsigned int *refHistogram, unsigned int *hist) { - - for (int i = 0; i < NUM_BINS; i++) { - if (refHistogram[i] != hist[i]) { - return 0; - } - } - return 1; -} - -using namespace sycl::ext::intel; -using namespace sycl::ext::intel::esimd; - -template -ESIMD_INLINE void atomic_write(T *bins, simd offset, - simd src0) { - simd oldDst; - simd vAddr(reinterpret_cast(bins)); - simd vOffset = offset; - vAddr += vOffset; - - uint32_t exDesc = 0x4C; - uint32_t desc = 0x414A7FF; - constexpr uint8_t execSize = 0x83; - constexpr uint8_t sfid = 0x1; - constexpr uint8_t numDst = 0x1; - constexpr uint8_t numSrc0 = 0x2; - constexpr uint8_t numSrc1 = 0x1; - constexpr uint8_t isEOT = 0; - constexpr uint8_t isSendc = 0; - -#ifdef USE_CONSTEXPR_API - experimental::esimd::raw_sends(oldDst, vAddr, src0, exDesc, - desc); -#elif defined(USE_SUPPORTED_API) - esimd::raw_sends( - oldDst, vAddr, src0, exDesc, desc); - -#else - experimental::esimd::raw_sends(oldDst, vAddr, src0, exDesc, desc, execSize, - sfid, numSrc0, numSrc1, numDst, isEOT, - isSendc); -#endif -} - -int main(int argc, char *argv[]) { - - const char *input_file = nullptr; - unsigned int width = IMG_WIDTH * sizeof(unsigned int); - unsigned int height = IMG_HEIGHT; - - if (argc == 2) { - input_file = argv[1]; - } else { - std::cerr << "Usage: Histogram.exe input_file" << std::endl; - std::cerr << "No input file specificed. Use default random value ...." - << std::endl; - } - - // ------------------------------------------------------------------------ - // Read in image luma plane - - // Allocate Input Buffer - queue q = esimd_test::createQueue(); - esimd_test::printTestLabel(q); - - esimd_test::shared_vector srcY_vec( - width * height, esimd_test::shared_allocator{q}); - esimd_test::shared_vector bins_vec( - NUM_BINS, esimd_test::shared_allocator{q}); - uint8_t *srcY = srcY_vec.data(); - ; - unsigned int *bins = bins_vec.data(); - - uint range_width = width / BLOCK_WIDTH; - uint range_height = height / BLOCK_HEIGHT; - - // Initializes input. - unsigned int input_size = width * height; - std::cerr << "Processing inputs\n"; - - if (input_file != nullptr) { - FILE *f = fopen(input_file, "rb"); - if (f == NULL) { - std::cerr << "Error opening file " << input_file; - std::exit(1); - } - - unsigned int cnt = fread(srcY, sizeof(unsigned char), input_size, f); - if (cnt != input_size) { - std::cerr << "Error reading input from " << input_file; - std::exit(1); - } - } else { - srand(2009); - for (int i = 0; i < input_size; ++i) { - srcY[i] = rand() % 256; - } - } - - for (int i = 0; i < NUM_BINS; i++) { - bins[i] = 0; - } - - // ------------------------------------------------------------------------ - // CPU Execution: - - unsigned int cpuHistogram[NUM_BINS]; - memset(cpuHistogram, 0, sizeof(cpuHistogram)); - histogram_CPU(width, height, srcY, cpuHistogram); - - sycl::image<2> Img(srcY, image_channel_order::rgba, - image_channel_type::unsigned_int32, - range<2>{width / sizeof(uint4), height}); - - // Start Timer - esimd_test::Timer timer; - double start; - - double kernel_times = 0; - unsigned num_iters = 10; - const bool profiling = - q.has_property(); - try { - // num_iters + 1, iteration#0 is for warmup - for (int iter = 0; iter <= num_iters; ++iter) { - double etime = 0; - for (int b = 0; b < NUM_BINS; b++) - bins[b] = 0; - // create ranges - // We need that many task groups - auto GlobalRange = range<1>(range_width * range_height); - // We need that many tasks in each group - auto LocalRange = range<1>(1); - nd_range<1> Range(GlobalRange, LocalRange); - - auto e = q.submit([&](handler &cgh) { - auto readAcc = Img.get_access(cgh); - - cgh.parallel_for( - Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL { - // Get thread origin offsets - uint tid = ndi.get_group(0); - uint h_pos = (tid % range_width) * BLOCK_WIDTH; - uint v_pos = (tid / range_width) * BLOCK_HEIGHT; - - // Declare a 8x32 uchar matrix to store the input block pixel - // value - simd in; - - // Declare a vector to store the local histogram - simd histogram(0); - - // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block - for (int y = 0; y < BLOCK_HEIGHT / 8; y++) { - // Perform 2D media block read to load 8x32 pixel block - in = media_block_load(readAcc, h_pos, - v_pos); - - // Accumulate local histogram for each pixel value -#pragma unroll - for (int i = 0; i < 8; i++) { -#pragma unroll - for (int j = 0; j < 32; j++) { - histogram.select<1, 1>(in[i * 32 + j]) += 1; - } - } - - // Update starting offset for the next work block - v_pos += 8; - } - - // Declare a vector to store the offset for atomic write operation - simd offset(0, 1); // init to 0, 1, 2, ..., 7 - offset *= sizeof(unsigned int); - - // Update global sum by atomically adding each local histogram -#pragma unroll - for (int i = 0; i < NUM_BINS; i += 8) { - // Declare a vector to store the source for atomic write - // operation - simd src; - src = histogram.select<8, 1>(i); - -#ifdef __SYCL_DEVICE_ONLY__ - // flat_atomic(bins, offset, src, 1); - atomic_write(bins, offset, - src); - offset += 8 * sizeof(unsigned int); -#else - simd vals; - vals.copy_from(bins + i); - vals = vals + src; - vals.copy_to(bins + i); -#endif - } - }); - }); - e.wait(); - if (profiling) { - etime = esimd_test::report_time("kernel time", e, e); - if (iter > 0) - kernel_times += etime; - } - if (iter == 0) - start = timer.Elapsed(); - } - - // SYCL will enqueue and run the kernel. Recall that the buffer's data is - // given back to the host at the end of scope. - // make sure data is given back to the host at the end of this scope - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - return 1; - } - - // End timer. - double end = timer.Elapsed(); - - esimd_test::display_timing_stats(profiling ? &kernel_times : nullptr, - num_iters, (end - start) * 1000); - - writeHist(bins); - writeHist(cpuHistogram); - // Checking Histogram - if (checkHistogram(cpuHistogram, bins)) { - std::cerr << "PASSED\n"; - return 0; - } else { - std::cerr << "FAILED\n"; - return 1; - } - - return 0; -} diff --git a/sycl/test-e2e/ESIMD/lit.local.cfg b/sycl/test-e2e/ESIMD/lit.local.cfg index 239c2e46e691b..2b47b5c784c27 100644 --- a/sycl/test-e2e/ESIMD/lit.local.cfg +++ b/sycl/test-e2e/ESIMD/lit.local.cfg @@ -3,9 +3,6 @@ import platform config.unsupported_features += ['cuda', 'hip'] config.required_features += ['gpu'] -if 'gpu-intel-gen9' in config.available_features and platform.system() == 'Windows': - config.unsupported = True - # We need this to fix failures when run on OCL. # The current DG2 postcommit job only runs L0 anyway, # so there's no difference in coverage. diff --git a/sycl/test-e2e/ESIMD/vadd_raw_send.cpp b/sycl/test-e2e/ESIMD/vadd_raw_send.cpp deleted file mode 100644 index e4ae7c044d21b..0000000000000 --- a/sycl/test-e2e/ESIMD/vadd_raw_send.cpp +++ /dev/null @@ -1,186 +0,0 @@ -//==---------------- vadd_raw_send.cpp - DPC++ ESIMD on-device test--------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-gen9 -// UNSUPPORTED: gpu-intel-dg1,gpu-intel-dg2,gpu-intel-pvc -// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -o %t1.out -// RUN: %{run} %t1.out -// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -DUSE_CONSTEXPR_API -o %t2.out -// RUN: %{run} %t2.out -// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -DUSE_SUPPORTED_API -o %t3.out -// RUN: %{run} %t3.out -// The test checks raw send functionality with block read/write implementation -// on SKL. It does not work on DG1 due to send instruction incompatibility. - -#include "esimd_test_utils.hpp" - -using namespace sycl; - -using namespace sycl::ext::intel; -using namespace sycl::ext::intel::esimd; - -template -ESIMD_INLINE simd dwaligned_block_read(AccessorTy acc, - unsigned int offset) { - simd src0; - simd oldDst; - - src0.select<1, 1>(2) = offset; - uint32_t exDesc = 0xA; - SurfaceIndex desc = esimd::get_surface_index(acc); - desc += 0x2284300; - constexpr uint8_t execSize = 0x84; - constexpr uint8_t sfid = 0x0; - constexpr uint8_t numSrc0 = 0x1; - constexpr uint8_t numDst = 0x2; -#ifdef USE_CONSTEXPR_API - return experimental::esimd::raw_send( - oldDst, src0, exDesc, desc); -#elif defined(USE_SUPPORTED_API) - return esimd::raw_send(oldDst, src0, exDesc, - desc); -#else - return experimental::esimd::raw_send(oldDst, src0, exDesc, desc, execSize, - sfid, numSrc0, numDst); -#endif -} - -template -ESIMD_INLINE void block_write1(AccessorTy acc, unsigned int offset, - simd data) { - simd src0; - - src0.template select<1, 1>(2) = offset >> 4; - uint32_t exDesc = 0x4A; - SurfaceIndex desc = esimd::get_surface_index(acc); - desc += 0x20A0200; - constexpr uint8_t execSize = 0x83; - constexpr uint8_t sfid = 0x0; - constexpr uint8_t numSrc0 = 0x1; - constexpr uint8_t numSrc1 = 0x1; -#ifdef USE_CONSTEXPR_API - return experimental::esimd::raw_sends( - src0, data, exDesc, desc); -#elif defined(USE_SUPPORTED_API) - return esimd::raw_sends(src0, data, exDesc, - desc); -#else - return experimental::esimd::raw_sends(src0, data, exDesc, desc, execSize, - sfid, numSrc0, numSrc1); -#endif -} - -template -ESIMD_INLINE void block_write2(AccessorTy acc, unsigned int offset, - simd data) { - simd src0; - auto src0_ref1 = - src0.template select<8, 1>(0).template bit_cast_view(); - auto src0_ref2 = src0.template select<8, 1>(8); - - src0_ref1.template select<1, 1>(2) = offset >> 4; - src0_ref2 = data.template bit_cast_view(); - uint32_t exDesc = 0xA; - SurfaceIndex desc = esimd::get_surface_index(acc); - desc += 0x40A0200; - constexpr uint8_t execSize = 0x83; - constexpr uint8_t sfid = 0x0; - constexpr uint8_t numSrc0 = 0x2; -#ifdef USE_CONSTEXPR_API - return experimental::esimd::raw_send(src0, exDesc, - desc); -#elif defined(USE_SUPPORTED_API) - return esimd::raw_send(src0, exDesc, desc); -#else - return experimental::esimd::raw_send(src0, exDesc, desc, execSize, sfid, - numSrc0); -#endif -} - -template int test(queue q) { - constexpr unsigned Size = 1024 * 128; - constexpr unsigned VL = sizeof(T) == 4 ? 16 : 32; - T *A = new T[Size]; - T *B = new T[Size]; - T *C = new T[Size]; - - for (unsigned i = 0; i < Size; ++i) { - A[i] = B[i] = i; - C[i] = 0; - } - - try { - buffer bufa(A, range<1>(Size)); - buffer bufb(B, range<1>(Size)); - buffer bufc(C, range<1>(Size)); - - // We need that many workgroups - range<1> GlobalRange{Size / VL}; - - // We need that many threads in each group - range<1> LocalRange{1}; - - auto e = q.submit([&](handler &cgh) { - auto PA = bufa.template get_access(cgh); - auto PB = bufb.template get_access(cgh); - auto PC = bufc.template get_access(cgh); - cgh.parallel_for( - GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { - unsigned int offset = i * VL * sizeof(T); - simd va = dwaligned_block_read(PA, offset); - simd vb = dwaligned_block_read(PB, offset); - simd vc = va + vb; - constexpr int SIZE = VL / 2; - block_write1(PC, offset, vc.template select(0).read()); - offset += SIZE * sizeof(T); - block_write2(PC, offset, vc.template select(SIZE).read()); - }); - }); - e.wait(); - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - - delete[] A; - delete[] B; - delete[] C; - return 1; - } - - int err_cnt = 0; - - for (unsigned i = 0; i < Size; ++i) { - if (A[i] + B[i] != C[i]) { - if (++err_cnt < 10) { - std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] - << " + " << B[i] << "\n"; - } - } - } - - delete[] A; - delete[] B; - delete[] C; - - std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); - return err_cnt; -} - -int main(void) { - - queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); - - auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() - << "\n"; - int err_cnt = 0; - - err_cnt += test(q); - if (dev.has(sycl::aspect::fp16)) { - err_cnt += test(q); - } - return err_cnt > 0 ? 1 : 0; -} diff --git a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp b/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp index 560f427e88b13..d9423cfc82a35 100644 --- a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp +++ b/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp @@ -5,7 +5,7 @@ // so requiring DG2. There may be more devices in our CI supporting this aspect. // REQUIRES: gpu-intel-dg2 // REQUIRES: level_zero, level_zero_dev_kit -// UNSUPPORTED: gpu-intel-gen9, gpu-intel-gen12 +// UNSUPPORTED: gpu-intel-gen12 // The query of free memory is not supported on integrated devices // // RUN: %{build} %level_zero_options -o %t.out diff --git a/sycl/test-e2e/Properties/cache_config.cpp b/sycl/test-e2e/Properties/cache_config.cpp index 72d5d68d3837e..0cda3e97a5d1f 100644 --- a/sycl/test-e2e/Properties/cache_config.cpp +++ b/sycl/test-e2e/Properties/cache_config.cpp @@ -1,9 +1,4 @@ - // REQUIRES: gpu, level_zero -// TODO: There is a bug on Windows Gen 9 with reductions -// which is not related to tested feature. Enable back when -// bug is fixed on Windows Gen9 -// UNSUPPORTED: gpu-intel-gen9 && windows // RUN: %{build} -o %t.out // RUN: env UR_L0_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md index f054c51874ccf..a300cf95b8ade 100644 --- a/sycl/test-e2e/README.md +++ b/sycl/test-e2e/README.md @@ -221,7 +221,6 @@ unavailable. * **ocloc**, **opencl-aot** - Specific AOT tool availability; * **level_zero_dev_kit** - Level_Zero headers and libraries availability; * **cuda_dev_kit** - CUDA SDK headers and libraries availability; - * **gpu-intel-gen9** - Intel GPU Gen9 availability; * **gpu-intel-gen11** - Intel GPU Gen11 availability; * **gpu-intel-gen12** - Intel GPU Gen12 availability; * **gpu-intel-dg1** - Intel GPU DG1 availability; diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index 9a1676da29bb7..701633587272c 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -136,8 +136,6 @@ config.substitutions.append(("%sycl_include", config.sycl_include)) # Intel GPU FAMILY availability -if lit_config.params.get("gpu-intel-gen9", False): - config.available_features.add("gpu-intel-gen9") if lit_config.params.get("gpu-intel-gen11", False): config.available_features.add("gpu-intel-gen11") if lit_config.params.get("gpu-intel-gen12", False): From 5d244cb674cc455063e67e05a8089b358d22f1d3 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Wed, 10 Jul 2024 08:35:08 -0700 Subject: [PATCH 2/6] An attempt to fix bfloat16 example test --- sycl/test-e2e/BFloat16/bfloat16_example.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp index 35cc249e48def..9c39a733c89d8 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp @@ -19,10 +19,10 @@ // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out // RUN: %if gpu %{ %{run} %t.out %} -// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12" %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out // RUN: %{run} %t.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12" %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out // RUN: %{run} %t.out // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out From c1283bff693c00cb4a0d63497d0923a01339d719 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 11 Jul 2024 03:40:51 -0700 Subject: [PATCH 3/6] Fix missed gen12 -> gen12lp --- sycl/test-e2e/BFloat16/bfloat16_example.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp index 9c39a733c89d8..c507b33643041 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp @@ -13,7 +13,7 @@ // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out // RUN: %{run} %t.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12" %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out // RUN: %{run} %t.out // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out From 1326e5d1e08b5f2bc377990503bd4a294788e5dc Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Fri, 20 Sep 2024 01:36:22 -0700 Subject: [PATCH 4/6] Split bfloat16_example Signed-off-by: Larsen, Steffen --- sycl/test-e2e/BFloat16/bfloat16_example.cpp | 66 ++----------------- sycl/test-e2e/BFloat16/bfloat16_example.hpp | 47 +++++++++++++ .../BFloat16/bfloat16_example_cpu.cpp | 18 +++++ .../BFloat16/bfloat16_example_gpu.cpp | 18 +++++ 4 files changed, 88 insertions(+), 61 deletions(-) create mode 100644 sycl/test-e2e/BFloat16/bfloat16_example.hpp create mode 100644 sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp create mode 100644 sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp index 5c8604cb547e3..b588ce609633a 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp @@ -1,80 +1,24 @@ /// -/// Check if bfloat16 example works using fallback libraries +/// Check if bfloat16 example works using fallback libraries AOT compiled for +/// both GPU and CPU. /// // REQUIRES: opencl-aot, ocloc, gpu-intel-gen12 -// CUDA is not compatible with SPIR. -// UNSUPPORTED: cuda - // RUN: %clangxx -fsycl %s -o %t.out // RUN: %{run} %t.out // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out // RUN: %{run} %t.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out -// RUN: %{run} %t.out - -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out -// RUN: %if gpu %{ %{run} %t.out %} - // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out // RUN: %{run} %t.out // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out // RUN: %{run} %t.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out -// RUN: %if cpu %{ %{run} %t.out %} - -// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out -// RUN: %if cpu %{ %{run} %t.out %} - -#include -#include - -using namespace sycl; -using sycl::ext::oneapi::bfloat16; - -float foo(float a, float b) { - // Convert from float to bfloat16. - bfloat16 A{a}; - bfloat16 B{b}; - - // Convert A and B from bfloat16 to float, do addition on floating-point - // numbers, then convert the result to bfloat16 and store it in C. - bfloat16 C = A + B; - - // Return the result converted from bfloat16 to float. - return C; -} - -int main(int argc, char *argv[]) { - float data[3] = {7.0f, 8.1f, 0.0f}; - - float result_host = foo(7.0f, 8.1f); - std::cout << "CPU Result = " << result_host << std::endl; - if (std::abs(15.1f - result_host) > 0.1f) { - std::cout << "Test failed. Expected CPU Result ~= 15.1" << std::endl; - return 1; - } - - queue deviceQueue; - buffer buf{data, 3}; - - deviceQueue.submit([&](handler &cgh) { - accessor numbers{buf, cgh, read_write}; - cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); }); - }); - - host_accessor hostOutAcc{buf, read_only}; - float result_device = hostOutAcc[2]; - std::cout << "GPU Result = " << result_device << std::endl; - if (std::abs(result_host - result_device) > 0.1f) { - std::cout << "Test failed. CPU Result !~= GPU result" << std::endl; - return 1; - } +#include "bfloat16_example.hpp" - return 0; +int main() { + return runTest(); } diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.hpp b/sycl/test-e2e/BFloat16/bfloat16_example.hpp new file mode 100644 index 0000000000000..85c9172ed174f --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_example.hpp @@ -0,0 +1,47 @@ +#include +#include + +using namespace sycl; +using sycl::ext::oneapi::bfloat16; + +float foo(float a, float b) { + // Convert from float to bfloat16. + bfloat16 A{a}; + bfloat16 B{b}; + + // Convert A and B from bfloat16 to float, do addition on floating-point + // numbers, then convert the result to bfloat16 and store it in C. + bfloat16 C = A + B; + + // Return the result converted from bfloat16 to float. + return C; +} + +int runTest() { + float data[3] = {7.0f, 8.1f, 0.0f}; + + float result_host = foo(7.0f, 8.1f); + std::cout << "Host Result = " << result_host << std::endl; + if (std::abs(15.1f - result_host) > 0.1f) { + std::cout << "Test failed. Expected Host Result ~= 15.1" << std::endl; + return 1; + } + + queue deviceQueue; + buffer buf{data, 3}; + + deviceQueue.submit([&](handler &cgh) { + accessor numbers{buf, cgh, read_write}; + cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); }); + }); + + host_accessor hostOutAcc{buf, read_only}; + float result_device = hostOutAcc[2]; + std::cout << "Device Result = " << result_device << std::endl; + if (std::abs(result_host - result_device) > 0.1f) { + std::cout << "Test failed. Host Result !~= Device result" << std::endl; + return 1; + } + + return 0; +} diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp new file mode 100644 index 0000000000000..218862b78aa81 --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp @@ -0,0 +1,18 @@ +/// +/// Check if bfloat16 example works using fallback libraries AOT compiled for +/// CPU. +/// + +// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out +// RUN: %{run} %t.out + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out +// RUN: %{run} %t.out + +#include "bfloat16_example.hpp" + +int main() { + return runTest(); +} diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp new file mode 100644 index 0000000000000..a9c547dc3120f --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp @@ -0,0 +1,18 @@ +/// +/// Check if bfloat16 example works using fallback libraries AOT compiled for +/// GPU. +/// + +// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, gpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out +// RUN: %{run} %t.out + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out +// RUN: %{run} %t.out + +#include "bfloat16_example.hpp" + +int main() { + return runTest(); +} From 625c4a7c75dc434c2b66308e804f2c5a26ed703c Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Fri, 20 Sep 2024 03:29:17 -0700 Subject: [PATCH 5/6] Change restrictions on bfloat16_example Signed-off-by: Larsen, Steffen --- sycl/test-e2e/BFloat16/bfloat16_example.cpp | 14 +------------ .../BFloat16/bfloat16_example_aot.cpp | 21 +++++++++++++++++++ ...e_cpu.cpp => bfloat16_example_aot_cpu.cpp} | 10 ++++----- ...e_gpu.cpp => bfloat16_example_aot_gpu.cpp} | 6 +++--- 4 files changed, 30 insertions(+), 21 deletions(-) create mode 100644 sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp rename sycl/test-e2e/BFloat16/{bfloat16_example_cpu.cpp => bfloat16_example_aot_cpu.cpp} (54%) rename sycl/test-e2e/BFloat16/{bfloat16_example_gpu.cpp => bfloat16_example_aot_gpu.cpp} (72%) diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp index b588ce609633a..3746e19c950d9 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp @@ -1,22 +1,10 @@ /// -/// Check if bfloat16 example works using fallback libraries AOT compiled for -/// both GPU and CPU. +/// Checks a simple case of bfloat16, also employed for AOT library fallback. /// -// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12 - // RUN: %clangxx -fsycl %s -o %t.out // RUN: %{run} %t.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out -// RUN: %{run} %t.out - -// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out -// RUN: %{run} %t.out - -// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out -// RUN: %{run} %t.out - #include "bfloat16_example.hpp" int main() { diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp new file mode 100644 index 0000000000000..8337716c3191e --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp @@ -0,0 +1,21 @@ +/// +/// Check if bfloat16 example works using fallback libraries AOT compiled for +/// both GPU and CPU. +/// + +// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out +// RUN: %{run} %t.out + +// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out +// RUN: %{run} %t.out + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out +// RUN: %{run} %t.out + +#include "bfloat16_example.hpp" + +int main() { + return runTest(); +} diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp similarity index 54% rename from sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp rename to sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp index 218862b78aa81..10d21f1eecfc2 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example_cpu.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp @@ -3,13 +3,13 @@ /// CPU. /// -// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, cpu +// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu -// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out -// RUN: %{run} %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out +// RUN: if cpu %{ %{run} %t.out %} -// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out -// RUN: %{run} %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out +// RUN: if cpu %{ %{run} %t.out %} #include "bfloat16_example.hpp" diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp similarity index 72% rename from sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp rename to sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp index a9c547dc3120f..8262d3f655b37 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example_gpu.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp @@ -3,13 +3,13 @@ /// GPU. /// -// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, gpu +// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-gpu // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out -// RUN: %{run} %t.out +// RUN: %if gpu %{%{run} %t.out %} // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out -// RUN: %{run} %t.out +// RUN: %if gpu %{%{run} %t.out %} #include "bfloat16_example.hpp" From 4c8a40066ebeed8fa5de7de28d7ae8e84beac220 Mon Sep 17 00:00:00 2001 From: "Larsen, Steffen" Date: Fri, 20 Sep 2024 05:18:23 -0700 Subject: [PATCH 6/6] Redisable for CUDA and fix if statements Signed-off-by: Larsen, Steffen --- sycl/test-e2e/BFloat16/bfloat16_example.cpp | 3 +++ sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp index 3746e19c950d9..84ea36fd363a5 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp @@ -2,6 +2,9 @@ /// Checks a simple case of bfloat16, also employed for AOT library fallback. /// +// CUDA is not compatible with SPIR. +// UNSUPPORTED: cuda + // RUN: %clangxx -fsycl %s -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp index 10d21f1eecfc2..2f6d893768c4e 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp @@ -6,10 +6,10 @@ // REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out -// RUN: if cpu %{ %{run} %t.out %} +// RUN: %if cpu %{ %{run} %t.out %} // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out -// RUN: if cpu %{ %{run} %t.out %} +// RUN: %if cpu %{ %{run} %t.out %} #include "bfloat16_example.hpp"