From 73182dd8621189e88ff71f2376c418015c231ff1 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 14:38:13 +0800 Subject: [PATCH 01/35] split double tests in AtomicRef directory --- SYCL/AtomicRef/assignment_atomic64.cpp | 1 - .../assignment_atomic64_aspect-fp64.cpp | 30 +++++++++++++++++++ .../AtomicRef/assignment_atomic64_generic.cpp | 1 - ...ssignment_atomic64_generic_aspect-fp64.cpp | 30 +++++++++++++++++++ 4 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp create mode 100644 SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp diff --git a/SYCL/AtomicRef/assignment_atomic64.cpp b/SYCL/AtomicRef/assignment_atomic64.cpp index 3bf4e4551d..8f0b709653 100644 --- a/SYCL/AtomicRef/assignment_atomic64.cpp +++ b/SYCL/AtomicRef/assignment_atomic64.cpp @@ -17,7 +17,6 @@ int main() { } constexpr int N = 32; - assignment_test(q, N); // Include long tests if they are 64 bits wide if constexpr (sizeof(long) == 8) { diff --git a/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp b/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp new file mode 100644 index 0000000000..d8bf53da48 --- /dev/null +++ b/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp @@ -0,0 +1,30 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// XFAIL: hip +// Expected failure because hip does not have atomic64 check implementation + +#include "assignment.h" +#include +using namespace sycl; + +int main() { + queue q; + if (!q.get_device().has(aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + assignment_test(q, N); + + std::cout << "Test passed." 
<< std::endl; +} diff --git a/SYCL/AtomicRef/assignment_atomic64_generic.cpp b/SYCL/AtomicRef/assignment_atomic64_generic.cpp index 6bd23254aa..cd0a9d3ea8 100644 --- a/SYCL/AtomicRef/assignment_atomic64_generic.cpp +++ b/SYCL/AtomicRef/assignment_atomic64_generic.cpp @@ -20,7 +20,6 @@ int main() { } constexpr int N = 32; - assignment_generic_test(q, N); // Include long tests if they are 64 bits wide if constexpr (sizeof(long) == 8) { diff --git a/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp b/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp new file mode 100644 index 0000000000..674211a754 --- /dev/null +++ b/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp @@ -0,0 +1,30 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// CUDA backend has had no support for the generic address space yet +// XFAIL: cuda || hip + +#include "assignment.h" +#include +using namespace sycl; + +int main() { + queue q; + if (!q.get_device().has(aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + if (!q.get_device().has(aspect::atomic64)) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 32; + assignment_generic_test(q, N); + + std::cout << "Test passed." 
<< std::endl; +} From 1ba4584af915eb7e68e87e90fa0041a7f3fddf83 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 14:39:19 +0800 Subject: [PATCH 02/35] split double tests in Basic directory --- SYCL/Basic/buffer/buffer.cpp | 16 +----- SYCL/Basic/buffer/buffer_aspect-fp64.cpp | 71 ++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 15 deletions(-) create mode 100644 SYCL/Basic/buffer/buffer_aspect-fp64.cpp diff --git a/SYCL/Basic/buffer/buffer.cpp b/SYCL/Basic/buffer/buffer.cpp index 5ced4c9a04..ceedfc754a 100644 --- a/SYCL/Basic/buffer/buffer.cpp +++ b/SYCL/Basic/buffer/buffer.cpp @@ -513,15 +513,11 @@ int main() { [](bool *data) { delete[] data; }); std::shared_ptr int_shrd(new int[size], [](int *data) { delete[] data; }); - std::shared_ptr double_shrd(new double[size], - [](double *data) { delete[] data; }); std::vector bool_vector; std::vector int_vector; - std::vector double_vector; bool_vector.reserve(size); int_vector.reserve(size); - double_vector.reserve(size); sycl::queue Queue; std::mutex m; @@ -532,40 +528,30 @@ int main() { sycl::buffer buf_int_shrd( int_shrd, r, sycl::property_list{sycl::property::buffer::use_mutex(m)}); - sycl::buffer buf_double_shrd( - double_shrd, r, - sycl::property_list{sycl::property::buffer::use_mutex(m)}); m.lock(); std::fill(bool_shrd.get(), (bool_shrd.get() + size), bool()); std::fill(int_shrd.get(), (int_shrd.get() + size), int()); - std::fill(double_shrd.get(), (double_shrd.get() + size), double()); m.unlock(); buf_bool_shrd.set_final_data(bool_vector.begin()); buf_int_shrd.set_final_data(int_vector.begin()); - buf_double_shrd.set_final_data(double_vector.begin()); buf_bool_shrd.set_write_back(true); buf_int_shrd.set_write_back(true); - buf_double_shrd.set_write_back(true); Queue.submit([&](sycl::handler &cgh) { auto Accessor_bool = buf_bool_shrd.get_access(cgh); auto Accessor_int = buf_int_shrd.get_access(cgh); - auto Accessor_double = - buf_double_shrd.get_access(cgh); cgh.parallel_for(r, 
[=](sycl::id<1> WIid) { Accessor_bool[WIid] = true; Accessor_int[WIid] = 3; - Accessor_double[WIid] = 7.5; }); }); } // Data is copied back for (size_t i = 0; i < size; i++) { - if (bool_vector[i] != true || int_vector[i] != 3 || - double_vector[i] != 7.5) { + if (bool_vector[i] != true || int_vector[i] != 3) { assert(false && "Data was not copied back"); return 1; } diff --git a/SYCL/Basic/buffer/buffer_aspect-fp64.cpp b/SYCL/Basic/buffer/buffer_aspect-fp64.cpp new file mode 100644 index 0000000000..55119db44e --- /dev/null +++ b/SYCL/Basic/buffer/buffer_aspect-fp64.cpp @@ -0,0 +1,71 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx %cxx_std_optionc++17 %s -o %t1.out %sycl_options +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t2.out +// RUN: %HOST_RUN_PLACEHOLDER %t2.out +// RUN: %CPU_RUN_PLACEHOLDER %t2.out +// RUN: %GPU_RUN_PLACEHOLDER %t2.out +// RUN: %ACC_RUN_PLACEHOLDER %t2.out + +//==------------------- buffer.cpp - SYCL buffer basic test ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include +#include + +using namespace sycl; + +template constexpr T write_back_result = T(3); +template <> constexpr double write_back_result = double(7.5); +template class fill_buffer_for_write_back {}; + +template void check_set_write_back() { + size_t size = 32; + sycl::range r(size); + std::shared_ptr shrd(new T[size], [](T *data) { delete[] data; }); + std::vector vector; + vector.reserve(size); + sycl::queue Queue; + std::mutex m; + { + sycl::buffer buf_shrd( + shrd, r, sycl::property_list{sycl::property::buffer::use_mutex(m)}); + m.lock(); + std::fill(shrd.get(), (shrd.get() + size), T()); + m.unlock(); + buf_shrd.set_final_data(vector.begin()); + buf_shrd.set_write_back(true); + Queue.submit([&](sycl::handler &cgh) { + auto Accessor = + buf_shrd.template get_access(cgh); + cgh.parallel_for>(r, [=](sycl::id<1> WIid) { + Accessor[WIid] = write_back_result; + }); + }); + } // Data is copied back + for (size_t i = 0; i < size; i++) { + if (vector[i] != write_back_result) { + assert(false && "Data was not copied back"); + } + } +} + +int main() { + // Check that data is copied back after forcing write-back using + // set_write_back + queue q; + if (!q.get_device().has(aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + check_set_write_back(); + return 0; +} \ No newline at end of file From 3df1be705b5c421d49df3f8f2822a996da1cb45a Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:17:21 +0800 Subject: [PATCH 03/35] split double tests in DeprecatedFeatures directory --- .../specialization_constants.cpp | 11 -- .../specialization_constants_aspect-fp64.cpp | 95 ++++++++++++++++ .../specialization_constants_override.cpp | 11 -- ...ization_constants_override_aspect-fp64.cpp | 103 ++++++++++++++++++ 4 files changed, 198 insertions(+), 22 deletions(-) create mode 100644 
SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp create mode 100644 SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants.cpp index ae4e4e9f7c..6a968e33d9 100644 --- a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants.cpp +++ b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants.cpp @@ -35,7 +35,6 @@ class MyInt64Const; class MyUInt64Const; class MyHalfConst; class MyFloatConst; -class MyDoubleConst; using namespace sycl; @@ -54,7 +53,6 @@ int64_t int64_ref = rnd() % std::numeric_limits::max(); uint64_t uint64_ref = rnd() % std::numeric_limits::max(); half half_ref = rnd() % std::numeric_limits::max(); float float_ref = rnd() % std::numeric_limits::max(); -double double_ref = rnd() % std::numeric_limits::max(); template bool check(const T1 &test, const T2 &ref, std::string type) { @@ -110,9 +108,6 @@ int main(int argc, char **argv) { ext::oneapi::experimental::spec_constant f32 = prog.set_spec_constant(float_ref); - ext::oneapi::experimental::spec_constant f64 = - prog.set_spec_constant(double_ref); - prog.build_with_kernel_type(); bool bool_test = 0; @@ -126,7 +121,6 @@ int main(int argc, char **argv) { uint64_t uint64_test = 0; half half_test = 0; float float_test = 0; - double double_test = 0; { buffer bool_buf(&bool_test, 1); @@ -140,7 +134,6 @@ int main(int argc, char **argv) { buffer uint64_buf(&uint64_test, 1); buffer half_buf(&half_test, 1); buffer float_buf(&float_test, 1); - buffer double_buf(&double_test, 1); q.submit([&](handler &cgh) { auto bool_acc = bool_buf.get_access(cgh); @@ -154,7 +147,6 @@ int main(int argc, char **argv) { auto uint64_acc = uint64_buf.get_access(cgh); auto half_acc = half_buf.get_access(cgh); auto float_acc = float_buf.get_access(cgh); - auto double_acc = double_buf.get_access(cgh); 
cgh.single_task(prog.get_kernel(), [=]() { bool_acc[0] = i1.get(); @@ -170,7 +162,6 @@ int main(int argc, char **argv) { half_acc[0] = f16.get(); #endif float_acc[0] = f32.get(); - double_acc[0] = f64.get(); }); }); } @@ -198,8 +189,6 @@ int main(int argc, char **argv) { #endif if (!check(float_test, float_ref, "float")) return 1; - if (!check(double_test, double_ref, "double")) - return 1; } catch (const exception &e) { std::cout << "an async SYCL exception was caught: " << std::string(e.what()); diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp new file mode 100644 index 0000000000..8793abebcf --- /dev/null +++ b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp @@ -0,0 +1,95 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -D__SYCL_INTERNAL_API %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// Specialization constants are not supported on FPGA h/w and emulator. +// UNSUPPORTED: cuda || hip +// +//==----------- specialization_constants.cpp -------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Basic checks for some primitive types + +#include +#include +#include +#include + +class SpecializedKernel; + +class MyDoubleConst; + +using namespace sycl; + +unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); +std::mt19937_64 rnd(seed); + +// Fetch a value at runtime. 
+double double_ref = rnd() % std::numeric_limits::max(); + +template +bool check(const T1 &test, const T2 &ref, std::string type) { + + if (test != ref) { + std::cout << "Test != Reference: " << std::to_string(test) + << " != " << std::to_string(ref) << " for type: " << type << "\n"; + return false; + } + return true; +} + +int main(int argc, char **argv) { + queue q; + if (!q.get_device().has(aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + std::cout << "check specialization constants API. (seed =" << seed << "\n"; + + auto exception_handler = [&](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "an async SYCL exception was caught: " + << std::string(e.what()); + } + } + }; + try { + auto q = queue(exception_handler); + program prog(q.get_context()); + + // Create specialization constants. + ext::oneapi::experimental::spec_constant f64 = + prog.set_spec_constant(double_ref); + + prog.build_with_kernel_type(); + + double double_test = 0; + + { + buffer double_buf(&double_test, 1); + + q.submit([&](handler &cgh) { + auto double_acc = double_buf.get_access(cgh); + cgh.single_task( + prog.get_kernel(), + [=]() { double_acc[0] = f64.get(); }); + }); + } + if (!check(double_test, double_ref, "double")) + return 1; + } catch (const exception &e) { + std::cout << "an async SYCL exception was caught: " + << std::string(e.what()); + return 1; + } + return 0; +} diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override.cpp index 16c1b34267..921e474362 100644 --- a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override.cpp +++ b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override.cpp @@ -24,7 +24,6 @@ class SpecializedKernelOverride; class MyBoolConstOverride; class 
MyUInt32ConstOverride; -class MyDoubleConstOverride; using namespace sycl; @@ -35,11 +34,9 @@ bool bool_ref = true; bool bool_ref_override = false; // Fetch a value at runtime. uint32_t uint32_ref = rnd() % std::numeric_limits::max(); -double double_ref = rnd() % std::numeric_limits::max(); // Values which override the previous ones uint32_t uint32_ref_override = rnd() % std::numeric_limits::max(); -double double_ref_override = rnd() % std::numeric_limits::max(); template bool check(const T1 &test, const T2 &ref, std::string type) { @@ -76,40 +73,32 @@ int main(int argc, char **argv) { prog.set_spec_constant(bool_ref); ext::oneapi::experimental::spec_constant ui32 = prog.set_spec_constant(uint32_ref); - ext::oneapi::experimental::spec_constant - f64 = prog.set_spec_constant(double_ref); // Override specialization constants. i1 = prog.set_spec_constant(bool_ref_override); ui32 = prog.set_spec_constant(uint32_ref_override); - f64 = prog.set_spec_constant(double_ref_override); prog.build_with_kernel_type(); bool bool_test = true; uint32_t uint32_test = 0; - double double_test = 0; { buffer bool_buf(&bool_test, 1); buffer uint32_buf(&uint32_test, 1); - buffer double_buf(&double_test, 1); q.submit([&](handler &cgh) { auto bool_acc = bool_buf.get_access(cgh); auto uint32_acc = uint32_buf.get_access(cgh); - auto double_acc = double_buf.get_access(cgh); cgh.single_task( prog.get_kernel(), [=]() { bool_acc[0] = i1.get(); uint32_acc[0] = ui32.get(); - double_acc[0] = f64.get(); }); }); } check(bool_test, bool_ref_override, "bool"); check(uint32_test, uint32_ref_override, "uint32"); - check(double_test, double_ref_override, "double"); } catch (const exception &e) { std::cout << "an async SYCL exception was caught: " diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp new file mode 100644 index 0000000000..d409d04e4d --- /dev/null +++ 
b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp @@ -0,0 +1,103 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -D__SYCL_INTERNAL_API %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// Specialization constants are not supported on FPGA h/w and emulator. +// UNSUPPORTED: cuda || hip +// +//==----------- specialization_constants_override.cpp ----------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Checks that set_spec_constant can be used twice on the same program + +#include +#include +#include +#include + +class SpecializedKernelOverride; + +class MyDoubleConstOverride; + +using namespace sycl; + +unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); +std::mt19937_64 rnd(seed); + +// Fetch a value at runtime. +double double_ref = rnd() % std::numeric_limits::max(); + +// Values which override the previous ones +double double_ref_override = rnd() % std::numeric_limits::max(); + +template +bool check(const T1 &test, const T2 &ref, std::string type) { + + if (test != ref) { + std::cout << "Test != Reference: " << std::to_string(test) + << " != " << std::to_string(ref) << " for type: " << type << "\n"; + return false; + } + return true; +} + +int main(int argc, char **argv) { + queue q; + if (!q.get_device().has(aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + std::cout << "check specialization constants overriding. 
(seed =" << seed + << "\n"; + + auto exception_handler = [&](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "an async SYCL exception was caught: " + << std::string(e.what()); + } + } + }; + + try { + auto q = queue(exception_handler); + program prog(q.get_context()); + + // Create specialization constants. + ext::oneapi::experimental::spec_constant + f64 = prog.set_spec_constant(double_ref); + + // Override specialization constants. + f64 = prog.set_spec_constant(double_ref_override); + + prog.build_with_kernel_type(); + + double double_test = 0; + + { + buffer double_buf(&double_test, 1); + + q.submit([&](handler &cgh) { + auto double_acc = double_buf.get_access(cgh); + cgh.single_task( + prog.get_kernel(), + [=]() { double_acc[0] = f64.get(); }); + }); + } + check(double_test, double_ref_override, "double"); + + } catch (const exception &e) { + std::cout << "an async SYCL exception was caught: " + << std::string(e.what()); + return 1; + } + return 0; +} From 0e6e02e9bac9378362a8047613c8ba9e164fb37e Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:19:34 +0800 Subject: [PATCH 04/35] split double tests in DeviceLib directory --- SYCL/DeviceLib/built-ins/nan.cpp | 8 --- SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp | 70 ++++++++++++++++++++ 2 files changed, 70 insertions(+), 8 deletions(-) create mode 100644 SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp diff --git a/SYCL/DeviceLib/built-ins/nan.cpp b/SYCL/DeviceLib/built-ins/nan.cpp index 61d66c41c9..d94af0d770 100644 --- a/SYCL/DeviceLib/built-ins/nan.cpp +++ b/SYCL/DeviceLib/built-ins/nan.cpp @@ -42,12 +42,8 @@ template void check_nan(s::queue &Queue) { int main() { test_nan_call(); test_nan_call(); - test_nan_call(); - test_nan_call(); test_nan_call(); test_nan_call(); - test_nan_call(); - test_nan_call(); s::queue Queue([](sycl::exception_list ExceptionList) { for 
(std::exception_ptr ExceptionPtr : ExceptionList) { @@ -65,9 +61,5 @@ int main() { check_nan(Queue); #endif check_nan(Queue); - if (Queue.get_device().has(sycl::aspect::fp64)) { - check_nan(Queue); - check_nan(Queue); - } return 0; } diff --git a/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp b/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp new file mode 100644 index 0000000000..1a568e245d --- /dev/null +++ b/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp @@ -0,0 +1,70 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -D HALF_IS_SUPPORTED %s -o %t_gpu.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include + +#include + +namespace s = cl::sycl; +using namespace std; + +template void test_nan_call() { + static_assert(is_same::value == Expected, ""); +} + +template struct test; + +template void check_nan(s::queue &Queue) { + R Data{0}; + s::vec VData{0}; + { + s::buffer Buf(&Data, s::range<1>(1)); + s::buffer, 1> VBuf(&VData, s::range<1>(1)); + Queue.submit([&](s::handler &Cgh) { + auto Acc = Buf.template get_access(Cgh); + auto VAcc = VBuf.template get_access(Cgh); + Cgh.single_task>([=]() { + Acc[0] = s::nan(T{0}); + VAcc[0] = s::nan(s::vec{0}); + }); + }); + Queue.wait_and_throw(); + } + assert(s::isnan(Data)); + assert(s::all(s::isnan(VData))); +} + +int main() { + queue q; + if (!q.get_device().has(aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + test_nan_call(); + test_nan_call(); + test_nan_call(); + test_nan_call(); + + s::queue Queue([](cl::sycl::exception_list ExceptionList) { + for (std::exception_ptr ExceptionPtr : ExceptionList) { + try { + std::rethrow_exception(ExceptionPtr); + } catch (cl::sycl::exception &E) { + std::cerr << E.what() << std::endl; + } catch (...) { + std::cerr << "Unknown async exception was caught." 
<< std::endl; + } + } + }); + + check_nan(Queue); + check_nan(Queue); + + return 0; +} From f7cab86a5f1d6eabd747d0e9c848905e55f4ca27 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:24:09 +0800 Subject: [PATCH 05/35] split double tests in ESIMD directory --- SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp | 230 +-------- SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp | 227 ++++++++ .../api/bin_and_cmp_ops_heavy_aspect-fp64.cpp | 55 ++ .../functions/functions_select_2d.hpp | 4 +- .../functions_select_2d_core_aspect-fp64.cpp | 42 ++ SYCL/ESIMD/api/saturation_smoke.cpp | 162 +----- SYCL/ESIMD/api/saturation_smoke.hpp | 162 ++++++ .../api/saturation_smoke_aspect-fp64.cpp | 42 ++ SYCL/ESIMD/api/simd_view_select_2d_fp.cpp | 1 - .../simd_view_select_2d_fp_aspect-fp64.cpp | 36 ++ SYCL/ESIMD/api/unary_ops_heavy.cpp | 126 +---- SYCL/ESIMD/api/unary_ops_heavy.hpp | 126 +++++ .../ESIMD/api/unary_ops_heavy_aspect-fp64.cpp | 53 ++ SYCL/ESIMD/ext_math.cpp | 445 +--------------- SYCL/ESIMD/ext_math.hpp | 441 ++++++++++++++++ SYCL/ESIMD/ext_math_aspect-fp64.cpp | 45 ++ .../regression/Inputs/dgetrf_aspect-fp64.hpp | 487 ++++++++++++++++++ SYCL/ESIMD/regression/dgetrf_8x8.cpp | 62 +-- .../regression/dgetrf_8x8_aspect-fp64.cpp | 314 +++++++++++ SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp | 19 + .../regression/dgetrf_ref_aspect-fp64.cpp | 19 + 21 files changed, 2107 insertions(+), 991 deletions(-) create mode 100644 SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp create mode 100644 SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp create mode 100644 SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp create mode 100644 SYCL/ESIMD/api/saturation_smoke.hpp create mode 100644 SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp create mode 100644 SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp create mode 100644 SYCL/ESIMD/api/unary_ops_heavy.hpp create mode 100644 SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp create mode 100644 SYCL/ESIMD/ext_math.hpp 
create mode 100644 SYCL/ESIMD/ext_math_aspect-fp64.cpp create mode 100644 SYCL/ESIMD/regression/Inputs/dgetrf_aspect-fp64.hpp create mode 100644 SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp create mode 100644 SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp create mode 100644 SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp index e0503e8597..c735f071f2 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp @@ -22,235 +22,11 @@ // larger than certain threshold. Might need to tune the cr0 once this feature // is available in ESIMD. // - -#include "../esimd_test_utils.hpp" - -#include -#include -#include +#include "bin_and_cmp_ops_heavy.hpp" using namespace sycl; using namespace sycl::ext::intel::esimd; -template class TestID; - -// Result type of a scalar binary Op -template -using scalar_comp_t = - std::conditional_t, - typename simd_mask<8>::element_type, - __ESIMD_DNS::computation_type_t>; - -// Result type of a vector binary Op -template -using comp_t = std::conditional_t< - N == 0, scalar_comp_t, - std::conditional_t, simd_mask, - simd<__ESIMD_DNS::computation_type_t, N>>>; - -// Helpers for printing -template auto cast(T val) { return val; } -template <> auto cast(char val) { return (int)val; } -template <> auto cast(unsigned char val) { - return (unsigned int)val; -} -#ifdef __SYCL_DEVICE_ONLY__ -template <> auto cast<_Float16>(_Float16 val) { return (float)val; } -#endif - -// Main test function. -// T1, T2 - operand types, -// VL - vector length, -// OpClass - binary or comparison operations, -// VerifyF and InitF - verification and initialization function types -// (instantiated within the test function), -// Ops - a compile-time sequence of operations to test. 
-// -template class VerifyF, - template class InitF, class Ops> -bool test(Ops ops, queue &q, comp_t epsilon = 0) { - // Log test case info - std::cout << "Testing T1=" << typeid(T1).name() << " T2=" << typeid(T2).name() - << ", VL=" << VL << " ...\n"; - std::cout << "Operations:"; - esimd_test::iterate_ops(ops, [=](OpClass op) { - std::cout << " '" << esimd_test::Op2Str(op) << "'"; - }); - std::cout << "\n"; - - // initialize test data - constexpr int Size = 1024 * 7; - T1 *A = sycl::malloc_shared(Size, q); - T2 *B = sycl::malloc_shared(Size, q); - constexpr int NumOps = (int)Ops::size; - int CSize = NumOps * Size; - using T = comp_t; - // Result array. For each pair of A[i] and B[i] elements it reserves NumOps - // elements to store result of all operations under test applied to the A[i] - // and B[i] - T *C = sycl::malloc_shared(CSize, q); - InitF init; - - for (int i = 0; i < Size; ++i) { - init(A, B, C, i); - } - - // submit the kernel - try { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for>( - Size / VL, [=](id<1> i) SYCL_ESIMD_KERNEL { - unsigned off = i * VL; - simd va(A + off, vector_aligned_tag{}); - simd vb(B + off, vector_aligned_tag{}); - - // applies each of the input operations to the va and vb vectors, - // then invokes the lambda below, passing the result of the - // operation, its ID and sequential number within the input sequence - esimd_test::apply_ops( - ops, va, vb, - [=](comp_t res, OpClass op, - unsigned op_num) { - unsigned res_off = off * NumOps + op_num * VL; - res.copy_to(C + res_off, vector_aligned_tag{}); - }); - }); - }); - e.wait(); - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - sycl::free(A, q); - sycl::free(B, q); - sycl::free(C, q); - return false; - } - - int err_cnt = 0; - - // now verify the results using provided verification function type - for (unsigned i = 0; i < Size / VL; ++i) { - unsigned off = i * VL; - - for (int j = 0; j < VL; ++j) { - T1 a = A[off + 
j]; - T2 b = B[off + j]; - - esimd_test::apply_ops( - ops, a, b, [&](T Gold, OpClass op, unsigned op_num) { - unsigned res_off = off * NumOps + op_num * VL; - T Res = C[res_off + j]; - using Tint = esimd_test::int_type_t; - Tint ResBits = *(Tint *)&Res; - Tint GoldBits = *(Tint *)&Gold; - VerifyF verify_f(epsilon); - - if (!verify_f(Gold, Res, op)) { - if (++err_cnt < 10) { - std::cout << " failed at index " << (res_off + j) << ", op " - << esimd_test::Op2Str(op) << ": " << cast(Res) - << "(0x" << std::hex << ResBits << ")" - << " != " << std::dec << cast(Gold) << "(0x" - << std::hex << GoldBits << ") [" << std::dec - << cast(a) << " " << esimd_test::Op2Str(op) << " " - << cast(b) << "]\n"; - } - } - }); - } - } - if (err_cnt > 0) { - auto Size1 = NumOps * Size; - std::cout << " pass rate: " - << ((float)(Size1 - err_cnt) / (float)Size1) * 100.0f << "% (" - << (Size1 - err_cnt) << "/" << Size1 << ")\n"; - } - - free(A, q); - free(B, q); - free(C, q); - std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n"); - return err_cnt == 0; -} - -// Flavours of verification function types. - -template struct verify_strict { - using T = comp_t; - - verify_strict(T) {} - - bool operator()(T res, T gold, OpClass op) { return res == gold; } -}; - -#define EQ(x, y, epsilon) \ - ((x) > (y) ? (x) - (y) <= epsilon : (y) - (x) <= epsilon) - -template struct verify_epsilon { - using T = comp_t; - T epsilon; - verify_epsilon(T epsilon) : epsilon(epsilon) {} - - bool operator()(T res, T gold, OpClass op) { - if constexpr (std::is_same_v) { - if (op == esimd_test::BinaryOp::div) { - return EQ(res, gold, epsilon); - } - } - return res == gold; - } -}; - -template struct verify_n { - using T = comp_t; - int n; - verify_n(int n) : n(n) {} - - bool operator()(T res, T gold, OpClass op) { - using Tint = esimd_test::int_type_t; - Tint res_bits = *(Tint *)&res; - Tint gold_bits = *(Tint *)&gold; - return (abs(gold_bits - res_bits) > n) ? 
false : true; - } -}; - -// Flavours of initialization function types. - -template struct init_default { - using T = comp_t; - - void operator()(T1 *A, T2 *B, T *C, int i) { - A[i] = (i % 3) * 90 + 10; /*10, 100, 190, 10, ...*/ - if constexpr (std::is_unsigned_v) { - B[i] = (i % 3) * 99 + 1 /*1, 100, 199, 1, ...*/; - } else { - B[i] = (i % 4) * 180 - 170; /*-170, 10, 190, 370, -170,...*/ - } - C[i] = 0; - } -}; - -template struct init_for_shift { - using T = comp_t; - - void operator()(T1 *A, T2 *B, T *C, int i) { - if constexpr (std::is_unsigned_v) { - A[i] = (i % 3) + 100; /*100, 101, 102, 100, ...*/ - } else { - A[i] = (i % 4) * 100 - 150; /*-150, -50, 50, 150, -150, ...*/ - } - B[i] = (i % 3); - C[i] = 0; - } -}; - -// shortcuts for less clutter -template using VSf = verify_strict; -template using VEf = verify_epsilon; -template using VNf = verify_n; -template using IDf = init_default; -template using ISf = init_for_shift; - int main(void) { queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); @@ -262,11 +38,9 @@ int main(void) { auto arith_ops = esimd_test::ArithBinaryOps; passed &= test(arith_ops, q); passed &= test(arith_ops, q, 0.000001f); - passed &= test(arith_ops, q); passed &= test(arith_ops, q, 0.000001f); passed &= test(arith_ops, q, 1); passed &= test(arith_ops, q, 1); - passed &= test(arith_ops, q); passed &= test(arith_ops, q); auto int_ops = @@ -290,11 +64,9 @@ int main(void) { auto cmp_ops = esimd_test::CmpOps; passed &= test(cmp_ops, q); passed &= test(cmp_ops, q); - passed &= test(cmp_ops, q); passed &= test(cmp_ops, q); passed &= test(cmp_ops, q, 1); passed &= test(cmp_ops, q, 1); - passed &= test(cmp_ops, q); passed &= test(cmp_ops, q); std::cout << (passed ? 
"Test PASSED\n" : "Test FAILED\n"); diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp new file mode 100644 index 0000000000..71078d31c5 --- /dev/null +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp @@ -0,0 +1,227 @@ +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace cl::sycl; +using namespace sycl::ext::intel::esimd; + +template class TestID; + +// Result type of a scalar binary Op +template +using scalar_comp_t = + std::conditional_t, + typename simd_mask<8>::element_type, + __ESIMD_DNS::computation_type_t>; + +// Result type of a vector binary Op +template +using comp_t = std::conditional_t< + N == 0, scalar_comp_t, + std::conditional_t, simd_mask, + simd<__ESIMD_DNS::computation_type_t, N>>>; + +// Helpers for printing +template auto cast(T val) { return val; } +template <> auto cast(char val) { return (int)val; } +template <> auto cast(unsigned char val) { + return (unsigned int)val; +} +#ifdef __SYCL_DEVICE_ONLY__ +template <> auto cast<_Float16>(_Float16 val) { return (float)val; } +#endif + +// Main test function. +// T1, T2 - operand types, +// VL - vector length, +// OpClass - binary or comparison operations, +// VerifyF and InitF - verification and initialization function types +// (instantiated within the test function), +// Ops - a compile-time sequence of operations to test. 
+// +template class VerifyF, + template class InitF, class Ops> +bool test(Ops ops, queue &q, comp_t epsilon = 0) { + // Log test case info + std::cout << "Testing T1=" << typeid(T1).name() << " T2=" << typeid(T2).name() + << ", VL=" << VL << " ...\n"; + std::cout << "Operations:"; + esimd_test::iterate_ops(ops, [=](OpClass op) { + std::cout << " '" << esimd_test::Op2Str(op) << "'"; + }); + std::cout << "\n"; + + // initialize test data + constexpr int Size = 1024 * 7; + T1 *A = sycl::malloc_shared(Size, q); + T2 *B = sycl::malloc_shared(Size, q); + constexpr int NumOps = (int)Ops::size; + int CSize = NumOps * Size; + using T = comp_t; + // Result array. For each pair of A[i] and B[i] elements it reserves NumOps + // elements to store result of all operations under test applied to the A[i] + // and B[i] + T *C = sycl::malloc_shared(CSize, q); + InitF init; + + for (int i = 0; i < Size; ++i) { + init(A, B, C, i); + } + + // submit the kernel + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for>( + Size / VL, [=](id<1> i) SYCL_ESIMD_KERNEL { + unsigned off = i * VL; + simd va(A + off, vector_aligned_tag{}); + simd vb(B + off, vector_aligned_tag{}); + + // applies each of the input operations to the va and vb vectors, + // then invokes the lambda below, passing the result of the + // operation, its ID and sequential number within the input sequence + esimd_test::apply_ops( + ops, va, vb, + [=](comp_t res, OpClass op, + unsigned op_num) { + unsigned res_off = off * NumOps + op_num * VL; + res.copy_to(C + res_off, vector_aligned_tag{}); + }); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(A, q); + sycl::free(B, q); + sycl::free(C, q); + return false; + } + + int err_cnt = 0; + + // now verify the results using provided verification function type + for (unsigned i = 0; i < Size / VL; ++i) { + unsigned off = i * VL; + + for (int j = 0; j < VL; ++j) { + T1 a = A[off + 
j]; + T2 b = B[off + j]; + + esimd_test::apply_ops( + ops, a, b, [&](T Gold, OpClass op, unsigned op_num) { + unsigned res_off = off * NumOps + op_num * VL; + T Res = C[res_off + j]; + using Tint = esimd_test::int_type_t; + Tint ResBits = *(Tint *)&Res; + Tint GoldBits = *(Tint *)&Gold; + VerifyF verify_f(epsilon); + + if (!verify_f(Gold, Res, op)) { + if (++err_cnt < 10) { + std::cout << " failed at index " << (res_off + j) << ", op " + << esimd_test::Op2Str(op) << ": " << cast(Res) + << "(0x" << std::hex << ResBits << ")" + << " != " << std::dec << cast(Gold) << "(0x" + << std::hex << GoldBits << ") [" << std::dec + << cast(a) << " " << esimd_test::Op2Str(op) << " " + << cast(b) << "]\n"; + } + } + }); + } + } + if (err_cnt > 0) { + auto Size1 = NumOps * Size; + std::cout << " pass rate: " + << ((float)(Size1 - err_cnt) / (float)Size1) * 100.0f << "% (" + << (Size1 - err_cnt) << "/" << Size1 << ")\n"; + } + + free(A, q); + free(B, q); + free(C, q); + std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n"); + return err_cnt == 0; +} + +// Flavours of verification function types. + +template struct verify_strict { + using T = comp_t; + + verify_strict(T) {} + + bool operator()(T res, T gold, OpClass op) { return res == gold; } +}; + +#define EQ(x, y, epsilon) \ + ((x) > (y) ? (x) - (y) <= epsilon : (y) - (x) <= epsilon) + +template struct verify_epsilon { + using T = comp_t; + T epsilon; + verify_epsilon(T epsilon) : epsilon(epsilon) {} + + bool operator()(T res, T gold, OpClass op) { + if constexpr (std::is_same_v) { + if (op == esimd_test::BinaryOp::div) { + return EQ(res, gold, epsilon); + } + } + return res == gold; + } +}; + +template struct verify_n { + using T = comp_t; + int n; + verify_n(int n) : n(n) {} + + bool operator()(T res, T gold, OpClass op) { + using Tint = esimd_test::int_type_t; + Tint res_bits = *(Tint *)&res; + Tint gold_bits = *(Tint *)&gold; + return (abs(gold_bits - res_bits) > n) ? 
false : true; + } +}; + +// Flavours of initialization function types. + +template struct init_default { + using T = comp_t; + + void operator()(T1 *A, T2 *B, T *C, int i) { + A[i] = (i % 3) * 90 + 10; /*10, 100, 190, 10, ...*/ + if constexpr (std::is_unsigned_v) { + B[i] = (i % 3) * 99 + 1 /*1, 100, 199, 1, ...*/; + } else { + B[i] = (i % 4) * 180 - 170; /*-170, 10, 190, 370, -170,...*/ + } + C[i] = 0; + } +}; + +template struct init_for_shift { + using T = comp_t; + + void operator()(T1 *A, T2 *B, T *C, int i) { + if constexpr (std::is_unsigned_v) { + A[i] = (i % 3) + 100; /*100, 101, 102, 100, ...*/ + } else { + A[i] = (i % 4) * 100 - 150; /*-150, -50, 50, 150, -150, ...*/ + } + B[i] = (i % 3); + C[i] = 0; + } +}; + +// shortcuts for less clutter +template using VSf = verify_strict; +template using VEf = verify_epsilon; +template using VNf = verify_n; +template using IDf = init_default; +template using ISf = init_for_shift; diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp new file mode 100644 index 0000000000..131728367f --- /dev/null +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp @@ -0,0 +1,55 @@ +//==--------------- bin_un_cmp_ops_heavy_aspect-fp64.cpp - DPC++ ESIMD +//on-device test -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-fp64, gpu +// UNSUPPORTED: cuda || hip +// TODO: esimd_emulator fails due to unimplemented 'half' type +// XFAIL: esimd_emulator +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// Tests various binary operations applied to simd objects. 
+ +// TODO +// Arithmetic operations behaviour depends on Gen's control register's rounding +// mode, which is RTNE by default: +// cr0.5:4 is 00b = Round to Nearest or Even (RTNE) +// For half this leads to divergence between Gen and host (emulated) results +// larger than certain threshold. Might need to tune the cr0 once this feature +// is available in ESIMD. +// + +#include "bin_and_cmp_ops_heavy.hpp" + +using namespace cl::sycl; +using namespace sycl::ext::intel::esimd; + +int main(void) { + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + if (!q.get_device().has(aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + bool passed = true; + using BinOp = esimd_test::BinaryOp; + + auto arith_ops = esimd_test::ArithBinaryOps; + passed &= test(arith_ops, q); + passed &= test(arith_ops, q); + + using CmpOp = esimd_test::CmpOp; + auto cmp_ops = esimd_test::CmpOps; + passed &= test(cmp_ops, q); + passed &= test(cmp_ops, q); + + std::cout << (passed ? "Test PASSED\n" : "Test FAILED\n"); + return passed ? 0 : 1; +} diff --git a/SYCL/ESIMD/api/functional/functions/functions_select_2d.hpp b/SYCL/ESIMD/api/functional/functions/functions_select_2d.hpp index cc19d2cca7..84fac0b20b 100644 --- a/SYCL/ESIMD/api/functional/functions/functions_select_2d.hpp +++ b/SYCL/ESIMD/api/functional/functions/functions_select_2d.hpp @@ -224,8 +224,10 @@ bool run_test_for_types(sycl::queue &queue) { const auto great_size = get_dimensions(); #ifdef SIMD_RUN_TEST_WITH_SYCL_HALF_TYPE const auto all_types = get_tested_types(); -#else +#elif defined(SIMD_RUN_TEST_WITH_SYCL_DOUBLE_TYPE) const auto all_types = named_type_pack::generate("double"); +#else + const auto all_types = named_type_pack::generate("float"); #endif // Verify correctness for different select sizes. 
diff --git a/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp b/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp new file mode 100644 index 0000000000..2ac0db0279 --- /dev/null +++ b/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp @@ -0,0 +1,42 @@ +//==-- functions_select_2d_core_aspect-fp64.cpp - DPC++ ESIMD on-device test +// ----------------------------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, level_zero, aspect-fp64 +// XREQUIRES: gpu +// TODO gpu and level_zero in REQUIRES due to only this platforms supported yet. +// The current "REQUIRES" should be replaced with "gpu" only as mentioned in +// "XREQUIRES". +// UNSUPPORTED: cuda, hip +// RUN: %clangxx -fsycl -DSIMD_RUN_TEST_WITH_SYCL_DOUBLE_TYPE %s -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// Test for simd select for 2d function. +// The test creates source simd instance with reference data and invokes logical +// not operator, using core data types. +// The test verifies that selected values can be changed with avoid to change +// values, that hasn't been selected. + +#include "functions_select_2d.hpp" + +using namespace sycl::ext::intel::experimental::esimd; +using namespace esimd_test::api::functional; + +int main(int, char **) { + sycl::queue queue(esimd_test::ESIMDSelector{}, + esimd_test::createExceptionHandler()); + + if (!queue.get_device().has(sycl::aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + bool passed = functions::run_test_for_types(queue); + + std::cout << (passed ? "=== Test passed\n" : "=== Test FAILED\n"); + return passed ? 
0 : 1; +} diff --git a/SYCL/ESIMD/api/saturation_smoke.cpp b/SYCL/ESIMD/api/saturation_smoke.cpp index e6aa049160..caad4c2a09 100644 --- a/SYCL/ESIMD/api/saturation_smoke.cpp +++ b/SYCL/ESIMD/api/saturation_smoke.cpp @@ -14,169 +14,11 @@ // // The test checks main functionality of esimd::saturate function. -#include "../esimd_test_utils.hpp" - -#include -#include -#include +#include "saturation_smoke.hpp" using namespace sycl; using namespace sycl::ext::intel::esimd; -template struct char_to_int { - using type = typename std::conditional< - sizeof(T) == 1, - typename std::conditional::value, int, unsigned>::type, - T>::type; -}; - -template bool verify(T *data_arr, T *gold_arr, int N) { - int err_cnt = 0; - - for (unsigned i = 0; i < N; ++i) { - T val = data_arr[i]; - T gold = gold_arr[i]; - - if (val != gold) { - if (++err_cnt < 10) { - using T1 = typename char_to_int::type; - std::cout << " failed at index " << i << ": " << (T1)val - << " != " << (T1)gold << " (gold)\n"; - } - } - } - if (err_cnt > 0) { - std::cout << " pass rate: " << ((float)(N - err_cnt) / (float)N) * 100.0f - << "% (" << (N - err_cnt) << "/" << N << ")\n"; - } - return err_cnt == 0; -} - -template struct DataMgr { - From *src; - To *dst; - To *gold; - static inline constexpr int N = Nx; - - DataMgr(From (&&src_data)[N], To (&&gold_data)[N]) { - src = new From[N]; - dst = new To[N]; - gold = new To[N]; - - for (int i = 0; i < N; i++) { - src[i] = src_data[i]; - dst[i] = (To)2; // 0, 1 can be results of saturation, so use 2 - gold[i] = gold_data[i]; - } - } - - ~DataMgr() { - delete[] src; - delete[] dst; - delete[] gold; - } -}; - -template class Mgr> -bool test(queue q) { - std::cout << "Testing " << typeid(From).name() << " -> " << typeid(To).name() - << "\n"; - - Mgr dm; - constexpr int N = Mgr::N; - - try { - sycl::buffer src_buf(dm.src, N); - sycl::buffer dst_buf(dm.dst, N); - - auto e = q.submit([&](handler &cgh) { - auto src_acc = src_buf.template get_access(cgh); - auto dst_acc = 
dst_buf.template get_access(cgh); - - cgh.single_task([=]() SYCL_ESIMD_KERNEL { - simd x(src_acc, 0); - simd y = saturate(x); - y.copy_to(dst_acc, 0); - }); - }); - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - return false; // not success - } - return verify(dm.dst, dm.gold, N); -} - -// clang-format off -template struct FpToInt : public DataMgr { - static_assert( - (std::is_floating_point_v || std::is_same_v) && - std::is_integral_v); - static inline constexpr int N = 2; - - FpToInt() : DataMgr( - // need this trick with -127 + 130 because INT_MAX is not accurately - // representable with float, and compiler warns: - // implicit conversion from 'int' to 'const float' changes value from - // 2147483647 to 2147483648 - // INT_MAX-127 is accurately representable with float. Use +130 to exceed - // representable range to actually test saturation. - // Test data: - { (From)std::numeric_limits::min() - 10, - (From)(std::numeric_limits::max()-127) + 130 }, - // Gold data (saturated test data): - { std::numeric_limits::min(), - std::numeric_limits::max() }) - {} -}; - -template -struct UIntToSameOrNarrowAnyInt : public DataMgr { - static_assert(std::is_integral_v && std::is_integral_v && - !std::is_signed_v && (sizeof(From) >= sizeof(To))); - static inline constexpr int N = 1; - - UIntToSameOrNarrowAnyInt() : DataMgr( - { (From)((From)std::numeric_limits::max() + (From)10) }, - { (To)std::numeric_limits::max() }) - {} -}; - -template -struct IntToWiderUInt : public DataMgr { - static_assert(std::is_signed_v && !std::is_signed_v && - (sizeof(From) < sizeof(To))); - static inline constexpr int N = 1; - - IntToWiderUInt() : DataMgr( - { (From)-1 }, - { (To)0 }) - {} -}; - -template -struct SIntToNarrowAnyInt : public DataMgr { - static_assert(std::is_integral_v && std::is_signed_v && - std::is_integral_v && (sizeof(From) > sizeof(To))); - static inline constexpr int N = 2; - - SIntToNarrowAnyInt() : DataMgr( - { 
(From)std::numeric_limits::max() + 10, - (From)std::numeric_limits::min() - 10 }, - { (To)std::numeric_limits::max(), - (To)std::numeric_limits::min() }) - {} -}; - -template struct FpToFp : public DataMgr { - static_assert((std::is_floating_point_v || std::is_same_v)); - static inline constexpr int N = 5; - - FpToFp() : DataMgr( - { (From)-10, (From)0, (From)0.5, (From)1, (From)10 }, - { (To)0, (To)0, (To)((From)0.5), (To)1, (To)1 }) - {} -}; - // clang-format on int main(int argc, char **argv) { @@ -188,7 +30,6 @@ int main(int argc, char **argv) { passed &= test(q); passed &= test(q); passed &= test(q); - passed &= test(q); passed &= test(q); passed &= test(q); @@ -205,7 +46,6 @@ int main(int argc, char **argv) { passed &= test(q); passed &= test(q); - passed &= test(q); std::cout << (passed ? "Test passed\n" : "Test FAILED\n"); return passed ? 0 : 1; diff --git a/SYCL/ESIMD/api/saturation_smoke.hpp b/SYCL/ESIMD/api/saturation_smoke.hpp new file mode 100644 index 0000000000..9b11b97150 --- /dev/null +++ b/SYCL/ESIMD/api/saturation_smoke.hpp @@ -0,0 +1,162 @@ +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace cl::sycl; +using namespace sycl::ext::intel::esimd; + +template struct char_to_int { + using type = typename std::conditional< + sizeof(T) == 1, + typename std::conditional::value, int, unsigned>::type, + T>::type; +}; + +template bool verify(T *data_arr, T *gold_arr, int N) { + int err_cnt = 0; + + for (unsigned i = 0; i < N; ++i) { + T val = data_arr[i]; + T gold = gold_arr[i]; + + if (val != gold) { + if (++err_cnt < 10) { + using T1 = typename char_to_int::type; + std::cout << " failed at index " << i << ": " << (T1)val + << " != " << (T1)gold << " (gold)\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " << ((float)(N - err_cnt) / (float)N) * 100.0f + << "% (" << (N - err_cnt) << "/" << N << ")\n"; + } + return err_cnt == 0; +} + +template struct DataMgr { + From *src; + To *dst; + To *gold; + static 
inline constexpr int N = Nx; + + DataMgr(From (&&src_data)[N], To (&&gold_data)[N]) { + src = new From[N]; + dst = new To[N]; + gold = new To[N]; + + for (int i = 0; i < N; i++) { + src[i] = src_data[i]; + dst[i] = (To)2; // 0, 1 can be results of saturation, so use 2 + gold[i] = gold_data[i]; + } + } + + ~DataMgr() { + delete[] src; + delete[] dst; + delete[] gold; + } +}; + +template class Mgr> +bool test(queue q) { + std::cout << "Testing " << typeid(From).name() << " -> " << typeid(To).name() + << "\n"; + + Mgr dm; + constexpr int N = Mgr::N; + + try { + sycl::buffer src_buf(dm.src, N); + sycl::buffer dst_buf(dm.dst, N); + + auto e = q.submit([&](handler &cgh) { + auto src_acc = src_buf.template get_access(cgh); + auto dst_acc = dst_buf.template get_access(cgh); + + cgh.single_task([=]() SYCL_ESIMD_KERNEL { + simd x(src_acc, 0); + simd y = saturate(x); + y.copy_to(dst_acc, 0); + }); + }); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return false; // not success + } + return verify(dm.dst, dm.gold, N); +} + +// clang-format off +template struct FpToInt : public DataMgr { + static_assert( + (std::is_floating_point_v || std::is_same_v) && + std::is_integral_v); + static inline constexpr int N = 2; + + FpToInt() : DataMgr( + // need this trick with -127 + 130 because INT_MAX is not accurately + // representable with float, and compiler warns: + // implicit conversion from 'int' to 'const float' changes value from + // 2147483647 to 2147483648 + // INT_MAX-127 is accurately representable with float. Use +130 to exceed + // representable range to actually test saturation. 
+ // Test data: + { (From)std::numeric_limits::min() - 10, + (From)(std::numeric_limits::max()-127) + 130 }, + // Gold data (saturated test data): + { std::numeric_limits::min(), + std::numeric_limits::max() }) + {} +}; + +template +struct UIntToSameOrNarrowAnyInt : public DataMgr { + static_assert(std::is_integral_v && std::is_integral_v && + !std::is_signed_v && (sizeof(From) >= sizeof(To))); + static inline constexpr int N = 1; + + UIntToSameOrNarrowAnyInt() : DataMgr( + { (From)((From)std::numeric_limits::max() + (From)10) }, + { (To)std::numeric_limits::max() }) + {} +}; + +template +struct IntToWiderUInt : public DataMgr { + static_assert(std::is_signed_v && !std::is_signed_v && + (sizeof(From) < sizeof(To))); + static inline constexpr int N = 1; + + IntToWiderUInt() : DataMgr( + { (From)-1 }, + { (To)0 }) + {} +}; + +template +struct SIntToNarrowAnyInt : public DataMgr { + static_assert(std::is_integral_v && std::is_signed_v && + std::is_integral_v && (sizeof(From) > sizeof(To))); + static inline constexpr int N = 2; + + SIntToNarrowAnyInt() : DataMgr( + { (From)std::numeric_limits::max() + 10, + (From)std::numeric_limits::min() - 10 }, + { (To)std::numeric_limits::max(), + (To)std::numeric_limits::min() }) + {} +}; + +template struct FpToFp : public DataMgr { + static_assert((std::is_floating_point_v || std::is_same_v)); + static inline constexpr int N = 5; + + FpToFp() : DataMgr( + { (From)-10, (From)0, (From)0.5, (From)1, (From)10 }, + { (To)0, (To)0, (To)((From)0.5), (To)1, (To)1 }) + {} +}; diff --git a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp new file mode 100644 index 0000000000..3a18cbabf4 --- /dev/null +++ b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp @@ -0,0 +1,42 @@ +//==---- saturation_smoke_aspect-fp64.cpp - DPC++ ESIMD on-device test -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-fp64 +// UNSUPPORTED: cuda || hip +// TODO: esimd_emulator fails due to unimplemented 'half' type +// XFAIL: esimd_emulator +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// The test checks main functionality of esimd::saturate function. + +#include "saturation_smoke.hpp" + +using namespace cl::sycl; +using namespace sycl::ext::intel::esimd; + +// clang-format on + +int main(int argc, char **argv) { + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + if (!q.get_device().has(sycl::aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + bool passed = true; + + passed &= test(q); + + passed &= test(q); + + std::cout << (passed ? "Test passed\n" : "Test FAILED\n"); + return passed ? 0 : 1; +} diff --git a/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp b/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp index 7eca9af2de..db122c5db4 100644 --- a/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp +++ b/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp @@ -25,7 +25,6 @@ int main(int argc, char **argv) { bool passed = true; passed &= test(q); passed &= test(q); - passed &= test(q); std::cout << (passed ? "=== Test passed\n" : "=== Test FAILED\n"); return passed ? 0 : 1; diff --git a/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp b/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp new file mode 100644 index 0000000000..bc3ece4dbe --- /dev/null +++ b/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp @@ -0,0 +1,36 @@ +//==- simd_view_select_2d_fp_aspect-fp64.cpp - DPC++ ESIMD on-device test -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-fp64 +// UNSUPPORTED: cuda || hip +// TODO: esimd_emulator fails due to unimplemented 'single_task()' method +// XFAIL: esimd_emulator +// RUN: %clangxx -fsycl %s -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +// Smoke test for 2D region select API which can be used to represent 2D tiles. +// Tests FP types. + +#include "simd_view_select_2d.hpp" + +int main(int argc, char **argv) { + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + if (!q.get_device().has(sycl::aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + bool passed = true; + + passed &= test(q); + + std::cout << (passed ? "=== Test passed\n" : "=== Test FAILED\n"); + return passed ? 0 : 1; +} diff --git a/SYCL/ESIMD/api/unary_ops_heavy.cpp b/SYCL/ESIMD/api/unary_ops_heavy.cpp index 2a4940fe9a..a64f61f3e3 100644 --- a/SYCL/ESIMD/api/unary_ops_heavy.cpp +++ b/SYCL/ESIMD/api/unary_ops_heavy.cpp @@ -23,133 +23,11 @@ // is available in ESIMD. // -#include "../esimd_test_utils.hpp" - -#include -#include -#include +#include "unary_ops_heavy.hpp" using namespace sycl; using namespace sycl::ext::intel::esimd; -template class TestID; - -// Helpers for printing -template auto cast(T val) { return val; } -template <> auto cast(char val) { return (int)val; } -template <> auto cast(unsigned char val) { - return (unsigned int)val; -} -#ifdef __SYCL_DEVICE_ONLY__ -template <> auto cast<_Float16>(_Float16 val) { return (float)val; } -#endif - -// Main test function. -// T - operand type, -// VL - vector length, -// Ops - a compile-time sequence of operations to test. 
-// -template class SimdT = simd> -bool test(Ops ops, queue &q) { - using OpClass = esimd_test::UnaryOp; - // Log test case info - std::cout << "Testing T=" << typeid(T).name() << ", VL=" << VL << " ...\n"; - std::cout << "Operations:"; - esimd_test::iterate_ops(ops, [=](OpClass op) { - std::cout << " '" << esimd_test::Op2Str(op) << "'"; - }); - std::cout << "\n"; - - // initialize test data - constexpr int Size = 1024 * 7; - T *A = sycl::malloc_shared(Size, q); - constexpr int NumOps = (int)Ops::size; - int CSize = NumOps * Size; - T *C = sycl::malloc_shared(CSize, q); - - for (int i = 0; i < Size; ++i) { - if constexpr (std::is_unsigned_v) { - A[i] = i; - } else { - A[i] = i - Size / 2; - } - C[i] = 0; - } - - // submit the kernel - try { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for>( - Size / VL, [=](id<1> i) SYCL_ESIMD_KERNEL { - unsigned off = i * VL; - SimdT va(A + off); - // applies each of the input operations to the va, - // then invokes the lambda below, passing the result of the - // operation, its ID and sequential number within the input sequence - esimd_test::apply_unary_ops( - ops, va, [=](SimdT res, OpClass op, unsigned op_num) { - unsigned res_off = off * NumOps + op_num * VL; - res.copy_to(C + res_off); - }); - }); - }); - e.wait(); - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - sycl::free(A, q); - sycl::free(C, q); - return false; - } - - int err_cnt = 0; - - // now verify the results using provided verification function type - for (unsigned i = 0; i < Size / VL; ++i) { - unsigned off = i * VL; - - for (int j = 0; j < VL; ++j) { - T a = A[off + j]; - - esimd_test::apply_unary_ops( - ops, a, [&](T Gold, OpClass op, unsigned op_num) { - unsigned res_off = off * NumOps + op_num * VL; - T Res = C[res_off + j]; - using Tint = esimd_test::int_type_t; - Tint ResBits = *(Tint *)&Res; - Tint GoldBits = *(Tint *)&Gold; - // allow 1 bit discrepancy for half on modifying op - int delta = 
((int)op >= (int)OpClass::minus_minus_pref) && - ((int)op <= (int)OpClass::plus_plus_inf) && - std::is_same_v - ? 1 - : 0; - - if ((Gold != Res) && (abs(ResBits - GoldBits) > delta)) { - if (++err_cnt < 10) { - std::cout << " failed at index " << (res_off + j) << ", op " - << esimd_test::Op2Str(op) << ": " << cast(Res) - << "(0x" << std::hex << ResBits << ")" - << " != " << cast(Gold) << "(0x" << std::hex - << GoldBits << ") [" << esimd_test::Op2Str(op) << " " - << std::dec << cast(a) << "]\n"; - } - } - }); - } - } - if (err_cnt > 0) { - auto Size1 = NumOps * Size; - std::cout << " pass rate: " - << ((float)(Size1 - err_cnt) / (float)Size1) * 100.0f << "% (" - << (Size1 - err_cnt) << "/" << Size1 << ")\n"; - } - - free(A, q); - free(C, q); - std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n"); - return err_cnt == 0; -} - int main(void) { queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); @@ -172,7 +50,6 @@ int main(void) { passed &= test(mod_ops, q); passed &= test(mod_ops, q); passed &= test(mod_ops, q); - passed &= test(mod_ops, q); auto singed_ops = esimd_test::OpSeq{}; passed &= test(singed_ops, q); @@ -181,7 +58,6 @@ int main(void) { passed &= test(singed_ops, q); passed &= test(singed_ops, q); passed &= test(singed_ops, q); - passed &= test(singed_ops, q); auto bit_ops = esimd_test::OpSeq{}; passed &= test(bit_ops, q); diff --git a/SYCL/ESIMD/api/unary_ops_heavy.hpp b/SYCL/ESIMD/api/unary_ops_heavy.hpp new file mode 100644 index 0000000000..babd26a5cd --- /dev/null +++ b/SYCL/ESIMD/api/unary_ops_heavy.hpp @@ -0,0 +1,126 @@ +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace cl::sycl; +using namespace sycl::ext::intel::esimd; + +template class TestID; + +// Helpers for printing +template auto cast(T val) { return val; } +template <> auto cast(char val) { return (int)val; } +template <> auto cast(unsigned char val) { + return (unsigned int)val; +} +#ifdef __SYCL_DEVICE_ONLY__ +template <> auto 
cast<_Float16>(_Float16 val) { return (float)val; } +#endif + +// Main test function. +// T - operand type, +// VL - vector length, +// Ops - a compile-time sequence of operations to test. +// +template class SimdT = simd> +bool test(Ops ops, queue &q) { + using OpClass = esimd_test::UnaryOp; + // Log test case info + std::cout << "Testing T=" << typeid(T).name() << ", VL=" << VL << " ...\n"; + std::cout << "Operations:"; + esimd_test::iterate_ops(ops, [=](OpClass op) { + std::cout << " '" << esimd_test::Op2Str(op) << "'"; + }); + std::cout << "\n"; + + // initialize test data + constexpr int Size = 1024 * 7; + T *A = sycl::malloc_shared(Size, q); + constexpr int NumOps = (int)Ops::size; + int CSize = NumOps * Size; + T *C = sycl::malloc_shared(CSize, q); + + for (int i = 0; i < Size; ++i) { + if constexpr (std::is_unsigned_v) { + A[i] = i; + } else { + A[i] = i - Size / 2; + } + C[i] = 0; + } + + // submit the kernel + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for>( + Size / VL, [=](id<1> i) SYCL_ESIMD_KERNEL { + unsigned off = i * VL; + SimdT va(A + off); + // applies each of the input operations to the va, + // then invokes the lambda below, passing the result of the + // operation, its ID and sequential number within the input sequence + esimd_test::apply_unary_ops( + ops, va, [=](SimdT res, OpClass op, unsigned op_num) { + unsigned res_off = off * NumOps + op_num * VL; + res.copy_to(C + res_off); + }); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(A, q); + sycl::free(C, q); + return false; + } + + int err_cnt = 0; + + // now verify the results using provided verification function type + for (unsigned i = 0; i < Size / VL; ++i) { + unsigned off = i * VL; + + for (int j = 0; j < VL; ++j) { + T a = A[off + j]; + + esimd_test::apply_unary_ops( + ops, a, [&](T Gold, OpClass op, unsigned op_num) { + unsigned res_off = off * NumOps + op_num * VL; + T Res = 
C[res_off + j]; + using Tint = esimd_test::int_type_t; + Tint ResBits = *(Tint *)&Res; + Tint GoldBits = *(Tint *)&Gold; + // allow 1 bit discrepancy for half on modifying op + int delta = ((int)op >= (int)OpClass::minus_minus_pref) && + ((int)op <= (int)OpClass::plus_plus_inf) && + std::is_same_v + ? 1 + : 0; + + if ((Gold != Res) && (abs(ResBits - GoldBits) > delta)) { + if (++err_cnt < 10) { + std::cout << " failed at index " << (res_off + j) << ", op " + << esimd_test::Op2Str(op) << ": " << cast(Res) + << "(0x" << std::hex << ResBits << ")" + << " != " << cast(Gold) << "(0x" << std::hex + << GoldBits << ") [" << esimd_test::Op2Str(op) << " " + << std::dec << cast(a) << "]\n"; + } + } + }); + } + } + if (err_cnt > 0) { + auto Size1 = NumOps * Size; + std::cout << " pass rate: " + << ((float)(Size1 - err_cnt) / (float)Size1) * 100.0f << "% (" + << (Size1 - err_cnt) << "/" << Size1 << ")\n"; + } + + free(A, q); + free(C, q); + std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n"); + return err_cnt == 0; +} diff --git a/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp new file mode 100644 index 0000000000..09f6a09fb4 --- /dev/null +++ b/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp @@ -0,0 +1,53 @@ +//==------ unary_ops_heavy_aspect-fp64.cpp - DPC++ ESIMD on-device test ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-fp64 +// UNSUPPORTED: cuda || hip +// TODO: esimd_emulator fails due to unimplemented 'half' type +// XFAIL: esimd_emulator +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// Tests various unary operations applied to simd objects. 
+ +// TODO +// Arithmetic operations behaviour depends on Gen's control register's rounding +// mode, which is RTNE by default: +// cr0.5:4 is 00b = Round to Nearest or Even (RTNE) +// For half this leads to divergence between Gen and host (emulated) results +// larger than certain threshold. Might need to tune the cr0 once this feature +// is available in ESIMD. +// + +#include "unary_ops_heavy.hpp" + +using namespace cl::sycl; +using namespace sycl::ext::intel::esimd; + +int main(void) { + queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + if (!q.get_device().has(sycl::aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + bool passed = true; + using UnOp = esimd_test::UnaryOp; + + auto mod_ops = + esimd_test::OpSeq{}; + passed &= test(mod_ops, q); + + auto singed_ops = esimd_test::OpSeq{}; + passed &= test(singed_ops, q); + + std::cout << (passed ? "Test passed\n" : "Test FAILED\n"); + return passed ? 0 : 1; +} diff --git a/SYCL/ESIMD/ext_math.cpp b/SYCL/ESIMD/ext_math.cpp index 1b910900e4..3dd65e823f 100644 --- a/SYCL/ESIMD/ext_math.cpp +++ b/SYCL/ESIMD/ext_math.cpp @@ -17,448 +17,11 @@ // - math function - sin, cos, ..., div_ieee, pow // - SYCL vs ESIMD APIs -#include "esimd_test_utils.hpp" +#include "ext_math.hpp" -#include -#include -#include - -#include -#include - -using namespace sycl; +using namespace cl::sycl; using namespace sycl::ext::intel; -// --- Data initialization functions - -// Initialization data for trigonometric functions' input. -// H/w supports only limited range of sin/cos arguments with decent accuracy: -// absolute error <= 0.0008 for the range of +/- 32767*pi (+/- 102941). 
- -constexpr int accuracy_limit = 32767 * 3.14 - 1; - -template struct InitTrig { - void operator()(T *In, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In[I] = (I + 1) % accuracy_limit; - Out[I] = (T)0; - } - } -}; - -template struct InitWide { - void operator()(T *In, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In[I] = I + 1.0; - Out[I] = (T)0; - } - } -}; - -template struct InitNarrow { - void operator()(T *In, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In[I] = 2.0f + 16.0f * ((T)I / (T)(Size - 1)); // in [2..18] range - Out[I] = (T)0; - } - } -}; - -template struct InitInRange0_5 { - void operator()(T *In, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In[I] = 5.0f * ((T)I / (T)(Size - 1)); // in [0..5] range - Out[I] = (T)0; - } - } -}; - -template struct InitBin { - void operator()(T *In1, T *In2, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In1[I] = I % 17 + 1; - In2[I] = 4.0f * ((T)I / (T)(Size - 1)); // in [0..4] range - Out[I] = (T)0; - } - } -}; - -// --- Math operation identification - -enum class MathOp { - sin, - cos, - exp, - sqrt, - sqrt_ieee, - inv, - log, - rsqrt, - floor, - ceil, - trunc, - exp2, - log2, - div_ieee, - pow -}; - -// --- Template functions calculating given math operation on host and device - -enum ArgKind { - AllVec, - AllSca, - Sca1Vec2, - Sca2Vec1 -}; - -template struct ESIMDf; -template struct BinESIMDf; -template struct SYCLf; - -template struct HostFunc; - -#define DEFINE_HOST_OP(Op, HostOp) \ - template struct HostFunc { \ - T operator()(T X) { return HostOp; } \ - }; - -DEFINE_HOST_OP(sin, std::sin(X)); -DEFINE_HOST_OP(cos, std::cos(X)); -DEFINE_HOST_OP(exp, std::exp(X)); -DEFINE_HOST_OP(log, std::log(X)); -DEFINE_HOST_OP(inv, 1.0f / X); -DEFINE_HOST_OP(sqrt, std::sqrt(X)); -DEFINE_HOST_OP(sqrt_ieee, std::sqrt(X)); -DEFINE_HOST_OP(rsqrt, 1.0f / std::sqrt(X)); -DEFINE_HOST_OP(floor, std::floor(X)); 
-DEFINE_HOST_OP(ceil, std::ceil(X)); -DEFINE_HOST_OP(trunc, std::trunc(X)); -DEFINE_HOST_OP(exp2, std::exp2(X)); -DEFINE_HOST_OP(log2, std::log2(X)); - -#define DEFINE_HOST_BIN_OP(Op, HostOp) \ - template struct HostFunc { \ - T operator()(T X, T Y) { return HostOp; } \ - }; - -DEFINE_HOST_BIN_OP(div_ieee, X / Y); -DEFINE_HOST_BIN_OP(pow, std::pow(X, Y)); - -// --- Specializations per each extended math operation - -#define DEFINE_ESIMD_DEVICE_OP(Op) \ - template struct ESIMDf { \ - esimd::simd \ - operator()(esimd::simd X) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X); \ - } \ - }; \ - template struct ESIMDf { \ - esimd::simd operator()(T X) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X); \ - } \ - }; - -DEFINE_ESIMD_DEVICE_OP(sin); -DEFINE_ESIMD_DEVICE_OP(cos); -DEFINE_ESIMD_DEVICE_OP(exp); -DEFINE_ESIMD_DEVICE_OP(log); -DEFINE_ESIMD_DEVICE_OP(inv); -DEFINE_ESIMD_DEVICE_OP(sqrt); -DEFINE_ESIMD_DEVICE_OP(sqrt_ieee); -DEFINE_ESIMD_DEVICE_OP(rsqrt); -DEFINE_ESIMD_DEVICE_OP(floor); -DEFINE_ESIMD_DEVICE_OP(ceil); -DEFINE_ESIMD_DEVICE_OP(trunc); -DEFINE_ESIMD_DEVICE_OP(exp2); -DEFINE_ESIMD_DEVICE_OP(log2); - -#define DEFINE_ESIMD_DEVICE_BIN_OP(Op) \ - template struct BinESIMDf { \ - esimd::simd operator()(T X, T Y) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X, Y); \ - } \ - }; \ - template struct BinESIMDf { \ - esimd::simd \ - operator()(esimd::simd X, \ - esimd::simd Y) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X, Y); \ - } \ - }; \ - template struct BinESIMDf { \ - esimd::simd \ - operator()(T X, esimd::simd Y) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X, Y); \ - } \ - }; \ - template struct BinESIMDf { \ - esimd::simd operator()(esimd::simd X, \ - T Y) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X, Y); \ - } \ - }; - -DEFINE_ESIMD_DEVICE_BIN_OP(div_ieee); -DEFINE_ESIMD_DEVICE_BIN_OP(pow); - -#define DEFINE_SYCL_DEVICE_OP(Op) \ - template struct SYCLf { \ - esimd::simd \ - operator()(esimd::simd X) const SYCL_ESIMD_FUNCTION { \ - 
/* T must be float for SYCL, so not a template parameter for sycl::Op*/ \ - return sycl::Op(X); \ - } \ - }; \ - template struct SYCLf { \ - esimd::simd operator()(T X) const SYCL_ESIMD_FUNCTION { \ - return sycl::Op(X); \ - } \ - }; - -DEFINE_SYCL_DEVICE_OP(sin); -DEFINE_SYCL_DEVICE_OP(cos); -DEFINE_SYCL_DEVICE_OP(exp); -DEFINE_SYCL_DEVICE_OP(log); - -// --- Generic kernel calculating an extended math operation on array elements - -template class Kernel, typename AccIn, - typename AccOut> -struct UnaryDeviceFunc { - AccIn In; - AccOut Out; - - UnaryDeviceFunc(AccIn &In, AccOut &Out) : In(In), Out(Out) {} - - void operator()(id<1> I) const SYCL_ESIMD_KERNEL { - unsigned int Offset = I * N * sizeof(T); - esimd::simd Vx; - Vx.copy_from(In, Offset); - - if (I.get(0) % 2 == 0) { - for (int J = 0; J < N; J++) { - Kernel DevF{}; - T Val = Vx[J]; - esimd::simd V = DevF(Val); // scalar arg - Vx[J] = V[J]; - } - } else { - Kernel DevF{}; - Vx = DevF(Vx); // vector arg - } - Vx.copy_to(Out, Offset); - }; -}; - -template class Kernel, typename AccIn, - typename AccOut> -struct BinaryDeviceFunc { - AccIn In1; - AccIn In2; - AccOut Out; - - BinaryDeviceFunc(AccIn &In1, AccIn &In2, AccOut &Out) - : In1(In1), In2(In2), Out(Out) {} - - void operator()(id<1> I) const SYCL_ESIMD_KERNEL { - unsigned int Offset = I * N * sizeof(T); - esimd::simd V1(In1, Offset); - esimd::simd V2(In2, Offset); - esimd::simd V; - - if (I.get(0) % 2 == 0) { - int Ind = 0; - { - Kernel DevF{}; - T Val2 = V2[Ind]; - esimd::simd Vv = DevF(V1[Ind], Val2); // both arguments are scalar - V[Ind] = Vv[Ind]; - } - Ind++; - { - Kernel DevF{}; - T Val1 = V1[Ind]; - esimd::simd Vv = DevF(Val1, V2); // scalar, vector - V[Ind] = Vv[Ind]; - } - Ind++; - { - for (int J = Ind; J < N; ++J) { - Kernel DevF{}; - T Val2 = V2[J]; - esimd::simd Vv = DevF(V1, Val2); // scalar 2nd arg - V[J] = Vv[J]; - } - } - } else { - Kernel DevF{}; - V = DevF(V1, V2); // vec 2nd arg - } - V.copy_to(Out, Offset); - }; -}; - -// --- Generic 
test function for an extended math operation - -template class Kernel, - typename InitF = InitNarrow> -bool test(queue &Q, const std::string &Name, - InitF Init = InitNarrow{}, float delta = 0.0f) { - - constexpr size_t Size = 1024 * 128; - constexpr bool IsBinOp = (Op == MathOp::div_ieee) || (Op == MathOp::pow); - - T *A = new T[Size]; - T *B = new T[Size]; - T *C = new T[Size]; - if constexpr (IsBinOp) { - Init(A, B, C, Size); - } else { - Init(A, B, Size); - } - const char *kind = - std::is_same_v, ESIMDf> - ? "ESIMD" - : "SYCL"; - std::cout << " " << Name << " test, kind=" << kind << "...\n"; - - try { - buffer BufA(A, range<1>(Size)); - buffer BufB(B, range<1>(Size)); - buffer BufC(C, range<1>(Size)); - - // number of workgroups - sycl::range<1> GlobalRange{Size / N}; - - // threads (workitems) in each workgroup - sycl::range<1> LocalRange{1}; - - auto E = Q.submit([&](handler &CGH) { - auto PA = BufA.template get_access(CGH); - auto PC = BufC.template get_access(CGH); - if constexpr (IsBinOp) { - auto PB = BufB.template get_access(CGH); - BinaryDeviceFunc F( - PA, PB, PC); - CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); - } else { - UnaryDeviceFunc F(PA, - PC); - CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); - } - }); - E.wait(); - } catch (sycl::exception &Exc) { - std::cout << " *** ERROR. SYCL exception caught: << " << Exc.what() - << "\n"; - return false; - } - - int ErrCnt = 0; - - for (unsigned I = 0; I < Size; ++I) { - T Gold; - - if constexpr (IsBinOp) { - Gold = HostFunc{}((T)A[I], (T)B[I]); - } else { - Gold = HostFunc{}((T)A[I]); - } - T Test = C[I]; - - if (delta == 0.0f) { - delta = sizeof(T) > 2 ? 
0.0001 : 0.01; - } - - if (abs(Test - Gold) > delta) { - if (++ErrCnt < 10) { - std::cout << " failed at index " << I << ", " << Test - << " != " << Gold << " (gold)\n"; - } - } - } - delete[] A; - delete[] B; - delete[] C; - - if (ErrCnt > 0) { - std::cout << " pass rate: " - << ((float)(Size - ErrCnt) / (float)Size) * 100.0f << "% (" - << (Size - ErrCnt) << "/" << Size << ")\n"; - } - - std::cout << (ErrCnt > 0 ? " FAILED\n" : " Passed\n"); - return ErrCnt == 0; -} - -// --- Tests all extended math operations with given vector length - -template bool testESIMD(queue &Q) { - bool Pass = true; - - std::cout << "--- TESTING ESIMD functions, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - - Pass &= test(Q, "sqrt", InitWide{}); - Pass &= test(Q, "inv"); - Pass &= test(Q, "rsqrt"); - Pass &= test(Q, "sin", InitTrig{}); - Pass &= test(Q, "cos", InitTrig{}); - Pass &= test(Q, "exp", InitInRange0_5{}); - Pass &= test(Q, "log", InitWide{}); - Pass &= test(Q, "exp2", InitInRange0_5{}); - Pass &= test(Q, "log2", InitWide{}); - Pass &= test(Q, "floor", InitWide{}); - Pass &= test(Q, "ceil", InitWide{}); - Pass &= test(Q, "trunc", InitWide{}); - return Pass; -} - -template bool testESIMDSqrtIEEE(queue &Q) { - bool Pass = true; - std::cout << "--- TESTING ESIMD sqrt_ieee, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - Pass &= test(Q, "sqrt_ieee", InitWide{}); - return Pass; -} - -template bool testESIMDDivIEEE(queue &Q) { - bool Pass = true; - std::cout << "--- TESTING ESIMD div_ieee, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - Pass &= test(Q, "div_ieee", InitBin{}); - return Pass; -} - -template bool testESIMDPow(queue &Q) { - bool Pass = true; - std::cout << "--- TESTING ESIMD pow, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - Pass &= test( - Q, "pow", InitBin{}, 0.1); - return Pass; -} - -template bool testSYCL(queue &Q) { - bool Pass = true; - // TODO SYCL currently supports only these 4 functions, extend the test when - // more 
are available. - std::cout << "--- TESTING SYCL functions, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - // SYCL functions will have good accuracy for any argument, unlike bare h/w - // ESIMD versions, so init with "wide" data set. - Pass &= test(Q, "sin", InitWide{}); - Pass &= test(Q, "cos", InitWide{}); - Pass &= test(Q, "exp", InitInRange0_5{}); - Pass &= test(Q, "log", InitWide{}); - return Pass; -} - // --- The entry point int main(void) { @@ -471,10 +34,6 @@ int main(void) { Pass &= testESIMD(Q); Pass &= testSYCL(Q); Pass &= testSYCL(Q); - Pass &= testESIMDSqrtIEEE(Q); - Pass &= testESIMDSqrtIEEE(Q); - Pass &= testESIMDDivIEEE(Q); - Pass &= testESIMDDivIEEE(Q); Pass &= testESIMDPow(Q); Pass &= testESIMDPow(Q); std::cout << (Pass ? "Test Passed\n" : "Test FAILED\n"); diff --git a/SYCL/ESIMD/ext_math.hpp b/SYCL/ESIMD/ext_math.hpp new file mode 100644 index 0000000000..8e44f2317f --- /dev/null +++ b/SYCL/ESIMD/ext_math.hpp @@ -0,0 +1,441 @@ +#include "esimd_test_utils.hpp" + +#include +#include +#include + +#include +#include + +using namespace cl::sycl; +using namespace sycl::ext::intel; + +// --- Data initialization functions + +// Initialization data for trigonometric functions' input. +// H/w supports only limited range of sin/cos arguments with decent accuracy: +// absolute error <= 0.0008 for the range of +/- 32767*pi (+/- 102941). 
+ +constexpr int accuracy_limit = 32767 * 3.14 - 1; + +template struct InitTrig { + void operator()(T *In, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In[I] = (I + 1) % accuracy_limit; + Out[I] = (T)0; + } + } +}; + +template struct InitWide { + void operator()(T *In, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In[I] = I + 1.0; + Out[I] = (T)0; + } + } +}; + +template struct InitNarrow { + void operator()(T *In, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In[I] = 2.0f + 16.0f * ((T)I / (T)(Size - 1)); // in [2..18] range + Out[I] = (T)0; + } + } +}; + +template struct InitInRange0_5 { + void operator()(T *In, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In[I] = 5.0f * ((T)I / (T)(Size - 1)); // in [0..5] range + Out[I] = (T)0; + } + } +}; + +template struct InitBin { + void operator()(T *In1, T *In2, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In1[I] = I % 17 + 1; + In2[I] = 4.0f * ((T)I / (T)(Size - 1)); // in [0..4] range + Out[I] = (T)0; + } + } +}; + +// --- Math operation identification + +enum class MathOp { + sin, + cos, + exp, + sqrt, + sqrt_ieee, + inv, + log, + rsqrt, + floor, + ceil, + trunc, + exp2, + log2, + div_ieee, + pow +}; + +// --- Template functions calculating given math operation on host and device + +enum ArgKind { + AllVec, + AllSca, + Sca1Vec2, + Sca2Vec1 +}; + +template struct ESIMDf; +template struct BinESIMDf; +template struct SYCLf; + +template struct HostFunc; + +#define DEFINE_HOST_OP(Op, HostOp) \ + template struct HostFunc { \ + T operator()(T X) { return HostOp; } \ + }; + +DEFINE_HOST_OP(sin, std::sin(X)); +DEFINE_HOST_OP(cos, std::cos(X)); +DEFINE_HOST_OP(exp, std::exp(X)); +DEFINE_HOST_OP(log, std::log(X)); +DEFINE_HOST_OP(inv, 1.0f / X); +DEFINE_HOST_OP(sqrt, std::sqrt(X)); +DEFINE_HOST_OP(sqrt_ieee, std::sqrt(X)); +DEFINE_HOST_OP(rsqrt, 1.0f / std::sqrt(X)); +DEFINE_HOST_OP(floor, std::floor(X)); 
+DEFINE_HOST_OP(ceil, std::ceil(X)); +DEFINE_HOST_OP(trunc, std::trunc(X)); +DEFINE_HOST_OP(exp2, std::exp2(X)); +DEFINE_HOST_OP(log2, std::log2(X)); + +#define DEFINE_HOST_BIN_OP(Op, HostOp) \ + template struct HostFunc { \ + T operator()(T X, T Y) { return HostOp; } \ + }; + +DEFINE_HOST_BIN_OP(div_ieee, X / Y); +DEFINE_HOST_BIN_OP(pow, std::pow(X, Y)); + +// --- Specializations per each extended math operation + +#define DEFINE_ESIMD_DEVICE_OP(Op) \ + template struct ESIMDf { \ + esimd::simd \ + operator()(esimd::simd X) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X); \ + } \ + }; \ + template struct ESIMDf { \ + esimd::simd operator()(T X) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X); \ + } \ + }; + +DEFINE_ESIMD_DEVICE_OP(sin); +DEFINE_ESIMD_DEVICE_OP(cos); +DEFINE_ESIMD_DEVICE_OP(exp); +DEFINE_ESIMD_DEVICE_OP(log); +DEFINE_ESIMD_DEVICE_OP(inv); +DEFINE_ESIMD_DEVICE_OP(sqrt); +DEFINE_ESIMD_DEVICE_OP(sqrt_ieee); +DEFINE_ESIMD_DEVICE_OP(rsqrt); +DEFINE_ESIMD_DEVICE_OP(floor); +DEFINE_ESIMD_DEVICE_OP(ceil); +DEFINE_ESIMD_DEVICE_OP(trunc); +DEFINE_ESIMD_DEVICE_OP(exp2); +DEFINE_ESIMD_DEVICE_OP(log2); + +#define DEFINE_ESIMD_DEVICE_BIN_OP(Op) \ + template struct BinESIMDf { \ + esimd::simd operator()(T X, T Y) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X, Y); \ + } \ + }; \ + template struct BinESIMDf { \ + esimd::simd \ + operator()(esimd::simd X, \ + esimd::simd Y) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X, Y); \ + } \ + }; \ + template struct BinESIMDf { \ + esimd::simd \ + operator()(T X, esimd::simd Y) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X, Y); \ + } \ + }; \ + template struct BinESIMDf { \ + esimd::simd operator()(esimd::simd X, \ + T Y) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X, Y); \ + } \ + }; + +DEFINE_ESIMD_DEVICE_BIN_OP(div_ieee); +DEFINE_ESIMD_DEVICE_BIN_OP(pow); + +#define DEFINE_SYCL_DEVICE_OP(Op) \ + template struct SYCLf { \ + esimd::simd \ + operator()(esimd::simd X) const SYCL_ESIMD_FUNCTION { \ + 
/* T must be float for SYCL, so not a template parameter for sycl::Op*/ \ + return sycl::Op(X); \ + } \ + }; \ + template struct SYCLf { \ + esimd::simd operator()(T X) const SYCL_ESIMD_FUNCTION { \ + return sycl::Op(X); \ + } \ + }; + +DEFINE_SYCL_DEVICE_OP(sin); +DEFINE_SYCL_DEVICE_OP(cos); +DEFINE_SYCL_DEVICE_OP(exp); +DEFINE_SYCL_DEVICE_OP(log); + +// --- Generic kernel calculating an extended math operation on array elements + +template class Kernel, typename AccIn, + typename AccOut> +struct UnaryDeviceFunc { + AccIn In; + AccOut Out; + + UnaryDeviceFunc(AccIn &In, AccOut &Out) : In(In), Out(Out) {} + + void operator()(id<1> I) const SYCL_ESIMD_KERNEL { + unsigned int Offset = I * N * sizeof(T); + esimd::simd Vx; + Vx.copy_from(In, Offset); + + if (I.get(0) % 2 == 0) { + for (int J = 0; J < N; J++) { + Kernel DevF{}; + T Val = Vx[J]; + esimd::simd V = DevF(Val); // scalar arg + Vx[J] = V[J]; + } + } else { + Kernel DevF{}; + Vx = DevF(Vx); // vector arg + } + Vx.copy_to(Out, Offset); + }; +}; + +template class Kernel, typename AccIn, + typename AccOut> +struct BinaryDeviceFunc { + AccIn In1; + AccIn In2; + AccOut Out; + + BinaryDeviceFunc(AccIn &In1, AccIn &In2, AccOut &Out) + : In1(In1), In2(In2), Out(Out) {} + + void operator()(id<1> I) const SYCL_ESIMD_KERNEL { + unsigned int Offset = I * N * sizeof(T); + esimd::simd V1(In1, Offset); + esimd::simd V2(In2, Offset); + esimd::simd V; + + if (I.get(0) % 2 == 0) { + int Ind = 0; + { + Kernel DevF{}; + T Val2 = V2[Ind]; + esimd::simd Vv = DevF(V1[Ind], Val2); // both arguments are scalar + V[Ind] = Vv[Ind]; + } + Ind++; + { + Kernel DevF{}; + T Val1 = V1[Ind]; + esimd::simd Vv = DevF(Val1, V2); // scalar, vector + V[Ind] = Vv[Ind]; + } + Ind++; + { + for (int J = Ind; J < N; ++J) { + Kernel DevF{}; + T Val2 = V2[J]; + esimd::simd Vv = DevF(V1, Val2); // scalar 2nd arg + V[J] = Vv[J]; + } + } + } else { + Kernel DevF{}; + V = DevF(V1, V2); // vec 2nd arg + } + V.copy_to(Out, Offset); + }; +}; + +// --- Generic 
test function for an extended math operation + +template class Kernel, + typename InitF = InitNarrow> +bool test(queue &Q, const std::string &Name, + InitF Init = InitNarrow{}, float delta = 0.0f) { + + constexpr size_t Size = 1024 * 128; + constexpr bool IsBinOp = (Op == MathOp::div_ieee) || (Op == MathOp::pow); + + T *A = new T[Size]; + T *B = new T[Size]; + T *C = new T[Size]; + if constexpr (IsBinOp) { + Init(A, B, C, Size); + } else { + Init(A, B, Size); + } + const char *kind = + std::is_same_v, ESIMDf> + ? "ESIMD" + : "SYCL"; + std::cout << " " << Name << " test, kind=" << kind << "...\n"; + + try { + buffer BufA(A, range<1>(Size)); + buffer BufB(B, range<1>(Size)); + buffer BufC(C, range<1>(Size)); + + // number of workgroups + cl::sycl::range<1> GlobalRange{Size / N}; + + // threads (workitems) in each workgroup + cl::sycl::range<1> LocalRange{1}; + + auto E = Q.submit([&](handler &CGH) { + auto PA = BufA.template get_access(CGH); + auto PC = BufC.template get_access(CGH); + if constexpr (IsBinOp) { + auto PB = BufB.template get_access(CGH); + BinaryDeviceFunc F( + PA, PB, PC); + CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); + } else { + UnaryDeviceFunc F(PA, + PC); + CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); + } + }); + E.wait(); + } catch (sycl::exception &Exc) { + std::cout << " *** ERROR. SYCL exception caught: << " << Exc.what() + << "\n"; + return false; + } + + int ErrCnt = 0; + + for (unsigned I = 0; I < Size; ++I) { + T Gold; + + if constexpr (IsBinOp) { + Gold = HostFunc{}((T)A[I], (T)B[I]); + } else { + Gold = HostFunc{}((T)A[I]); + } + T Test = C[I]; + + if (delta == 0.0f) { + delta = sizeof(T) > 2 ? 
0.0001 : 0.01; + } + + if (abs(Test - Gold) > delta) { + if (++ErrCnt < 10) { + std::cout << " failed at index " << I << ", " << Test + << " != " << Gold << " (gold)\n"; + } + } + } + delete[] A; + delete[] B; + delete[] C; + + if (ErrCnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - ErrCnt) / (float)Size) * 100.0f << "% (" + << (Size - ErrCnt) << "/" << Size << ")\n"; + } + + std::cout << (ErrCnt > 0 ? " FAILED\n" : " Passed\n"); + return ErrCnt == 0; +} + +// --- Tests all extended math operations with given vector length + +template bool testESIMD(queue &Q) { + bool Pass = true; + + std::cout << "--- TESTING ESIMD functions, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + + Pass &= test(Q, "sqrt", InitWide{}); + Pass &= test(Q, "inv"); + Pass &= test(Q, "rsqrt"); + Pass &= test(Q, "sin", InitTrig{}); + Pass &= test(Q, "cos", InitTrig{}); + Pass &= test(Q, "exp", InitInRange0_5{}); + Pass &= test(Q, "log", InitWide{}); + Pass &= test(Q, "exp2", InitInRange0_5{}); + Pass &= test(Q, "log2", InitWide{}); + Pass &= test(Q, "floor", InitWide{}); + Pass &= test(Q, "ceil", InitWide{}); + Pass &= test(Q, "trunc", InitWide{}); + return Pass; +} + +template bool testESIMDSqrtIEEE(queue &Q) { + bool Pass = true; + std::cout << "--- TESTING ESIMD sqrt_ieee, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + Pass &= test(Q, "sqrt_ieee", InitWide{}); + return Pass; +} + +template bool testESIMDDivIEEE(queue &Q) { + bool Pass = true; + std::cout << "--- TESTING ESIMD div_ieee, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + Pass &= test(Q, "div_ieee", InitBin{}); + return Pass; +} + +template bool testESIMDPow(queue &Q) { + bool Pass = true; + std::cout << "--- TESTING ESIMD pow, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + Pass &= test( + Q, "pow", InitBin{}, 0.1); + return Pass; +} + +template bool testSYCL(queue &Q) { + bool Pass = true; + // TODO SYCL currently supports only these 4 functions, extend the test when + // more 
are available. + std::cout << "--- TESTING SYCL functions, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + // SYCL functions will have good accuracy for any argument, unlike bare h/w + // ESIMD versions, so init with "wide" data set. + Pass &= test(Q, "sin", InitWide{}); + Pass &= test(Q, "cos", InitWide{}); + Pass &= test(Q, "exp", InitInRange0_5{}); + Pass &= test(Q, "log", InitWide{}); + return Pass; +} diff --git a/SYCL/ESIMD/ext_math_aspect-fp64.cpp b/SYCL/ESIMD/ext_math_aspect-fp64.cpp new file mode 100644 index 0000000000..377e648e17 --- /dev/null +++ b/SYCL/ESIMD/ext_math_aspect-fp64.cpp @@ -0,0 +1,45 @@ +//==----- ext_math_aspect-fp64.cpp - DPC++ ESIMD extended math test -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-fp64 +// UNSUPPORTED: cuda || hip +// TODO: esimd_emulator fails due to unimplemented 'half' type +// XFAIL: esimd_emulator +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// This test checks extended math operations. Combinations of +// - argument type - half, float +// - math function - sin, cos, ..., div_ieee, pow +// - SYCL vs ESIMD APIs + +#include "ext_math.hpp" + +using namespace cl::sycl; +using namespace sycl::ext::intel; + +// --- The entry point + +int main(void) { + queue Q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + if (!Q.get_device().has(sycl::aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + auto Dev = Q.get_device(); + std::cout << "Running on " << Dev.get_info() << "\n"; + bool Pass = true; + + // Not support IEEE-conformant sqrt operations for single precision data. 
+ Pass &= testESIMDSqrtIEEE(Q); + Pass &= testESIMDDivIEEE(Q); + + Pass &= testESIMDSqrtIEEE(Q); + Pass &= testESIMDDivIEEE(Q); + std::cout << (Pass ? "Test Passed\n" : "Test FAILED\n"); + return Pass ? 0 : 1; +} diff --git a/SYCL/ESIMD/regression/Inputs/dgetrf_aspect-fp64.hpp b/SYCL/ESIMD/regression/Inputs/dgetrf_aspect-fp64.hpp new file mode 100644 index 0000000000..f2586f471b --- /dev/null +++ b/SYCL/ESIMD/regression/Inputs/dgetrf_aspect-fp64.hpp @@ -0,0 +1,487 @@ +//==-------------- dgetrf.hpp - DPC++ ESIMD on-device test ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This test checks the correctness of ESIMD program for batched LU +// decomposition without pivoting. The program contains multiple branches +// corresponding to LU input sizes; all internal functions are inlined. +// +#include +#include +#include +#include +#include +#include + +#define ABS(x) ((x) >= 0 ? (x) : -(x)) +#define MIN(x, y) ((x) <= (y) ? (x) : (y)) +#define MAX(x, y) ((x) >= (y) ? (x) : (y)) +#define FP_RAND ((double)rand() / (double)RAND_MAX) + +#define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) +#define OUT(text, ...) OUTN(text "\n", ##__VA_ARGS__) + +#define CHECK(cmd, status) \ + do { \ + cmd; \ + if (status) { \ + OUT(#cmd " status: %d", status); \ + exit(1); \ + } \ + } while (0) +#define FAILED(res, thresh) ((res) > (thresh) || (res) != (res)) +#define CHECK_AND_REPORT(test_desc, test_id, fail_cond, res, fail_cnt) \ + do { \ + if (fail_cond) \ + fail_cnt++; \ + OUT("Test (%s): " test_desc ". Result: %f. %s", test_id, res, \ + (fail_cond) ? 
"FAILED" : "PASSED"); \ + } while (0) + +using namespace cl::sycl; +using namespace std; +using namespace sycl::ext::intel::esimd; + +ESIMD_PRIVATE ESIMD_REGISTER(256) simd GRF; + +#define V(x, w, i) (x).template select(i) +#define V1(x, i) V(x, 1, i) +#define V8(x, i) V(x, 8, i) +#define BCAST8(x, i) (x).template replicate_w<8, 1>(i) + +template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { + auto a = V(GRF, M * N, 0); + + if (K % 8) { + simd_mask<8> mask = 1; + for (int k = 0; k < K % 8; k++) + V1(mask, k) = 0; + + for (int k = K % 8; k < 8 && k < K + N; k++) { + auto ak = V(a, M, (k - K % 8) * M); + auto ak0 = V8(ak, K & (-8)); + + V1(mask, k) = 0; + if (ak0[k] != 0.0) { + // scal + double temp = 1.0 / ak0[k]; + ak0.merge(ak0 * temp, mask); + for (int i = 8 + K & (-8); i < M; i += 8) { + V8(ak, i) *= temp; + } + + // update + for (int j = k - K % 8 + 1; j < N; j++) { + auto aj = V(a, M, j * M); + auto aj0 = V8(aj, K & (-8)); + auto temp = BCAST8(aj0, k); + aj0.merge(aj0 - temp * ak0, aj0, mask); + for (int i = 8 + K & (-8); i < M; i += 8) { + V8(aj, i) -= temp * V8(ak, i); + } + } + } else if (*info == 0) { + *info = K + k - K % 8 + 1; + } + } + for (int kk = 0; kk < N + K % 8 - 8; kk += 8) { + mask = 1; + for (int k = 0; k < 8 && kk + k < N + K % 8 - 8; k++) { + auto ak = V(a, M, (kk + k + 8 - K % 8) * M); + auto ak0 = V8(ak, kk + (K & (-8)) + 8); + + V1(mask, k) = 0; + if (ak0[k] != 0.0) { + // scal + double temp = 1.0 / ak0[k]; + ak0.merge(ak0 * temp, mask); + for (int i = 16 + (K & (-8)) + kk; i < M; i += 8) { + V8(ak, i) *= temp; + } + + // update + for (int j = kk + k + 8 - K % 8 + 1; j < N; j++) { + auto aj = V(a, M, j * M); + auto aj0 = V8(aj, kk + (K & (-8)) + 8); + auto temp = BCAST8(aj0, k); + aj0.merge(aj0 - temp * ak0, aj0, mask); + for (int i = 16 + (K & (-8)) + kk; i < M; i += 8) { + V8(aj, i) -= temp * V8(ak, i); + } + } + } else if (*info == 0) { + *info = K + kk + k + 8 - K % 8 + 1; + } + } + } + } else { + for (int kk = 0; kk < N; kk += 8) 
{ + simd_mask<8> mask = 1; + for (int k = 0; k < 8 && kk + k < N; k++) { + auto ak = V(a, M, (kk + k) * M); + auto ak0 = V8(ak, kk + K); + + V1(mask, k) = 0; + if (ak0[k] != 0.0) { + // scal + double temp = 1.0 / ak0[k]; + ak0.merge(ak0 * temp, mask); + for (int i = 8 + K + kk; i < M; i += 8) { + V8(ak, i) *= temp; + } + + // update + for (int j = kk + k + 1; j < N; j++) { + auto aj = V(a, M, j * M); + auto aj0 = V8(aj, kk + K); + auto temp = BCAST8(aj0, k); + aj0.merge(aj0 - temp * ak0, aj0, mask); + for (int i = 8 + K + kk; i < M; i += 8) { + V8(aj, i) -= temp * V8(ak, i); + } + } + } else if (*info == 0) { + *info = K + kk + k + 1; + } + } + } + } +} + +#ifndef USE_REF +// A left-looking algorithm step +// M, N - a panel size to be updated and factorized (M * N <= 64 * 6), must fit +// into GRF K - an update rank P0=A[0:M,0:K] = column(F=A[0:K,0:K], +// L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], +// T=A[K:M,K:K+N]) - panel to be updated +template +ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { + auto p1 = V(GRF, M * N, 0); + double *a1; + int i, j, k; + + // load P1 + for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) + for (i = 0; i < M; i += 8) { + simd data; + data.copy_from(a1 + i); + V8(p1, j * M + i) = data; + } + + if (K > 0) { + // (trsm) solve F*X=U for X, X overwrites U + // (gemm) update T=T-L*U + for (int kk = 0; kk < K; kk += 8) { + simd_mask<8> mask = 1; + simd a0k, aik; + for (k = 0; k < 8 && kk + k < K; k++) { + V1(mask, k) = 0; + simd data; + data.copy_from(a + kk + (kk + k) * lda); + V8(a0k, 0) = data; + for (j = 0; j < N; j++) { + auto aj = V(p1, M, j * M); + auto aj0 = V8(aj, kk); + auto temp = BCAST8(aj0, k); + aj0.merge(aj0 - temp * a0k, aj0, mask); + } + } + for (k = 0; k < 8 && kk + k < K; k++) { + for (i = kk + 8; i < M; i += 8) { + simd data; + data.copy_from(a + i + (kk + k) * lda); + V8(aik, 0) = data; + for (j = 0; j < N; j++) { + auto aj = V(p1, M, j * M); + auto 
aj0 = V8(aj, kk); + auto temp = BCAST8(aj0, k); + V8(aj, i) -= temp * aik; + } + } + } + } + } + // (getrf) factorize T=P*L*U + dgetrfnp_panel(info); + + // store P1 + for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) + for (i = 0; i < M; i += 8) { + simd vals = V8(p1, j * M + i); + vals.copy_to(a1 + i); + } +} +#endif // !USE_REF + +ESIMD_INLINE void dgetrfnp_esimd(int64_t m, int64_t n, double *a, int64_t lda, + int64_t *ipiv, int64_t *info) { + *info = 0; +#if defined(USE_REF) + int i, j, k; + for (k = 0; k < MIN(m, n); k++) { + double temp = a[k + k * lda]; + if (!(*info) && temp == 0.0) + *info = k + 1; + // scal + temp = 1.0 / temp; + for (i = k + 1; i < m; i++) { + a[i + k * lda] *= temp; + } + // update + for (j = k + 1; j < n; j++) { + temp = a[k + j * lda]; + for (i = k + 1; i < m; i++) { + a[i + j * lda] -= temp * a[i + k * lda]; + } + } + } +#else // defined(USE_REF) + if (m == 8) { + if (n == 8) + dgetrfnp_left_step<8, 8, 0>(a, lda, info); + } else if (m == 16) { + if (n == 8) + dgetrfnp_left_step<16, 8, 0>(a, lda, info); + else if (n == 16) + dgetrfnp_left_step<16, 16, 0>(a, lda, info); + } else if (m == 32) { + if (n == 8) + dgetrfnp_left_step<32, 8, 0>(a, lda, info); + else if (n == 12) + dgetrfnp_left_step<32, 12, 0>(a, lda, info); + else if (n == 16) { + dgetrfnp_left_step<32, 8, 0>(a, lda, info); + dgetrfnp_left_step<32, 8, 8>(a, lda, info); + } else if (n == 24) { + dgetrfnp_left_step<32, 8, 0>(a, lda, info); + dgetrfnp_left_step<32, 8, 8>(a, lda, info); + dgetrfnp_left_step<32, 8, 16>(a, lda, info); + } else if (n == 32) { + dgetrfnp_left_step<32, 8, 0>(a, lda, info); + dgetrfnp_left_step<32, 8, 8>(a, lda, info); + dgetrfnp_left_step<32, 8, 16>(a, lda, info); + dgetrfnp_left_step<32, 8, 24>(a, lda, info); + } + } else if (m == 64) { + if (n == 6) + dgetrfnp_left_step<64, 6, 0>(a, lda, info); + else if (n == 16) { + dgetrfnp_left_step<64, 6, 0>(a, lda, info); + dgetrfnp_left_step<64, 6, 6>(a, lda, info); + dgetrfnp_left_step<64, 4, 12>(a, lda, 
info); + } else if (n == 32) { + dgetrfnp_left_step<64, 6, 0>(a, lda, info); + dgetrfnp_left_step<64, 6, 6>(a, lda, info); + dgetrfnp_left_step<64, 6, 12>(a, lda, info); + dgetrfnp_left_step<64, 6, 18>(a, lda, info); + dgetrfnp_left_step<64, 6, 24>(a, lda, info); + dgetrfnp_left_step<64, 2, 30>(a, lda, info); + } else if (n == 64) { + dgetrfnp_left_step<64, 6, 0>(a, lda, info); + dgetrfnp_left_step<64, 6, 6>(a, lda, info); + dgetrfnp_left_step<64, 6, 12>(a, lda, info); + dgetrfnp_left_step<64, 6, 18>(a, lda, info); + dgetrfnp_left_step<64, 6, 24>(a, lda, info); + dgetrfnp_left_step<64, 6, 30>(a, lda, info); + dgetrfnp_left_step<64, 6, 36>(a, lda, info); + dgetrfnp_left_step<64, 6, 42>(a, lda, info); + dgetrfnp_left_step<64, 6, 48>(a, lda, info); + dgetrfnp_left_step<64, 6, 54>(a, lda, info); + dgetrfnp_left_step<64, 4, 60>(a, lda, info); + } + } +#endif // defined(USE_REF) +} + +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info) { + queue queue((gpu_selector())); + auto device = queue.get_device(); + auto context = queue.get_context(); + int status; + + CHECK(status = device.is_gpu(), !status); + + double *a_gpu; + int64_t *ipiv_gpu; + int64_t *info_gpu; + CHECK(a_gpu = static_cast( + malloc_shared(stride_a * batch * sizeof(double), device, context)), + !a_gpu); + CHECK(ipiv_gpu = static_cast(malloc_shared( + stride_ipiv * batch * sizeof(int64_t), device, context)), + !ipiv_gpu); + CHECK(info_gpu = static_cast( + malloc_shared(batch * sizeof(int64_t), device, context)), + !info_gpu); + + memcpy(a_gpu, a, stride_a * batch * sizeof(double)); + + sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, + sycl::range<1>{1}); + try { + auto event = queue.submit([&](handler &cgh) { + cgh.parallel_for( + range, [=](nd_item<1> id) SYCL_ESIMD_KERNEL { + int i = id.get_global_id(0); + dgetrfnp_esimd(m, n, &a_gpu[i * stride_a], lda, + &ipiv_gpu[i * 
stride_ipiv], &info_gpu[i]); + }); + }); + event.wait(); + } catch (const sycl::exception &e) { + std::cout << "*** EXCEPTION caught: " << e.what() << "\n"; + free(a_gpu, context); + free(ipiv_gpu, context); + free(info_gpu, context); + return; + } + + memcpy(a, a_gpu, stride_a * batch * sizeof(double)); + memcpy(ipiv, ipiv_gpu, stride_ipiv * batch * sizeof(int64_t)); + memcpy(info, info_gpu, batch * sizeof(int64_t)); + + free(a_gpu, context); + free(ipiv_gpu, context); + free(info_gpu, context); +} + +static void fp_init(int64_t m, int64_t n, double *a, int64_t lda) { + int64_t i, j; + for (j = 0; j < n; j++) + for (i = 0; i < m; i++) + a[i + j * lda] = 2.0 * FP_RAND - 1.0; +} + +static void fp_copy(int64_t m, int64_t n, double *a, int64_t lda, double *b, + int64_t ldb) { + int64_t i, j; + for (j = 0; j < n; j++) + for (i = 0; i < m; i++) + b[i + j * ldb] = a[i + j * lda]; +} + +static double fp_norm1(int64_t m, int64_t n, double *a, int64_t lda) { + double sum, value = 0.0; + int64_t i, j; + for (j = 0; j < n; j++) { + sum = 0.0; + for (i = 0; i < m; i++) + sum += ABS(a[i + j * lda]); + if (value < sum) + value = sum; + } + return value; +} + +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, + double *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info) { + double thresh = 30.0; + int fail = 0; + int64_t i, j, k, l; + char label[1024]; + unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; + double res = 0.0, nrm = 0.0, ulp = *(double *)prec_b; + double *w = (double *)malloc(sizeof(double) * MAX(m * n, 1)); + + sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); + + for (k = 0; k < batch; k++) { + /* info == 0 */ + CHECK_AND_REPORT("info == 0", label, info[k] != 0, (double)info[k], fail); + + if (m > 0 && n > 0) { + /* | L U - A | / ( |A| n ulp ) */ + memset(w, 0, sizeof(double) * m * n); + if (m < n) { + for (j = 0; j < n; j++) + for (i = 0; i <= j; i++) + w[i + 
j * m] = a[i + j * lda + k * stride_a]; + for (i = m - 1; i >= 0; i--) + for (j = 0; j < n; j++) + for (l = 0; l < i; l++) + w[i + j * m] += a[i + l * lda + k * stride_a] * w[l + j * m]; + } else { + for (j = 0; j < n; j++) + for (i = j; i < m; i++) + w[i + j * m] = a[i + j * lda + k * stride_a]; + for (j = 0; j < n; j++) + w[j + j * m] = 1.0; + for (j = n - 1; j >= 0; j--) + for (i = 0; i < m; i++) { + w[i + j * m] *= a[j + j * lda + k * stride_a]; + for (l = 0; l < j; l++) + w[i + j * m] += w[i + l * m] * a[l + j * lda + k * stride_a]; + } + } + for (j = 0; j < n; j++) + for (i = 0; i < m; i++) + w[i + j * m] -= a_in[k * stride_a + i + j * lda]; + res = fp_norm1(m, n, w, m); + nrm = fp_norm1(m, n, &a_in[k * stride_a], lda); + nrm *= (double)n * ulp; + res /= nrm > 0.0 ? nrm : ulp; + CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, + FAILED(res, thresh), res, fail); + } + } + + free(w); + return fail; +} + +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info); + +int main(int argc, char *argv[]) { + int exit_status = 0; + int64_t m = 64, n = 64, lda = 64; + int64_t stride_a = lda * n, stride_ipiv = n; + + srand(1); + + for (int i = 1; i < argc; i++) { + int64_t batch = (int64_t)atoi(argv[i]); + batch = MAX(batch, 0); + int64_t a_count = MAX(stride_a * batch, 1); + int64_t ipiv_count = MAX(stride_ipiv * batch, 1); + int64_t info_count = MAX(batch, 1); + double *a = NULL, *a_copy = NULL; + int64_t *ipiv = NULL, *info = NULL; + CHECK(a = (double *)malloc(sizeof(double) * a_count), !a); + CHECK(a_copy = (double *)malloc(sizeof(double) * a_count), !a_copy); + CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) * ipiv_count), !ipiv); + CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); + + /* Initialize input data */ + for (int64_t k = 0; k < batch; k++) { + fp_init(m, n, &a_copy[k * stride_a], lda); + fp_copy(m, n, &a_copy[k * stride_a], 
lda, &a[k * stride_a], lda); + } + + /* Run the tested function */ + dgetrfnp_batch_strided_c(m, n, a, lda, stride_a, ipiv, stride_ipiv, batch, + info); + + /* Check that the computation completed successfully */ + exit_status += dgetrfnp_batch_strided_check(m, n, a_copy, a, lda, stride_a, + ipiv, stride_ipiv, batch, info); + + free(a); + free(a_copy); + free(ipiv); + free(info); + } + return exit_status; +} diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index 521be63073..5ef53b70ac 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -23,7 +23,7 @@ #define ABS(x) ((x) >= 0 ? (x) : -(x)) #define MIN(x, y) ((x) <= (y) ? (x) : (y)) #define MAX(x, y) ((x) >= (y) ? (x) : (y)) -#define FP_RAND ((double)rand() / (double)RAND_MAX) +#define FP_RAND ((float)rand() / (float)RAND_MAX) #define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) #define OUT(text, ...) OUTN(text "\n", ##__VA_ARGS__) @@ -49,7 +49,7 @@ using namespace sycl; using namespace std; using namespace sycl::ext::intel::esimd; -ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; +ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; #define V(x, w, i) (x).template select(i) #define V1(x, i) V(x, 1, i) @@ -67,7 +67,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { V1(mask, k) = 0; if (ak0[k] != 0.0) { // scal - double temp = 1.0 / ak0[k]; + float temp = 1.0 / ak0[k]; ak0.merge(ak0 * temp, mask); for (int i = 8 + K + kk; i < M; i += 8) { V8(ak, i) *= temp; @@ -96,15 +96,15 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { // L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], // T=A[K:M,K:K+N]) - panel to be updated template -ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { +ESIMD_INLINE void dgetrfnp_left_step(float *a, int64_t lda, int64_t *info) { auto p1 = V(GRF, M * N, 0); - double *a1; + float *a1; int i, j, k; // load P1 for (j = 0, a1 = a + K * lda; j < 
N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd data; + simd data; data.copy_from(a1 + i); V8(p1, j * M + i) = data; } @@ -114,18 +114,18 @@ ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { // store P1 for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd vals = V8(p1, j * M + i); + simd vals = V8(p1, j * M + i); vals.copy_to(a1 + i); } } -ESIMD_INLINE void dgetrfnp_esimd_8x8(double *a, int64_t lda, int64_t *ipiv, +ESIMD_INLINE void dgetrfnp_esimd_8x8(float *a, int64_t lda, int64_t *ipiv, int64_t *info) { *info = 0; dgetrfnp_left_step<8, 8, 0>(a, lda, info); } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, float *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info) { @@ -136,11 +136,11 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, CHECK(status = device.is_gpu(), !status); - double *a_gpu; + float *a_gpu; int64_t *ipiv_gpu; int64_t *info_gpu; - CHECK(a_gpu = static_cast( - malloc_shared(stride_a * batch * sizeof(double), device, context)), + CHECK(a_gpu = static_cast( + malloc_shared(stride_a * batch * sizeof(float), device, context)), !a_gpu); CHECK(ipiv_gpu = static_cast(malloc_shared( stride_ipiv * batch * sizeof(int64_t), device, context)), @@ -149,7 +149,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, malloc_shared(batch * sizeof(int64_t), device, context)), !info_gpu); - memcpy(a_gpu, a, stride_a * batch * sizeof(double)); + memcpy(a_gpu, a, stride_a * batch * sizeof(float)); sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, sycl::range<1>{1}); @@ -171,7 +171,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, return; } - memcpy(a, a_gpu, stride_a * batch * sizeof(double)); + memcpy(a, a_gpu, stride_a * batch * sizeof(float)); memcpy(ipiv, ipiv_gpu, 
stride_ipiv * batch * sizeof(int64_t)); memcpy(info, info_gpu, batch * sizeof(int64_t)); @@ -180,14 +180,14 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, free(info_gpu, context); } -static void fp_init(int64_t m, int64_t n, double *a, int64_t lda) { +static void fp_init(int64_t m, int64_t n, float *a, int64_t lda) { int64_t i, j; for (j = 0; j < n; j++) for (i = 0; i < m; i++) a[i + j * lda] = 2.0 * FP_RAND - 1.0; } -static void fp_copy(int64_t m, int64_t n, double *a, int64_t lda, double *b, +static void fp_copy(int64_t m, int64_t n, float *a, int64_t lda, float *b, int64_t ldb) { int64_t i, j; for (j = 0; j < n; j++) @@ -195,8 +195,8 @@ static void fp_copy(int64_t m, int64_t n, double *a, int64_t lda, double *b, b[i + j * ldb] = a[i + j * lda]; } -static double fp_norm1(int64_t m, int64_t n, double *a, int64_t lda) { - double sum, value = 0.0; +static float fp_norm1(int64_t m, int64_t n, float *a, int64_t lda) { + float sum, value = 0.0; int64_t i, j; for (j = 0; j < n; j++) { sum = 0.0; @@ -208,28 +208,28 @@ static double fp_norm1(int64_t m, int64_t n, double *a, int64_t lda) { return value; } -static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, - double *a, int64_t lda, +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, float *a_in, + float *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info) { - double thresh = 30.0; + float thresh = 30.0; int fail = 0; int64_t i, j, k, l; char label[1024]; - unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; - double res = 0.0, nrm = 0.0, ulp = *(double *)prec_b; - double *w = (double *)malloc(sizeof(double) * MAX(m * n, 1)); + unsigned char prec_b[] = {0, 0, 0xb0, 0x3c}; + float res = 0.0, nrm = 0.0, ulp = *(float *)prec_b; + float *w = (float *)malloc(sizeof(float) * MAX(m * n, 1)); sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); for (k = 0; k < batch; k++) { /* info == 0 */ - 
CHECK_AND_REPORT("info == 0", label, info[k] != 0, (double)info[k], fail); + CHECK_AND_REPORT("info == 0", label, info[k] != 0, (float)info[k], fail); if (m > 0 && n > 0) { /* | L U - A | / ( |A| n ulp ) */ - memset(w, 0, sizeof(double) * m * n); + memset(w, 0, sizeof(float) * m * n); if (m < n) { for (j = 0; j < n; j++) for (i = 0; i <= j; i++) @@ -256,7 +256,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, w[i + j * m] -= a_in[k * stride_a + i + j * lda]; res = fp_norm1(m, n, w, m); nrm = fp_norm1(m, n, &a_in[k * stride_a], lda); - nrm *= (double)n * ulp; + nrm *= (float)n * ulp; res /= nrm > 0.0 ? nrm : ulp; CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, FAILED(res, thresh), res, fail); @@ -267,7 +267,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, return fail; } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, float *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info); @@ -285,10 +285,10 @@ int main(int argc, char *argv[]) { int64_t a_count = MAX(stride_a * batch, 1); int64_t ipiv_count = MAX(stride_ipiv * batch, 1); int64_t info_count = MAX(batch, 1); - double *a = NULL, *a_copy = NULL; + float *a = NULL, *a_copy = NULL; int64_t *ipiv = NULL, *info = NULL; - CHECK(a = (double *)malloc(sizeof(double) * a_count), !a); - CHECK(a_copy = (double *)malloc(sizeof(double) * a_count), !a_copy); + CHECK(a = (float *)malloc(sizeof(float) * a_count), !a); + CHECK(a_copy = (float *)malloc(sizeof(float) * a_count), !a_copy); CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) * ipiv_count), !ipiv); CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); diff --git a/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp new file mode 100644 index 0000000000..4b695a4877 --- /dev/null +++ 
b/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp @@ -0,0 +1,314 @@ +//==------- dgetrf_8x8_aspect-fp64.cpp - DPC++ ESIMD on-device test -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu, aspect-fp64 +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -I%S/.. -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out 1 +// +// Reduced version of dgetrf.cpp - M = 8, N = 8, single batch. +// +#include +#include +#include +#include +#include +#include + +#define ABS(x) ((x) >= 0 ? (x) : -(x)) +#define MIN(x, y) ((x) <= (y) ? (x) : (y)) +#define MAX(x, y) ((x) >= (y) ? (x) : (y)) +#define FP_RAND ((double)rand() / (double)RAND_MAX) + +#define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) +#define OUT(text, ...) OUTN(text "\n", ##__VA_ARGS__) + +#define CHECK(cmd, status) \ + do { \ + cmd; \ + if (status) { \ + OUT(#cmd " status: %d", status); \ + exit(1); \ + } \ + } while (0) +#define FAILED(res, thresh) ((res) > (thresh) || (res) != (res)) +#define CHECK_AND_REPORT(test_desc, test_id, fail_cond, res, fail_cnt) \ + do { \ + if (fail_cond) \ + fail_cnt++; \ + OUT("Test (%s): " test_desc ". Result: %f. %s", test_id, res, \ + (fail_cond) ? 
"FAILED" : "PASSED"); \ + } while (0) + +using namespace cl::sycl; +using namespace std; +using namespace sycl::ext::intel::esimd; + +ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; + +#define V(x, w, i) (x).template select(i) +#define V1(x, i) V(x, 1, i) +#define V8(x, i) V(x, 8, i) +#define BCAST8(x, i) (x).template replicate_w<8, 1>(i) + +template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { + auto a = V(GRF, M * N, 0); + for (int kk = 0; kk < N; kk += 8) { + simd_mask<8> mask = 1; + for (int k = 0; k < 8 && kk + k < N; k++) { + auto ak = V(a, M, (kk + k) * M); + auto ak0 = V8(ak, kk + K); + + V1(mask, k) = 0; + if (ak0[k] != 0.0) { + // scal + double temp = 1.0 / ak0[k]; + ak0.merge(ak0 * temp, mask); + for (int i = 8 + K + kk; i < M; i += 8) { + V8(ak, i) *= temp; + } + + // update + for (int j = kk + k + 1; j < N; j++) { + auto aj = V(a, M, j * M); + auto aj0 = V8(aj, kk + K); + auto temp = BCAST8(aj0, k); + aj0.merge(aj0 - temp * ak0, aj0, mask); + for (int i = 8 + K + kk; i < M; i += 8) { + V8(aj, i) -= temp * V8(ak, i); + } + } + } else if (*info == 0) { + *info = K + kk + k + 1; + } + } + } +} + +// A left-looking algorithm step +// M, N - a panel size to be updated and factorized (M * N <= 64 * 6), must fit +// into GRF K - an update rank P0=A[0:M,0:K] = column(F=A[0:K,0:K], +// L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], +// T=A[K:M,K:K+N]) - panel to be updated +template +ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { + auto p1 = V(GRF, M * N, 0); + double *a1; + int i, j, k; + + // load P1 + for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) + for (i = 0; i < M; i += 8) { + simd data; + data.copy_from(a1 + i); + V8(p1, j * M + i) = data; + } + // (getrf) factorize T=P*L*U + dgetrfnp_panel(info); + + // store P1 + for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) + for (i = 0; i < M; i += 8) { + simd vals = V8(p1, j * M + i); + vals.copy_to(a1 + i); + } +} + +ESIMD_INLINE void 
dgetrfnp_esimd_8x8(double *a, int64_t lda, int64_t *ipiv, + int64_t *info) { + *info = 0; + dgetrfnp_left_step<8, 8, 0>(a, lda, info); +} + +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info) { + queue queue((gpu_selector())); + auto device = queue.get_device(); + auto context = queue.get_context(); + int status; + + CHECK(status = device.is_gpu(), !status); + + double *a_gpu; + int64_t *ipiv_gpu; + int64_t *info_gpu; + CHECK(a_gpu = static_cast( + malloc_shared(stride_a * batch * sizeof(double), device, context)), + !a_gpu); + CHECK(ipiv_gpu = static_cast(malloc_shared( + stride_ipiv * batch * sizeof(int64_t), device, context)), + !ipiv_gpu); + CHECK(info_gpu = static_cast( + malloc_shared(batch * sizeof(int64_t), device, context)), + !info_gpu); + + memcpy(a_gpu, a, stride_a * batch * sizeof(double)); + + sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, + sycl::range<1>{1}); + try { + auto event = queue.submit([&](handler &cgh) { + cgh.parallel_for( + range, [=](nd_item<1> id) SYCL_ESIMD_KERNEL { + int i = id.get_global_id(0); + dgetrfnp_esimd_8x8(&a_gpu[i * stride_a], lda, + &ipiv_gpu[i * stride_ipiv], &info_gpu[i]); + }); + }); + event.wait(); + } catch (const sycl::exception &e) { + std::cout << "*** EXCEPTION caught: " << e.what() << "\n"; + free(a_gpu, context); + free(ipiv_gpu, context); + free(info_gpu, context); + return; + } + + memcpy(a, a_gpu, stride_a * batch * sizeof(double)); + memcpy(ipiv, ipiv_gpu, stride_ipiv * batch * sizeof(int64_t)); + memcpy(info, info_gpu, batch * sizeof(int64_t)); + + free(a_gpu, context); + free(ipiv_gpu, context); + free(info_gpu, context); +} + +static void fp_init(int64_t m, int64_t n, double *a, int64_t lda) { + int64_t i, j; + for (j = 0; j < n; j++) + for (i = 0; i < m; i++) + a[i + j * lda] = 2.0 * FP_RAND - 1.0; +} + +static void fp_copy(int64_t m, int64_t n, double *a, int64_t lda, double 
*b, + int64_t ldb) { + int64_t i, j; + for (j = 0; j < n; j++) + for (i = 0; i < m; i++) + b[i + j * ldb] = a[i + j * lda]; +} + +static double fp_norm1(int64_t m, int64_t n, double *a, int64_t lda) { + double sum, value = 0.0; + int64_t i, j; + for (j = 0; j < n; j++) { + sum = 0.0; + for (i = 0; i < m; i++) + sum += ABS(a[i + j * lda]); + if (value < sum) + value = sum; + } + return value; +} + +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, + double *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info) { + double thresh = 30.0; + int fail = 0; + int64_t i, j, k, l; + char label[1024]; + unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; + double res = 0.0, nrm = 0.0, ulp = *(double *)prec_b; + double *w = (double *)malloc(sizeof(double) * MAX(m * n, 1)); + + sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); + + for (k = 0; k < batch; k++) { + /* info == 0 */ + CHECK_AND_REPORT("info == 0", label, info[k] != 0, (double)info[k], fail); + + if (m > 0 && n > 0) { + /* | L U - A | / ( |A| n ulp ) */ + memset(w, 0, sizeof(double) * m * n); + if (m < n) { + for (j = 0; j < n; j++) + for (i = 0; i <= j; i++) + w[i + j * m] = a[i + j * lda + k * stride_a]; + for (i = m - 1; i >= 0; i--) + for (j = 0; j < n; j++) + for (l = 0; l < i; l++) + w[i + j * m] += a[i + l * lda + k * stride_a] * w[l + j * m]; + } else { + for (j = 0; j < n; j++) + for (i = j; i < m; i++) + w[i + j * m] = a[i + j * lda + k * stride_a]; + for (j = 0; j < n; j++) + w[j + j * m] = 1.0; + for (j = n - 1; j >= 0; j--) + for (i = 0; i < m; i++) { + w[i + j * m] *= a[j + j * lda + k * stride_a]; + for (l = 0; l < j; l++) + w[i + j * m] += w[i + l * m] * a[l + j * lda + k * stride_a]; + } + } + for (j = 0; j < n; j++) + for (i = 0; i < m; i++) + w[i + j * m] -= a_in[k * stride_a + i + j * lda]; + res = fp_norm1(m, n, w, m); + nrm = fp_norm1(m, n, &a_in[k * stride_a], lda); + nrm *= (double)n * ulp; 
+ res /= nrm > 0.0 ? nrm : ulp; + CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, + FAILED(res, thresh), res, fail); + } + } + + free(w); + return fail; +} + +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info); + +int main(int argc, char *argv[]) { + int exit_status = 0; + constexpr int64_t m = 8, n = 8, lda = 8; + int64_t stride_a = lda * n, stride_ipiv = n; + + srand(1); + + for (int i = 1; i < argc; i++) { + int64_t batch = (int64_t)atoi(argv[i]); + batch = MAX(batch, 0); + int64_t a_count = MAX(stride_a * batch, 1); + int64_t ipiv_count = MAX(stride_ipiv * batch, 1); + int64_t info_count = MAX(batch, 1); + double *a = NULL, *a_copy = NULL; + int64_t *ipiv = NULL, *info = NULL; + CHECK(a = (double *)malloc(sizeof(double) * a_count), !a); + CHECK(a_copy = (double *)malloc(sizeof(double) * a_count), !a_copy); + CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) * ipiv_count), !ipiv); + CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); + + /* Initialize input data */ + for (int64_t k = 0; k < batch; k++) { + fp_init(m, n, &a_copy[k * stride_a], lda); + fp_copy(m, n, &a_copy[k * stride_a], lda, &a[k * stride_a], lda); + } + + /* Run the tested function */ + dgetrfnp_batch_strided_c(m, n, a, lda, stride_a, ipiv, stride_ipiv, batch, + info); + + /* Check that the computation completed successfully */ + exit_status += dgetrfnp_batch_strided_check(m, n, a_copy, a, lda, stride_a, + ipiv, stride_ipiv, batch, info); + + free(a); + free(a_copy); + free(ipiv); + free(info); + } + return exit_status; +} diff --git a/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp new file mode 100644 index 0000000000..c3dfece33c --- /dev/null +++ b/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp @@ -0,0 +1,19 @@ +//==--------- dgetrf_aspect-fp64.cpp - DPC++ ESIMD on-device test ---------==// +// +// Part of the LLVM 
Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: gpu, aspect-fp64 +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -I%S/.. -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out 3 2 1 +// +// This test checks the correctness of ESIMD program for batched LU +// decomposition without pivoting. The program contains multiple branches +// corresponding to LU input sizes; all internal functions are inlined. +// + +#include "Inputs/dgetrf_aspect-fp64.hpp" diff --git a/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp new file mode 100644 index 0000000000..5b11ff3beb --- /dev/null +++ b/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp @@ -0,0 +1,19 @@ +//==------- dgetrf_ref_aspect-fp64.cpp - DPC++ ESIMD on-device test -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: gpu, aspect-fp64 +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl -DUSE_REF %s -I%S/.. -o %t.ref.out +// RUN: %GPU_RUN_PLACEHOLDER %t.ref.out 3 2 1 +// +// This test checks the correctness of ESIMD program for batched LU +// decomposition without pivoting. The program contains multiple branches +// corresponding to LU input sizes; all internal functions are inlined. 
+// + +#include "Inputs/dgetrf.hpp" From e2130bb518b13ba1058cc3e00f451feb1f52a23b Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:27:32 +0800 Subject: [PATCH 06/35] split double tests in GroupAlgorithm directory --- SYCL/GroupAlgorithm/SYCL2020/sort.cpp | 342 +----------------- SYCL/GroupAlgorithm/SYCL2020/sort.hpp | 342 ++++++++++++++++++ .../SYCL2020/sort_aspect-fp64.cpp | 38 ++ 3 files changed, 381 insertions(+), 341 deletions(-) create mode 100644 SYCL/GroupAlgorithm/SYCL2020/sort.hpp create mode 100644 SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp diff --git a/SYCL/GroupAlgorithm/SYCL2020/sort.cpp b/SYCL/GroupAlgorithm/SYCL2020/sort.cpp index 85f31e78b0..b7fe34ed3f 100644 --- a/SYCL/GroupAlgorithm/SYCL2020/sort.cpp +++ b/SYCL/GroupAlgorithm/SYCL2020/sort.cpp @@ -3,349 +3,10 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "support.h" -#include - -#include -#include -#include -#include +#include "sort.hpp" namespace oneapi_exp = sycl::ext::oneapi::experimental; -auto async_handler_ = [](sycl::exception_list ex_list) { - for (auto &ex : ex_list) { - try { - std::rethrow_exception(ex); - } catch (sycl::exception &ex) { - std::cerr << ex.what() << std::endl; - std::exit(EXIT_FAILURE); - } - } -}; - -constexpr uint32_t items_per_work_item = 4; - -struct CustomType { - int x; -}; - -struct CustomFunctor { - bool operator()(const CustomType &lhs, const CustomType &rhs) const { - return lhs.x < rhs.x; - } -}; - -template bool check(T lhs, T rhs, float epsilon) { - return sycl::abs(lhs - rhs) > epsilon; -} -bool check(CustomType lhs, CustomType rhs, float epsilon) { - return sycl::abs(lhs.x - rhs.x) > epsilon; -} - -template -bool verify(T *expected, T *got, std::size_t n, float epsilon) { - for (std::size_t i = 0; i < n; ++i) { - if (check(expected[i], got[i], epsilon)) { - return false; - } - } - return true; -} - -// forward declared classes to name kernels -template class sort_over_group_kernel_name; 
-template class joint_sort_kernel_name; -template class custom_sorter_kernel_name; - -// this class is needed to pass dimension value to aforementioned classes -template class int_wrapper; - -// custom sorter -template struct bubble_sorter { - Compare comp; - size_t idx; - - template - void operator()(Group g, Ptr begin, Ptr end) { - size_t n = end - begin; - if (idx == 0) - for (size_t i = 0; i < n; ++i) - for (size_t j = i + 1; j < n; ++j) - if (comp(begin[j], begin[i])) - std::swap(begin[i], begin[j]); - } -}; - -template sycl::range get_range(const std::size_t local); - -template <> sycl::range<1> get_range<1>(const std::size_t local) { - return sycl::range<1>(local); -} - -template <> sycl::range<2> get_range<2>(const std::size_t local) { - return sycl::range<2>(local, 1); -} - -template <> sycl::range<3> get_range<3>(const std::size_t local) { - return sycl::range<3>(local, 1, 1); -} - -template -int test_sort_over_group(sycl::queue &q, std::size_t local, - sycl::buffer &bufI1, Compare comp, int test_case) { - auto n = bufI1.size(); - if (n > local) - return -1; - - sycl::range local_range = get_range(local); - - std::size_t local_memory_size = - oneapi_exp::default_sorter<>::memory_required( - sycl::memory_scope::work_group, local_range); - - if (local_memory_size > - q.get_device().template get_info()) - std::cout << "local_memory_size = " << local_memory_size << ", available = " - << q.get_device() - .template get_info() - << std::endl; - q.submit([&](sycl::handler &h) { - auto aI1 = sycl::accessor(bufI1, h); - sycl::accessor - scratch({local_memory_size}, h); - - h.parallel_for, T, Compare>>( - sycl::nd_range(local_range, local_range), - [=](sycl::nd_item id) { - scratch[0] = std::byte{}; - auto local_id = id.get_local_linear_id(); - switch (test_case) { - case 0: - if constexpr (std::is_same_v> && - !std::is_same_v) - aI1[local_id] = oneapi_exp::sort_over_group( - oneapi_exp::group_with_scratchpad( - id.get_group(), - sycl::span{&scratch[0], 
local_memory_size}), - aI1[local_id]); - break; - case 1: - aI1[local_id] = oneapi_exp::sort_over_group( - oneapi_exp::group_with_scratchpad( - id.get_group(), - sycl::span{&scratch[0], local_memory_size}), - aI1[local_id], comp); - break; - case 2: - aI1[local_id] = oneapi_exp::sort_over_group( - id.get_group(), aI1[local_id], - oneapi_exp::default_sorter( - sycl::span{&scratch[0], local_memory_size})); - break; - } - }); - }).wait_and_throw(); - return 1; -} - -template -int test_joint_sort(sycl::queue &q, std::size_t n_items, std::size_t local, - sycl::buffer &bufI1, Compare comp, int test_case) { - auto n = bufI1.size(); - auto n_groups = (n - 1) / n_items + 1; - - std::size_t local_memory_size = - oneapi_exp::default_sorter<>::memory_required( - sycl::memory_scope::work_group, n); - if (local_memory_size > - q.get_device().template get_info()) - std::cout << "local_memory_size = " << local_memory_size << ", available = " - << q.get_device() - .template get_info() - << std::endl; - q.submit([&](sycl::handler &h) { - auto aI1 = sycl::accessor(bufI1, h); - sycl::accessor - scratch({local_memory_size}, h); - - h.parallel_for>( - sycl::nd_range<1>{{n_groups * local}, {local}}, - [=](sycl::nd_item<1> id) { - auto group_id = id.get_group(0); - auto ptr_keys = &aI1[group_id * n_items]; - // Replacing the line above with the line below also works - // auto ptr_keys = aI1.get_pointer() + group_id * n_items; - - scratch[0] = std::byte{}; - switch (test_case) { - case 0: - if constexpr (std::is_same_v> && - !std::is_same_v) - oneapi_exp::joint_sort( - oneapi_exp::group_with_scratchpad( - id.get_group(), - sycl::span{&scratch[0], local_memory_size}), - ptr_keys, - ptr_keys + sycl::min(n_items, n - group_id * n_items)); - break; - case 1: - oneapi_exp::joint_sort( - oneapi_exp::group_with_scratchpad( - id.get_group(), - sycl::span{&scratch[0], local_memory_size}), - ptr_keys, - ptr_keys + sycl::min(n_items, n - group_id * n_items), comp); - break; - case 2: - 
oneapi_exp::joint_sort( - id.get_group(), ptr_keys, - ptr_keys + sycl::min(n_items, n - group_id * n_items), - oneapi_exp::default_sorter( - sycl::span{&scratch[0], local_memory_size})); - break; - } - }); - }).wait_and_throw(); - return n_groups; -} - -template -int test_custom_sorter(sycl::queue &q, sycl::buffer &bufI1, Compare comp) { - std::size_t local = 4; - auto n = bufI1.size(); - if (n > local) - return -1; - local = std::min(local, n); - - q.submit([&](sycl::handler &h) { - auto aI1 = sycl::accessor(bufI1, h); - - h.parallel_for>( - sycl::nd_range<2>({local, 1}, {local, 1}), [=](sycl::nd_item<2> id) { - auto ptr = aI1.get_pointer(); - - oneapi_exp::joint_sort( - id.get_group(), ptr, ptr + n, - bubble_sorter{comp, id.get_local_linear_id()}); - }); - }).wait_and_throw(); - return 1; -} - -template -void run_sort(sycl::queue &q, std::vector &in, std::size_t size, - Compare comp, int test_case, int sort_case) { - std::vector in2(in.begin(), in.begin() + size); - std::vector expected(in.begin(), in.begin() + size); - constexpr size_t work_size_limit = 4; - std::size_t local = std::min( - work_size_limit, - q.get_device() - .template get_info()); - local = std::min(local, size); - auto n_items = items_per_work_item * local; - - int n_groups = 1; - { // scope to destruct buffers - sycl::buffer bufKeys(in2.data(), size); - { - switch (sort_case) { - case 0: - // this case is just to check the compilation - n_groups = test_sort_over_group<1>(q, local, bufKeys, comp, test_case); - - n_groups = test_sort_over_group<2>(q, local, bufKeys, comp, test_case); - break; - case 1: - n_groups = test_joint_sort(q, n_items, local, bufKeys, comp, test_case); - break; - case 2: - n_groups = test_custom_sorter(q, bufKeys, comp); - break; - } - } - } - - // check results - for (int i_group = 0; i_group < n_groups; ++i_group) { - std::sort(expected.begin() + i_group * n_items, - expected.begin() + std::min((i_group + 1) * n_items, size), comp); - } - if (n_groups != -1 && - 
(test_case != 0 || - test_case == 0 && std::is_same_v> && - !std::is_same_v)&&!verify(expected.data(), in2.data(), - size, 0.001f)) { - std::cerr << "Verification failed \n"; - exit(1); - } -} - -template struct test_sort_cases { - template - void operator()(sycl::queue &q, std::size_t dataSize, Compare comp, - Generator generate) { - std::vector stationaryData(dataSize); - // fill data - for (std::size_t i = 0; i < dataSize; ++i) - stationaryData[i] = generate(i); - - // run test - for (int test_case = 0; test_case < 3; ++test_case) { - for (int sort_case = 0; sort_case < 3; ++sort_case) { - run_sort(q, stationaryData, dataSize, comp, test_case, sort_case); - } - } - } -}; - -void test_custom_type(sycl::queue &q, std::size_t dataSize) { - std::vector stationaryData(dataSize, CustomType{0}); - // fill data - for (std::size_t i = 0; i < dataSize; ++i) - stationaryData[i] = CustomType{int(i)}; - - // run test - for (int test_case = 0; test_case < 1; ++test_case) { - for (int sort_case = 0; sort_case < 3; ++sort_case) { - run_sort(q, stationaryData, dataSize, CustomFunctor{}, test_case, - sort_case); - } - } -} - -template -void test_sort_by_comp(sycl::queue &q, std::size_t dataSize) { - std::default_random_engine generator; - std::normal_distribution distribution((10.0), (2.0)); - - T max_size = std::numeric_limits::max(); - std::size_t to_fill = dataSize; - if (dataSize > max_size) - to_fill = max_size; - - // reversed order - test_sort_cases()(q, to_fill, Compare{}, - [to_fill](std::size_t i) { return T(to_fill - i - 1); }); - // filled by 1 - test_sort_cases()(q, dataSize, Compare{}, - [](std::size_t) { return T(1); }); - // random distribution - test_sort_cases()(q, dataSize, Compare{}, - [&distribution, &generator](std::size_t) { - return T(distribution(generator)); - }); -} - -template -void test_sort_by_type(sycl::queue &q, std::size_t dataSize) { - test_sort_by_comp>(q, dataSize); - test_sort_by_comp>(q, dataSize); -} - int main(int argc, char *argv[]) { 
sycl::queue q(sycl::default_selector{}, async_handler_); if (!isSupportedDevice(q.get_device())) { @@ -362,7 +23,6 @@ int main(int argc, char *argv[]) { test_sort_by_type(q, sizes[i]); test_sort_by_type(q, sizes[i]); test_sort_by_type(q, sizes[i]); - test_sort_by_type(q, sizes[i]); test_sort_by_type(q, sizes[i]); test_custom_type(q, sizes[i]); diff --git a/SYCL/GroupAlgorithm/SYCL2020/sort.hpp b/SYCL/GroupAlgorithm/SYCL2020/sort.hpp new file mode 100644 index 0000000000..c6fce86fff --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/sort.hpp @@ -0,0 +1,342 @@ +#include "support.h" +#include + +#include +#include +#include +#include + +namespace oneapi_exp = sycl::ext::oneapi::experimental; + +auto async_handler_ = [](sycl::exception_list ex_list) { + for (auto &ex : ex_list) { + try { + std::rethrow_exception(ex); + } catch (sycl::exception &ex) { + std::cerr << ex.what() << std::endl; + std::exit(EXIT_FAILURE); + } + } +}; + +constexpr uint32_t items_per_work_item = 4; + +struct CustomType { + int x; +}; + +struct CustomFunctor { + bool operator()(const CustomType &lhs, const CustomType &rhs) const { + return lhs.x < rhs.x; + } +}; + +template bool check(T lhs, T rhs, float epsilon) { + return sycl::abs(lhs - rhs) > epsilon; +} +bool check(CustomType lhs, CustomType rhs, float epsilon) { + return sycl::abs(lhs.x - rhs.x) > epsilon; +} + +template +bool verify(T *expected, T *got, std::size_t n, float epsilon) { + for (std::size_t i = 0; i < n; ++i) { + if (check(expected[i], got[i], epsilon)) { + return false; + } + } + return true; +} + +// forward declared classes to name kernels +template class sort_over_group_kernel_name; +template class joint_sort_kernel_name; +template class custom_sorter_kernel_name; + +// this class is needed to pass dimension value to aforementioned classes +template class int_wrapper; + +// custom sorter +template struct bubble_sorter { + Compare comp; + size_t idx; + + template + void operator()(Group g, Ptr begin, Ptr end) { + size_t n = 
end - begin; + if (idx == 0) + for (size_t i = 0; i < n; ++i) + for (size_t j = i + 1; j < n; ++j) + if (comp(begin[j], begin[i])) + std::swap(begin[i], begin[j]); + } +}; + +template sycl::range get_range(const std::size_t local); + +template <> sycl::range<1> get_range<1>(const std::size_t local) { + return sycl::range<1>(local); +} + +template <> sycl::range<2> get_range<2>(const std::size_t local) { + return sycl::range<2>(local, 1); +} + +template <> sycl::range<3> get_range<3>(const std::size_t local) { + return sycl::range<3>(local, 1, 1); +} + +template +int test_sort_over_group(sycl::queue &q, std::size_t local, + sycl::buffer &bufI1, Compare comp, int test_case) { + auto n = bufI1.size(); + if (n > local) + return -1; + + sycl::range local_range = get_range(local); + + std::size_t local_memory_size = + oneapi_exp::default_sorter<>::memory_required( + sycl::memory_scope::work_group, local_range); + + if (local_memory_size > + q.get_device().template get_info()) + std::cout << "local_memory_size = " << local_memory_size << ", available = " + << q.get_device() + .template get_info() + << std::endl; + q.submit([&](sycl::handler &h) { + auto aI1 = sycl::accessor(bufI1, h); + sycl::accessor + scratch({local_memory_size}, h); + + h.parallel_for, T, Compare>>( + sycl::nd_range(local_range, local_range), + [=](sycl::nd_item id) { + scratch[0] = std::byte{}; + auto local_id = id.get_local_linear_id(); + switch (test_case) { + case 0: + if constexpr (std::is_same_v> && + !std::is_same_v) + aI1[local_id] = oneapi_exp::sort_over_group( + oneapi_exp::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + aI1[local_id]); + break; + case 1: + aI1[local_id] = oneapi_exp::sort_over_group( + oneapi_exp::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + aI1[local_id], comp); + break; + case 2: + aI1[local_id] = oneapi_exp::sort_over_group( + id.get_group(), aI1[local_id], + oneapi_exp::default_sorter( 
+ sycl::span{&scratch[0], local_memory_size})); + break; + } + }); + }).wait_and_throw(); + return 1; +} + +template +int test_joint_sort(sycl::queue &q, std::size_t n_items, std::size_t local, + sycl::buffer &bufI1, Compare comp, int test_case) { + auto n = bufI1.size(); + auto n_groups = (n - 1) / n_items + 1; + + std::size_t local_memory_size = + oneapi_exp::default_sorter<>::memory_required( + sycl::memory_scope::work_group, n); + if (local_memory_size > + q.get_device().template get_info()) + std::cout << "local_memory_size = " << local_memory_size << ", available = " + << q.get_device() + .template get_info() + << std::endl; + q.submit([&](sycl::handler &h) { + auto aI1 = sycl::accessor(bufI1, h); + sycl::accessor + scratch({local_memory_size}, h); + + h.parallel_for>( + sycl::nd_range<1>{{n_groups * local}, {local}}, + [=](sycl::nd_item<1> id) { + auto group_id = id.get_group(0); + auto ptr_keys = &aI1[group_id * n_items]; + // Replacing the line above with the line below also works + // auto ptr_keys = aI1.get_pointer() + group_id * n_items; + + scratch[0] = std::byte{}; + switch (test_case) { + case 0: + if constexpr (std::is_same_v> && + !std::is_same_v) + oneapi_exp::joint_sort( + oneapi_exp::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + ptr_keys, + ptr_keys + sycl::min(n_items, n - group_id * n_items)); + break; + case 1: + oneapi_exp::joint_sort( + oneapi_exp::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + ptr_keys, + ptr_keys + sycl::min(n_items, n - group_id * n_items), comp); + break; + case 2: + oneapi_exp::joint_sort( + id.get_group(), ptr_keys, + ptr_keys + sycl::min(n_items, n - group_id * n_items), + oneapi_exp::default_sorter( + sycl::span{&scratch[0], local_memory_size})); + break; + } + }); + }).wait_and_throw(); + return n_groups; +} + +template +int test_custom_sorter(sycl::queue &q, sycl::buffer &bufI1, Compare comp) { + std::size_t local = 4; + auto n 
= bufI1.size(); + if (n > local) + return -1; + local = std::min(local, n); + + q.submit([&](sycl::handler &h) { + auto aI1 = sycl::accessor(bufI1, h); + + h.parallel_for>( + sycl::nd_range<2>({local, 1}, {local, 1}), [=](sycl::nd_item<2> id) { + auto ptr = aI1.get_pointer(); + + oneapi_exp::joint_sort( + id.get_group(), ptr, ptr + n, + bubble_sorter{comp, id.get_local_linear_id()}); + }); + }).wait_and_throw(); + return 1; +} + +template +void run_sort(sycl::queue &q, std::vector &in, std::size_t size, + Compare comp, int test_case, int sort_case) { + std::vector in2(in.begin(), in.begin() + size); + std::vector expected(in.begin(), in.begin() + size); + constexpr size_t work_size_limit = 4; + std::size_t local = std::min( + work_size_limit, + q.get_device() + .template get_info()); + local = std::min(local, size); + auto n_items = items_per_work_item * local; + + int n_groups = 1; + { // scope to destruct buffers + sycl::buffer bufKeys(in2.data(), size); + { + switch (sort_case) { + case 0: + // this case is just to check the compilation + n_groups = test_sort_over_group<1>(q, local, bufKeys, comp, test_case); + + n_groups = test_sort_over_group<2>(q, local, bufKeys, comp, test_case); + break; + case 1: + n_groups = test_joint_sort(q, n_items, local, bufKeys, comp, test_case); + break; + case 2: + n_groups = test_custom_sorter(q, bufKeys, comp); + break; + } + } + } + + // check results + for (int i_group = 0; i_group < n_groups; ++i_group) { + std::sort(expected.begin() + i_group * n_items, + expected.begin() + std::min((i_group + 1) * n_items, size), comp); + } + if (n_groups != -1 && + (test_case != 0 || + test_case == 0 && std::is_same_v> && + !std::is_same_v)&&!verify(expected.data(), in2.data(), + size, 0.001f)) { + std::cerr << "Verification failed \n"; + exit(1); + } +} + +template struct test_sort_cases { + template + void operator()(sycl::queue &q, std::size_t dataSize, Compare comp, + Generator generate) { + std::vector stationaryData(dataSize); + // 
fill data + for (std::size_t i = 0; i < dataSize; ++i) + stationaryData[i] = generate(i); + + // run test + for (int test_case = 0; test_case < 3; ++test_case) { + for (int sort_case = 0; sort_case < 3; ++sort_case) { + run_sort(q, stationaryData, dataSize, comp, test_case, sort_case); + } + } + } +}; + +void test_custom_type(sycl::queue &q, std::size_t dataSize) { + std::vector stationaryData(dataSize, CustomType{0}); + // fill data + for (std::size_t i = 0; i < dataSize; ++i) + stationaryData[i] = CustomType{int(i)}; + + // run test + for (int test_case = 0; test_case < 1; ++test_case) { + for (int sort_case = 0; sort_case < 3; ++sort_case) { + run_sort(q, stationaryData, dataSize, CustomFunctor{}, test_case, + sort_case); + } + } +} + +template +void test_sort_by_comp(sycl::queue &q, std::size_t dataSize) { + std::default_random_engine generator; + std::normal_distribution distribution((10.0), (2.0)); + + T max_size = std::numeric_limits::max(); + std::size_t to_fill = dataSize; + if (dataSize > max_size) + to_fill = max_size; + + // reversed order + test_sort_cases()(q, to_fill, Compare{}, + [to_fill](std::size_t i) { return T(to_fill - i - 1); }); + // filled by 1 + test_sort_cases()(q, dataSize, Compare{}, + [](std::size_t) { return T(1); }); + // random distribution + test_sort_cases()(q, dataSize, Compare{}, + [&distribution, &generator](std::size_t) { + return T(distribution(generator)); + }); +} + +template +void test_sort_by_type(sycl::queue &q, std::size_t dataSize) { + test_sort_by_comp>(q, dataSize); + test_sort_by_comp>(q, dataSize); +} \ No newline at end of file diff --git a/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp b/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp new file mode 100644 index 0000000000..1925b84605 --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp @@ -0,0 +1,38 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . 
-o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include "sort.hpp" + +namespace oneapi_exp = sycl::ext::oneapi::experimental; + +int main(int argc, char *argv[]) { + sycl::queue q(sycl::default_selector{}, async_handler_); + + if (!q.get_device().has(sycl::aspect::fp64) { + std::cout << "Skipping test\n"; + return 0; + } + + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + std::vector sizes{1, 12, 32}; + + for (int i = 0; i < sizes.size(); ++i) { + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + test_sort_by_type(q, sizes[i]); + + test_custom_type(q, sizes[i]); + } + std::cout << "Test passed." << std::endl; +} From c459551330dba9ffc4d2943ae30aa0ceccd35957 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:33:31 +0800 Subject: [PATCH 07/35] split double tests in InlineAsm directory --- SYCL/InlineAsm/asm_float_add.cpp | 6 +++--- SYCL/InlineAsm/asm_float_imm_arg.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/SYCL/InlineAsm/asm_float_add.cpp b/SYCL/InlineAsm/asm_float_add.cpp index a9a7edd16f..f633743449 100644 --- a/SYCL/InlineAsm/asm_float_add.cpp +++ b/SYCL/InlineAsm/asm_float_add.cpp @@ -9,7 +9,7 @@ #include #include -using dataType = sycl::cl_double; +using dataType = sycl::cl_float; template struct KernelFunctor : WithInputBuffers, WithOutputBuffer { @@ -46,8 +46,8 @@ int main() { std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { - inputA[i] = (double)1 / std::pow(2, i); - inputB[i] = (double)2 / std::pow(2, i); + inputA[i] = (float)1 / std::pow(2, i); + inputB[i] = (float)2 / std::pow(2, i); } KernelFunctor<> f(inputA, inputB); diff --git 
a/SYCL/InlineAsm/asm_float_imm_arg.cpp b/SYCL/InlineAsm/asm_float_imm_arg.cpp index 13aae5c455..d2fb47000f 100644 --- a/SYCL/InlineAsm/asm_float_imm_arg.cpp +++ b/SYCL/InlineAsm/asm_float_imm_arg.cpp @@ -9,8 +9,8 @@ #include #include -constexpr double IMM_ARGUMENT = 0.5; -using dataType = sycl::cl_double; +constexpr float IMM_ARGUMENT = 0.5; +using dataType = sycl::cl_float; template struct KernelFunctor : WithInputBuffers, WithOutputBuffer { @@ -42,7 +42,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { int main() { std::vector input(DEFAULT_PROBLEM_SIZE); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) - input[i] = (double)1 / std::pow(2, i); + input[i] = (float)1 / std::pow(2, i); KernelFunctor<> f(input); if (!launchInlineASMTest(f)) From 2ca5eeaa9dd33f4bd1acd4a09e4e7486ab1e9f73 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:35:45 +0800 Subject: [PATCH 08/35] split double tests in KernelParams directory --- SYCL/KernelParams/union_kernel_param.cpp | 14 +++--- .../union_kernel_param_aspect-fp64.cpp | 46 +++++++++++++++++++ 2 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp diff --git a/SYCL/KernelParams/union_kernel_param.cpp b/SYCL/KernelParams/union_kernel_param.cpp index 0cacf310a8..c3c98dcbb3 100644 --- a/SYCL/KernelParams/union_kernel_param.cpp +++ b/SYCL/KernelParams/union_kernel_param.cpp @@ -13,27 +13,27 @@ union TestUnion { public: int myint; char mychar; - double mydouble; + float myfloat; - TestUnion() { mydouble = 0.0; }; + TestUnion() { myfloat = 0.0f; }; }; int main(int argc, char **argv) { TestUnion x; - x.mydouble = 5.0; - double mydouble = 0.0; + x.myfloat = 5.0f; + float myfloat = 0.0f; sycl::queue queue; { sycl::buffer buf(&mydouble, 1); queue.submit([&](sycl::handler &cgh) { auto acc = buf.get_access(cgh); - cgh.single_task([=]() { acc[0] = x.mydouble; }); + cgh.single_task([=]() { acc[0] = x.myfloat; }); }); } - if (mydouble != 5.0) { - 
printf("FAILED\nmydouble = %d\n", mydouble); + if (myfloat != 5.0f) { + printf("FAILED\nmyfloat = %d\n", myfloat); return 1; } return 0; diff --git a/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp b/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp new file mode 100644 index 0000000000..b8715fd992 --- /dev/null +++ b/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp @@ -0,0 +1,46 @@ +// This test checks kernel execution with union type as kernel parameters. + +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include + +union TestUnion { +public: + int myint; + char mychar; + double mydouble; + + TestUnion() { mydouble = 0.0; }; +}; + +int main(int argc, char **argv) { + TestUnion x; + x.mydouble = 5.0; + double mydouble = 0.0; + + cl::sycl::queue queue; + if (!queue.get_device().has(sycl::aspect::fp64) { + std::cout << "Skipping test\n"; + return 0; + } + + { + cl::sycl::buffer buf(&mydouble, 1); + queue.submit([&](cl::sycl::handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task([=]() { acc[0] = x.mydouble; }); + }); + } + + if (mydouble != 5.0) { + printf("FAILED\nmydouble = %d\n", mydouble); + return 1; + } + return 0; +} From a9bab5b4f1fbf4737cda415451e8ef5a3e9dd1d5 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:36:46 +0800 Subject: [PATCH 09/35] split double tests in Regression directory --- SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp b/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp index 0057048134..c7ff46673d 100644 --- a/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp +++ b/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp @@ -51,7 +51,7 @@ float find_prime_s(work 
*w) { if (number < N) { for (size_t i = 0; i < niter; ++i) { bool is_prime = !(number % 2 == 0); - const int upper_bound = sycl::sqrt(1.0 * number) + 1; + const int upper_bound = sycl::sqrt(1.0f * number) + 1; int k = 3; while (k < upper_bound && is_prime) { is_prime = !(number % k == 0); From 1083492d6de10b6a8f5dc5315a63b9c70a2a48a6 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:37:12 +0800 Subject: [PATCH 10/35] split double tests in SpecConstants directory --- SYCL/SpecConstants/2020/handler-api.cpp | 29 --- .../2020/handler-api_aspect-fp64.cpp | 133 +++++++++++++ SYCL/SpecConstants/2020/kernel-bundle-api.cpp | 35 ---- .../2020/kernel-bundle-api_aspect-fp64.cpp | 182 ++++++++++++++++++ 4 files changed, 315 insertions(+), 64 deletions(-) create mode 100644 SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp create mode 100644 SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp diff --git a/SYCL/SpecConstants/2020/handler-api.cpp b/SYCL/SpecConstants/2020/handler-api.cpp index f9df9b82a1..669a1b1c09 100644 --- a/SYCL/SpecConstants/2020/handler-api.cpp +++ b/SYCL/SpecConstants/2020/handler-api.cpp @@ -23,7 +23,6 @@ constexpr sycl::specialization_id int_id; constexpr sycl::specialization_id int_id2(2); -constexpr sycl::specialization_id double_id(3.14); constexpr sycl::specialization_id custom_type_id; class TestDefaultValuesKernel; @@ -71,19 +70,16 @@ int main() { bool test_default_values(sycl::queue q) { sycl::buffer int_buffer(1); sycl::buffer int_buffer2(1); - sycl::buffer double_buffer(1); sycl::buffer custom_type_buffer(1); q.submit([&](sycl::handler &cgh) { auto int_acc = int_buffer.get_access(cgh); auto int_acc2 = int_buffer2.get_access(cgh); - auto double_acc = double_buffer.get_access(cgh); auto custom_type_acc = custom_type_buffer.get_access(cgh); cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); int_acc2[0] = kh.get_specialization_constant(); - double_acc[0] = 
kh.get_specialization_constant(); custom_type_acc[0] = kh.get_specialization_constant(); }); }); @@ -98,10 +94,6 @@ bool test_default_values(sycl::queue q) { if (!check_value(2, int_acc2[0], "integer specialization constant")) return false; - auto double_acc = double_buffer.get_access(); - if (!check_value(3.14, double_acc[0], "double specialization constant")) - return false; - auto custom_type_acc = custom_type_buffer.get_access(); const custom_type custom_type_ref; @@ -120,10 +112,6 @@ bool test_set_and_get_on_host(sycl::queue q) { "integer specializaiton constant before setting any value")) ++errors; - if (!check_value(3.14, cgh.get_specialization_constant(), - "double specializaiton constant before setting any value")) - ++errors; - custom_type custom_type_ref; if (!check_value( custom_type_ref, cgh.get_specialization_constant(), @@ -131,10 +119,8 @@ bool test_set_and_get_on_host(sycl::queue q) { ++errors; int new_int_value = 8; - double new_double_value = 3.0; custom_type new_custom_type_value('b', 1.0, 12); cgh.set_specialization_constant(new_int_value); - cgh.set_specialization_constant(new_double_value); cgh.set_specialization_constant(new_custom_type_value); if (!check_value( @@ -142,11 +128,6 @@ bool test_set_and_get_on_host(sycl::queue q) { "integer specializaiton constant after setting a new value")) ++errors; - if (!check_value( - new_double_value, cgh.get_specialization_constant(), - "double specializaiton constant after setting a new value")) - ++errors; - if (!check_value( new_custom_type_value, cgh.get_specialization_constant(), @@ -162,30 +143,25 @@ bool test_set_and_get_on_host(sycl::queue q) { bool test_set_and_get_on_device(sycl::queue q) { sycl::buffer int_buffer(1); sycl::buffer int_buffer2(1); - sycl::buffer double_buffer(1); sycl::buffer custom_type_buffer(1); int new_int_value = 8; int new_int_value2 = 0; - double new_double_value = 3.0; custom_type new_custom_type_value('b', 1.0, 12); q.submit([&](sycl::handler &cgh) { auto int_acc = 
int_buffer.get_access(cgh); auto int_acc2 = int_buffer2.get_access(cgh); - auto double_acc = double_buffer.get_access(cgh); auto custom_type_acc = custom_type_buffer.get_access(cgh); cgh.set_specialization_constant(new_int_value); cgh.set_specialization_constant(new_int_value2); - cgh.set_specialization_constant(new_double_value); cgh.set_specialization_constant(new_custom_type_value); cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); int_acc2[0] = kh.get_specialization_constant(); - double_acc[0] = kh.get_specialization_constant(); custom_type_acc[0] = kh.get_specialization_constant(); }); }); @@ -200,11 +176,6 @@ bool test_set_and_get_on_device(sycl::queue q) { "integer specialization constant")) return false; - auto double_acc = double_buffer.get_access(); - if (!check_value(new_double_value, double_acc[0], - "double specialization constant")) - return false; - auto custom_type_acc = custom_type_buffer.get_access(); if (!check_value(new_custom_type_value, custom_type_acc[0], diff --git a/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp b/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp new file mode 100644 index 0000000000..13d002f7da --- /dev/null +++ b/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp @@ -0,0 +1,133 @@ +// This test is intended to check basic operations with SYCL 2020 specialization +// constants using sycl::handler and sycl::kernel_handler APIs: +// - test that specialization constants can be accessed in kernel and they +// have their default values if `set_specialization_constants` wasn't called +// - test that specialization constant values can be set and retrieved within +// command group scope +// - test that specialization constant values can be set within command group +// scope and correctly retrieved within a kernel +// +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out \ +// RUN: -fsycl-dead-args-optimization +// FIXME: SYCL 2020 
specialization constants are not supported on host device +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// FIXME: ACC devices use emulation path, which is not yet supported +// UNSUPPORTED: hip + +#include +#include +#include + +#include "common.hpp" + +constexpr sycl::specialization_id double_id(3.14); + +class TestDefaultValuesKernel; +class EmptyKernel; +class TestSetAndGetOnDevice; + +bool test_default_values(sycl::queue q); +bool test_set_and_get_on_host(sycl::queue q); +bool test_set_and_get_on_device(sycl::queue q); + +int main() { + auto exception_handler = [&](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "An async SYCL exception was caught: " << e.what() + << std::endl; + std::exit(1); + } + } + }; + + sycl::queue q(exception_handler); + + if (!q.get_device().has(sycl::aspect::fp64) { + std::cout << "Skipping test\n"; + return 0; + } + + if (!test_default_values(q)) { + std::cout << "Test for default values of specialization constants failed!" + << std::endl; + return 1; + } + + if (!test_set_and_get_on_host(q)) { + std::cout << "Test for set and get API on host failed!" << std::endl; + return 1; + } + + if (!test_set_and_get_on_device(q)) { + std::cout << "Test for set and get API on device failed!" 
<< std::endl; + return 1; + } + + return 0; +}; + +bool test_default_values(sycl::queue q) { + sycl::buffer double_buffer(1); + + q.submit([&](sycl::handler &cgh) { + auto double_acc = double_buffer.get_access(cgh); + cgh.single_task([=](sycl::kernel_handler kh) { + double_acc[0] = kh.get_specialization_constant(); + }); + }); + + auto double_acc = double_buffer.get_access(); + if (!check_value(3.14, double_acc[0], "double specialization constant")) + return false; + + return true; +} + +bool test_set_and_get_on_host(sycl::queue q) { + unsigned errors = 0; + q.submit([&](sycl::handler &cgh) { + if (!check_value(3.14, cgh.get_specialization_constant(), + "double specializaiton constant before setting any value")) + ++errors; + + double new_double_value = 3.0; + cgh.set_specialization_constant(new_double_value); + + if (!check_value( + new_double_value, cgh.get_specialization_constant(), + "double specializaiton constant after setting a new value")) + ++errors; + + cgh.single_task([=]() {}); + }); + + return errors == 0; +} + +bool test_set_and_get_on_device(sycl::queue q) { + sycl::buffer double_buffer(1); + + double new_double_value = 3.0; + + q.submit([&](sycl::handler &cgh) { + auto double_acc = double_buffer.get_access(cgh); + + cgh.set_specialization_constant(new_double_value); + + cgh.single_task([=](sycl::kernel_handler kh) { + double_acc[0] = kh.get_specialization_constant(); + }); + }); + + auto double_acc = double_buffer.get_access(); + if (!check_value(new_double_value, double_acc[0], + "double specialization constant")) + return false; + + return true; +} diff --git a/SYCL/SpecConstants/2020/kernel-bundle-api.cpp b/SYCL/SpecConstants/2020/kernel-bundle-api.cpp index 0278043535..1320114af0 100644 --- a/SYCL/SpecConstants/2020/kernel-bundle-api.cpp +++ b/SYCL/SpecConstants/2020/kernel-bundle-api.cpp @@ -22,7 +22,6 @@ #include "common.hpp" constexpr sycl::specialization_id int_id; -constexpr sycl::specialization_id double_id(3.14); constexpr 
sycl::specialization_id custom_type_id; class TestDefaultValuesKernel; @@ -77,7 +76,6 @@ bool test_default_values(sycl::queue q) { } sycl::buffer int_buffer(1); - sycl::buffer double_buffer(1); sycl::buffer custom_type_buffer(1); auto input_bundle = @@ -87,12 +85,10 @@ bool test_default_values(sycl::queue q) { q.submit([&](sycl::handler &cgh) { cgh.use_kernel_bundle(exec_bundle); auto int_acc = int_buffer.get_access(cgh); - auto double_acc = double_buffer.get_access(cgh); auto custom_type_acc = custom_type_buffer.get_access(cgh); cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); - double_acc[0] = kh.get_specialization_constant(); custom_type_acc[0] = kh.get_specialization_constant(); }); }); @@ -103,10 +99,6 @@ bool test_default_values(sycl::queue q) { "integer specialization constant (defined without default value)")) return false; - auto double_acc = double_buffer.get_access(); - if (!check_value(3.14, double_acc[0], "double specialization constant")) - return false; - auto custom_type_acc = custom_type_buffer.get_access(); const custom_type custom_type_ref; @@ -146,11 +138,6 @@ bool test_set_and_get_on_host(sycl::queue q) { "integer specializaiton constant before setting any value")) ++errors; - if (!check_value(3.14, - input_bundle.get_specialization_constant(), - "double specializaiton constant before setting any value")) - ++errors; - custom_type custom_type_ref; if (!check_value( custom_type_ref, @@ -160,11 +147,9 @@ bool test_set_and_get_on_host(sycl::queue q) { // Update values int new_int_value = 42; - double new_double_value = 3.0; custom_type new_custom_type_value('b', 1.0, 12); input_bundle.set_specialization_constant(new_int_value); - input_bundle.set_specialization_constant(new_double_value); input_bundle.set_specialization_constant( new_custom_type_value); @@ -174,11 +159,6 @@ bool test_set_and_get_on_host(sycl::queue q) { "integer specializaiton constant after setting a new value")) ++errors; - if 
(!check_value(new_double_value, - input_bundle.get_specialization_constant(), - "double specializaiton constant after setting a value")) - ++errors; - if (!check_value( new_custom_type_value, input_bundle.get_specialization_constant(), @@ -194,11 +174,6 @@ bool test_set_and_get_on_host(sycl::queue q) { "integer specializaiton constant after build")) ++errors; - if (!check_value(new_double_value, - exec_bundle.get_specialization_constant(), - "double specializaiton constant after build")) - ++errors; - if (!check_value(new_custom_type_value, exec_bundle.get_specialization_constant(), "custom_type specializaiton constant after build")) @@ -211,17 +186,14 @@ bool test_set_and_get_on_host(sycl::queue q) { bool test_set_and_get_on_device(sycl::queue q) { sycl::buffer int_buffer(1); - sycl::buffer double_buffer(1); sycl::buffer custom_type_buffer(1); int new_int_value = 42; - double new_double_value = 3.0; custom_type new_custom_type_value('b', 1.0, 12); auto input_bundle = sycl::get_kernel_bundle(q.get_context()); input_bundle.set_specialization_constant(new_int_value); - input_bundle.set_specialization_constant(new_double_value); input_bundle.set_specialization_constant( new_custom_type_value); auto exec_bundle = sycl::build(input_bundle); @@ -229,13 +201,11 @@ bool test_set_and_get_on_device(sycl::queue q) { q.submit([&](sycl::handler &cgh) { cgh.use_kernel_bundle(exec_bundle); auto int_acc = int_buffer.get_access(cgh); - auto double_acc = double_buffer.get_access(cgh); auto custom_type_acc = custom_type_buffer.get_access(cgh); cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); - double_acc[0] = kh.get_specialization_constant(); custom_type_acc[0] = kh.get_specialization_constant(); }); }); @@ -245,11 +215,6 @@ bool test_set_and_get_on_device(sycl::queue q) { "integer specialization constant")) return false; - auto double_acc = double_buffer.get_access(); - if (!check_value(new_double_value, double_acc[0], - "double 
specialization constant")) - return false; - auto custom_type_acc = custom_type_buffer.get_access(); if (!check_value(new_custom_type_value, custom_type_acc[0], diff --git a/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp b/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp new file mode 100644 index 0000000000..3fe7ee60cc --- /dev/null +++ b/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp @@ -0,0 +1,182 @@ +// This test is intended to check basic operations with SYCL 2020 specialization +// constants using sycl::kernel_bundle and sycl::kernel_handler APIs: +// - test that specialization constants can be accessed in kernel and they +// have their default values if `set_specialization_constants` wasn't called +// - test that specialization constant values can be set and retrieved through +// kernel_bundle APIs on host +// - test that specialization constant values can be set through kernel_bundle +// API and correctly retrieved within a kernel +// +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out \ +// RUN: -fsycl-dead-args-optimization +// FIXME: SYCL 2020 specialization constants are not supported on host device +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// FIXME: ACC devices use emulation path, which is not yet supported +// UNSUPPORTED: hip + +#include +#include +#include + +#include "common.hpp" + +constexpr sycl::specialization_id double_id(3.14); + +class TestDefaultValuesKernel; +class EmptyKernel; +class TestSetAndGetOnDevice; + +bool test_default_values(sycl::queue q); +bool test_set_and_get_on_host(sycl::queue q); +bool test_set_and_get_on_device(sycl::queue q); + +int main() { + auto exception_handler = [&](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const &e) { + std::cout << "An async SYCL exception was caught: " << e.what() + << std::endl; + 
std::exit(1); + } + } + }; + + sycl::queue q(exception_handler); + + if (!q.get_device().has(sycl::aspect::fp64) { + std::cout << "Skipping test\n"; + return 0; + } + + if (!test_default_values(q)) { + std::cout << "Test for default values of specialization constants failed!" + << std::endl; + return 1; + } + + if (!test_set_and_get_on_host(q)) { + std::cout << "Test for set and get API on host failed!" << std::endl; + return 1; + } + + if (!test_set_and_get_on_device(q)) { + std::cout << "Test for set and get API on device failed!" << std::endl; + return 1; + } + + return 0; +}; + +bool test_default_values(sycl::queue q) { + if (!sycl::has_kernel_bundle(q.get_context())) { + std::cout << "Cannot obtain kernel_bundle in input state, skipping default " + "values test" + << std::endl; + // TODO: check that online_compielr aspec is not available + return true; + } + + sycl::buffer double_buffer(1); + + auto input_bundle = + sycl::get_kernel_bundle(q.get_context()); + auto exec_bundle = sycl::build(input_bundle); + + q.submit([&](sycl::handler &cgh) { + cgh.use_kernel_bundle(exec_bundle); + auto double_acc = double_buffer.get_access(cgh); + cgh.single_task([=](sycl::kernel_handler kh) { + double_acc[0] = kh.get_specialization_constant(); + }); + }); + + auto double_acc = double_buffer.get_access(); + if (!check_value(3.14, double_acc[0], "double specialization constant")) + return false; + + return true; +} + +bool test_set_and_get_on_host(sycl::queue q) { + if (!sycl::has_kernel_bundle(q.get_context())) { + std::cout << "Cannot obtain kernel_bundle in input state, skipping default " + "values test" + << std::endl; + // TODO: check that online_compielr aspec is not available + return true; + } + + unsigned errors = 0; + + try { + auto input_bundle = + sycl::get_kernel_bundle(q.get_context()); + + if (!input_bundle.contains_specialization_constants()) { + std::cout + << "Obtained kernel_bundle is expected to contain specialization " + "constants, but it doesn't!" 
+ << std::endl; + return false; + } + + if (!check_value(3.14, + input_bundle.get_specialization_constant(), + "double specializaiton constant before setting any value")) + ++errors; + + // Update values + double new_double_value = 3.0; + + input_bundle.set_specialization_constant(new_double_value); + + // And re-check them again + if (!check_value(new_double_value, + input_bundle.get_specialization_constant(), + "double specializaiton constant after setting a value")) + ++errors; + + // Let's try to build the bundle + auto exec_bundle = sycl::build(input_bundle); + + // And ensure that updated spec constant values are still there + if (!check_value(new_double_value, + exec_bundle.get_specialization_constant(), + "double specializaiton constant after build")) + ++errors; + } catch (sycl::exception &e) { + } + + return 0 == errors; +} + +bool test_set_and_get_on_device(sycl::queue q) { + sycl::buffer double_buffer(1); + + double new_double_value = 3.0; + + auto input_bundle = + sycl::get_kernel_bundle(q.get_context()); + input_bundle.set_specialization_constant(new_double_value); + auto exec_bundle = sycl::build(input_bundle); + + q.submit([&](sycl::handler &cgh) { + cgh.use_kernel_bundle(exec_bundle); + auto double_acc = double_buffer.get_access(cgh); + + cgh.single_task([=](sycl::kernel_handler kh) { + double_acc[0] = kh.get_specialization_constant(); + }); + }); + + auto double_acc = double_buffer.get_access(); + if (!check_value(new_double_value, double_acc[0], + "double specialization constant")) + return false; + + return true; +} From 6bd2c97d92a15850be773e76745d64b32974ed67 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:38:53 +0800 Subject: [PATCH 11/35] split double tests in SubGroup directory --- SYCL/SubGroup/barrier.cpp | 68 +----- SYCL/SubGroup/barrier.hpp | 63 ++++++ SYCL/SubGroup/barrier_aspect-fp64.cpp | 28 +++ SYCL/SubGroup/generic-shuffle.cpp | 205 +----------------- SYCL/SubGroup/generic-shuffle.hpp | 200 +++++++++++++++++ 
SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp | 34 +++ SYCL/SubGroup/info.cpp | 2 +- SYCL/SubGroup/load_store.cpp | 178 +-------------- SYCL/SubGroup/load_store.hpp | 171 +++++++++++++++ SYCL/SubGroup/load_store_aspect-fp64.cpp | 46 ++++ 10 files changed, 546 insertions(+), 449 deletions(-) create mode 100644 SYCL/SubGroup/barrier.hpp create mode 100644 SYCL/SubGroup/barrier_aspect-fp64.cpp create mode 100644 SYCL/SubGroup/generic-shuffle.hpp create mode 100644 SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp create mode 100644 SYCL/SubGroup/load_store.hpp create mode 100644 SYCL/SubGroup/load_store_aspect-fp64.cpp diff --git a/SYCL/SubGroup/barrier.cpp b/SYCL/SubGroup/barrier.cpp index 38e0fb1afe..d74c175ef8 100644 --- a/SYCL/SubGroup/barrier.cpp +++ b/SYCL/SubGroup/barrier.cpp @@ -10,70 +10,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +#include "barrier.hpp" -#include "helper.hpp" -#include -#include -#include - -template class sycl_subgr; -using namespace sycl; -template -void check(queue &Queue, size_t G = 240, size_t L = 60) { - try { - nd_range<1> NdRange(G, L); - std::vector data(G); - std::iota(data.begin(), data.end(), sizeof(T)); - buffer addbuf(data.data(), range<1>(G)); - buffer sgsizebuf(1); - Queue.submit([&](handler &cgh) { - auto addacc = addbuf.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - - cgh.parallel_for>( - NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - size_t lid = SG.get_local_id().get(0); - size_t gid = NdItem.get_global_id(0); - size_t SGoff = gid - lid; - - T res = 0; - for (size_t i = 0; i <= lid; i++) { - res += addacc[SGoff + i]; - } - if constexpr (UseNewSyntax) { - group_barrier(SG); - } else { - SG.barrier(access::fence_space::global_space); - } - addacc[gid] = res; - if (NdItem.get_global_id(0) == 0) - sgsizeacc[0] = SG.get_max_local_range()[0]; - }); - }); - auto 
addacc = addbuf.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - - size_t sg_size = sgsizeacc[0]; - int WGid = -1, SGid = 0; - T add = 0; - for (int j = 0; j < G; j++) { - if (j % L % sg_size == 0) { - SGid++; - add = 0; - } - if (j % L == 0) { - WGid++; - SGid = 0; - } - add += j + sizeof(T); - exit_if_not_equal(addacc[j], add, "barrier"); - } - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} int main() { queue Queue; if (Queue.get_device().is_host()) { @@ -90,10 +28,6 @@ int main() { check(Queue); check(Queue); check(Queue); - if (Queue.get_device().has(sycl::aspect::fp64)) { - check(Queue); - check(Queue); - } std::cout << "Test passed." << std::endl; return 0; } diff --git a/SYCL/SubGroup/barrier.hpp b/SYCL/SubGroup/barrier.hpp new file mode 100644 index 0000000000..4d2dad6f50 --- /dev/null +++ b/SYCL/SubGroup/barrier.hpp @@ -0,0 +1,63 @@ +#include "helper.hpp" +#include +#include +#include + +template class sycl_subgr; +using namespace cl::sycl; +template +void check(queue &Queue, size_t G = 240, size_t L = 60) { + try { + nd_range<1> NdRange(G, L); + std::vector data(G); + std::iota(data.begin(), data.end(), sizeof(T)); + buffer addbuf(data.data(), range<1>(G)); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + auto addacc = addbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + + cgh.parallel_for>( + NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + size_t lid = SG.get_local_id().get(0); + size_t gid = NdItem.get_global_id(0); + size_t SGoff = gid - lid; + + T res = 0; + for (size_t i = 0; i <= lid; i++) { + res += addacc[SGoff + i]; + } + if constexpr (UseNewSyntax) { + group_barrier(SG); + } else { + SG.barrier(access::fence_space::global_space); + } + addacc[gid] = res; + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + }); + }); + auto addacc = addbuf.template get_access(); + auto 
sgsizeacc = sgsizebuf.get_access(); + + size_t sg_size = sgsizeacc[0]; + int WGid = -1, SGid = 0; + T add = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + add = 0; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + add += j + sizeof(T); + exit_if_not_equal(addacc[j], add, "barrier"); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} \ No newline at end of file diff --git a/SYCL/SubGroup/barrier_aspect-fp64.cpp b/SYCL/SubGroup/barrier_aspect-fp64.cpp new file mode 100644 index 0000000000..9f45b0b67e --- /dev/null +++ b/SYCL/SubGroup/barrier_aspect-fp64.cpp @@ -0,0 +1,28 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==-- barrier_aspect-fp64.cpp - SYCL sub_group barrier test ---*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "barrier.hpp" + +int main() { + queue Queue; + if (Queue.get_device().is_host()) { + std::cout << "Skipping test\n"; + return 0; + } + if (Queue.get_device().has(sycl::aspect::fp64)) { + check(Queue); + check(Queue); + } + std::cout << "Test passed." 
<< std::endl; + return 0; +} diff --git a/SYCL/SubGroup/generic-shuffle.cpp b/SYCL/SubGroup/generic-shuffle.cpp index 59b710f925..3ede067ce8 100644 --- a/SYCL/SubGroup/generic-shuffle.cpp +++ b/SYCL/SubGroup/generic-shuffle.cpp @@ -11,207 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "helper.hpp" -#include -#include -#include -#include -template class pointer_kernel; +#include "generic-shuffle.hpp" using namespace sycl; -template -void check_pointer(queue &Queue, size_t G = 256, size_t L = 64) { - try { - nd_range<1> NdRange(G, L); - buffer buf(G); - buffer buf_up(G); - buffer buf_down(G); - buffer buf_xor(G); - buffer sgsizebuf(1); - Queue.submit([&](handler &cgh) { - auto acc = buf.template get_access(cgh); - auto acc_up = buf_up.template get_access(cgh); - auto acc_down = - buf_down.template get_access(cgh); - auto acc_xor = buf_xor.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - - cgh.parallel_for( - NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - uint32_t wggid = NdItem.get_global_id(0); - uint32_t sgid = SG.get_group_id().get(0); - if (wggid == 0) - sgsizeacc[0] = SG.get_max_local_range()[0]; - - T *ptr = static_cast(0x0) + wggid; - - /*GID of middle element in every subgroup*/ - acc[NdItem.get_global_id()] = - SG.shuffle(ptr, SG.get_max_local_range()[0] / 2); - - /* Save GID-SGID */ - acc_up[NdItem.get_global_id()] = SG.shuffle_up(ptr, sgid); - - /* Save GID+SGID */ - acc_down[NdItem.get_global_id()] = SG.shuffle_down(ptr, sgid); - - /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ - acc_xor[NdItem.get_global_id()] = - SG.shuffle_xor(ptr, sgid % SG.get_max_local_range()[0]); - }); - }); - auto acc = buf.template get_access(); - auto acc_up = buf_up.template get_access(); - auto acc_down = buf_down.template get_access(); - auto acc_xor = buf_xor.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - - size_t 
sg_size = sgsizeacc[0]; - int SGid = 0; - int SGLid = 0; - int SGBeginGid = 0; - for (int j = 0; j < G; j++) { - if (j % L % sg_size == 0) { - SGid++; - SGLid = 0; - SGBeginGid = j; - } - if (j % L == 0) { - SGid = 0; - SGLid = 0; - SGBeginGid = j; - } - - /*GID of middle element in every subgroup*/ - exit_if_not_equal(acc[j], - static_cast(0x0) + - (j / L * L + SGid * sg_size + sg_size / 2), - "shuffle"); - - /* Value GID+SGID for all element except last SGID in SG*/ - if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { - exit_if_not_equal(acc_down[j], static_cast(0x0) + (j + SGid), - "shuffle_down"); - } - - /* Value GID-SGID for all element except first SGID in SG*/ - if (j % L % sg_size >= SGid) { - exit_if_not_equal(acc_up[j], static_cast(0x0) + (j - SGid), - "shuffle_up"); - } - - /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ - exit_if_not_equal(acc_xor[j], - static_cast(0x0) + - (SGBeginGid + (SGLid ^ (SGid % sg_size))), - "shuffle_xor"); - SGLid++; - } - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} - -template -void check_struct(queue &Queue, Generator &Gen, size_t G = 256, size_t L = 64) { - - // Fill a vector with values that will be shuffled - std::vector values(G); - std::generate(values.begin(), values.end(), Gen); - - try { - nd_range<1> NdRange(G, L); - buffer buf(G); - buffer buf_up(G); - buffer buf_down(G); - buffer buf_xor(G); - buffer sgsizebuf(1); - buffer buf_in(values.data(), values.size()); - Queue.submit([&](handler &cgh) { - auto acc = buf.template get_access(cgh); - auto acc_up = buf_up.template get_access(cgh); - auto acc_down = - buf_down.template get_access(cgh); - auto acc_xor = buf_xor.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - auto in = buf_in.template get_access(cgh); - - cgh.parallel_for( - NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - uint32_t wggid = NdItem.get_global_id(0); - 
uint32_t sgid = SG.get_group_id().get(0); - if (wggid == 0) - sgsizeacc[0] = SG.get_max_local_range()[0]; - - T val = in[wggid]; - - /*GID of middle element in every subgroup*/ - acc[NdItem.get_global_id()] = - SG.shuffle(val, SG.get_max_local_range()[0] / 2); - - /* Save GID-SGID */ - acc_up[NdItem.get_global_id()] = SG.shuffle_up(val, sgid); - - /* Save GID+SGID */ - acc_down[NdItem.get_global_id()] = SG.shuffle_down(val, sgid); - - /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ - acc_xor[NdItem.get_global_id()] = - SG.shuffle_xor(val, sgid % SG.get_max_local_range()[0]); - }); - }); - auto acc = buf.template get_access(); - auto acc_up = buf_up.template get_access(); - auto acc_down = buf_down.template get_access(); - auto acc_xor = buf_xor.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - - size_t sg_size = sgsizeacc[0]; - int SGid = 0; - int SGLid = 0; - int SGBeginGid = 0; - for (int j = 0; j < G; j++) { - if (j % L % sg_size == 0) { - SGid++; - SGLid = 0; - SGBeginGid = j; - } - if (j % L == 0) { - SGid = 0; - SGLid = 0; - SGBeginGid = j; - } - - /*GID of middle element in every subgroup*/ - exit_if_not_equal( - acc[j], values[j / L * L + SGid * sg_size + sg_size / 2], "shuffle"); - - /* Value GID+SGID for all element except last SGID in SG*/ - if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { - exit_if_not_equal(acc_down[j], values[j + SGid], "shuffle_down"); - } - - /* Value GID-SGID for all element except first SGID in SG*/ - if (j % L % sg_size >= SGid) { - exit_if_not_equal(acc_up[j], values[j - SGid], "shuffle_up"); - } - - /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ - exit_if_not_equal(acc_xor[j], - values[SGBeginGid + (SGLid ^ (SGid % sg_size))], - "shuffle_xor"); - SGLid++; - } - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} - int main() { queue Queue; if (Queue.get_device().is_host()) { @@ -229,12 +32,6 @@ int main() { check_struct>( Queue, 
ComplexFloatGenerator); - auto ComplexDoubleGenerator = [state = std::complex(0, 1)]() mutable { - return state += std::complex(2, 2); - }; - check_struct>( - Queue, ComplexDoubleGenerator); - std::cout << "Test passed." << std::endl; return 0; } diff --git a/SYCL/SubGroup/generic-shuffle.hpp b/SYCL/SubGroup/generic-shuffle.hpp new file mode 100644 index 0000000000..56a4149076 --- /dev/null +++ b/SYCL/SubGroup/generic-shuffle.hpp @@ -0,0 +1,200 @@ +#include "helper.hpp" +#include +#include +#include +#include +template class pointer_kernel; + +using namespace cl::sycl; + +template +void check_pointer(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer buf(G); + buffer buf_up(G); + buffer buf_down(G); + buffer buf_xor(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + auto acc = buf.template get_access(cgh); + auto acc_up = buf_up.template get_access(cgh); + auto acc_down = + buf_down.template get_access(cgh); + auto acc_xor = buf_xor.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + + cgh.parallel_for( + NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + T *ptr = static_cast(0x0) + wggid; + + /*GID of middle element in every subgroup*/ + acc[NdItem.get_global_id()] = + SG.shuffle(ptr, SG.get_max_local_range()[0] / 2); + + /* Save GID-SGID */ + acc_up[NdItem.get_global_id()] = SG.shuffle_up(ptr, sgid); + + /* Save GID+SGID */ + acc_down[NdItem.get_global_id()] = SG.shuffle_down(ptr, sgid); + + /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + acc_xor[NdItem.get_global_id()] = + SG.shuffle_xor(ptr, sgid % SG.get_max_local_range()[0]); + }); + }); + auto acc = buf.template get_access(); + auto acc_up = buf_up.template get_access(); + auto acc_down = buf_down.template get_access(); + auto 
acc_xor = buf_xor.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /*GID of middle element in every subgroup*/ + exit_if_not_equal(acc[j], + static_cast(0x0) + + (j / L * L + SGid * sg_size + sg_size / 2), + "shuffle"); + + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { + exit_if_not_equal(acc_down[j], static_cast(0x0) + (j + SGid), + "shuffle_down"); + } + + /* Value GID-SGID for all element except first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal(acc_up[j], static_cast(0x0) + (j - SGid), + "shuffle_up"); + } + + /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + exit_if_not_equal(acc_xor[j], + static_cast(0x0) + + (SGBeginGid + (SGLid ^ (SGid % sg_size))), + "shuffle_xor"); + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +template +void check_struct(queue &Queue, Generator &Gen, size_t G = 256, size_t L = 64) { + + // Fill a vector with values that will be shuffled + std::vector values(G); + std::generate(values.begin(), values.end(), Gen); + + try { + nd_range<1> NdRange(G, L); + buffer buf(G); + buffer buf_up(G); + buffer buf_down(G); + buffer buf_xor(G); + buffer sgsizebuf(1); + buffer buf_in(values.data(), values.size()); + Queue.submit([&](handler &cgh) { + auto acc = buf.template get_access(cgh); + auto acc_up = buf_up.template get_access(cgh); + auto acc_down = + buf_down.template get_access(cgh); + auto acc_xor = buf_xor.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + auto in = buf_in.template get_access(cgh); + + cgh.parallel_for( + NdRange, [=](nd_item<1> NdItem) { + 
ext::oneapi::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + T val = in[wggid]; + + /*GID of middle element in every subgroup*/ + acc[NdItem.get_global_id()] = + SG.shuffle(val, SG.get_max_local_range()[0] / 2); + + /* Save GID-SGID */ + acc_up[NdItem.get_global_id()] = SG.shuffle_up(val, sgid); + + /* Save GID+SGID */ + acc_down[NdItem.get_global_id()] = SG.shuffle_down(val, sgid); + + /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + acc_xor[NdItem.get_global_id()] = + SG.shuffle_xor(val, sgid % SG.get_max_local_range()[0]); + }); + }); + auto acc = buf.template get_access(); + auto acc_up = buf_up.template get_access(); + auto acc_down = buf_down.template get_access(); + auto acc_xor = buf_xor.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /*GID of middle element in every subgroup*/ + exit_if_not_equal( + acc[j], values[j / L * L + SGid * sg_size + sg_size / 2], "shuffle"); + + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { + exit_if_not_equal(acc_down[j], values[j + SGid], "shuffle_down"); + } + + /* Value GID-SGID for all element except first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal(acc_up[j], values[j - SGid], "shuffle_up"); + } + + /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + exit_if_not_equal(acc_xor[j], + values[SGBeginGid + (SGLid ^ (SGid % sg_size))], + "shuffle_xor"); + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} diff --git 
a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp new file mode 100644 index 0000000000..ff219bc91e --- /dev/null +++ b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp @@ -0,0 +1,34 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +//==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- C++ -*--==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "generic-shuffle.hpp" + +using namespace cl::sycl; + +int main() { + queue Queue; + if (Queue.get_device().is_host() or !Queue.get_device().has(sycl::aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + auto ComplexDoubleGenerator = [state = std::complex(0, 1)]() mutable { + return state += std::complex(2, 2); + }; + check_struct>( + Queue, ComplexDoubleGenerator); + + std::cout << "Test passed." 
<< std::endl; + return 0; +} diff --git a/SYCL/SubGroup/info.cpp b/SYCL/SubGroup/info.cpp index 74e6349849..71ae70679b 100644 --- a/SYCL/SubGroup/info.cpp +++ b/SYCL/SubGroup/info.cpp @@ -40,7 +40,7 @@ int main() { auto Kernel = KB.get_kernel(KernelID); range<2> GlobalRange{50, 40}; - buffer ABuf{GlobalRange}, BBuf{GlobalRange}, CBuf{GlobalRange}; + buffer ABuf{GlobalRange}, BBuf{GlobalRange}, CBuf{GlobalRange}; Queue.submit([&](sycl::handler &cgh) { auto A = ABuf.get_access(cgh); diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp index 854608dce7..44963631bf 100644 --- a/SYCL/SubGroup/load_store.cpp +++ b/SYCL/SubGroup/load_store.cpp @@ -15,178 +15,10 @@ // //===----------------------------------------------------------------------===// -#include "helper.hpp" -#include - -#include - -template class sycl_subgr; +#include "load_store.hpp" using namespace sycl; -template void check(queue &Queue) { - const int G = 512, L = 256; - - auto sg_sizes = Queue.get_device().get_info(); - size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end()); - - try { - nd_range<1> NdRange(G, L); - buffer syclbuf(G + max_sg_size * N); - buffer sgsizebuf(1); - { - auto acc = syclbuf.template get_access(); - for (int i = 0; i < G; i++) { - acc[i] = i; - acc[i] += 0.25; // Check that floating point types are not casted to int - } - } - Queue.submit([&](handler &cgh) { - auto acc = syclbuf.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - accessor LocalMem( - {L + max_sg_size * N}, cgh); - cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - auto SGid = SG.get_group_id().get(0); - auto SGsize = SG.get_max_local_range().get(0); - /* Avoid overlapping data ranges inside and between local groups */ - if (SGid % N == 0 && (SGid + N) * SGsize <= L) { - size_t SGOffset = SGid * SGsize; - size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; - multi_ptr mp( - &acc[WGSGoffset]); - 
multi_ptr MPL( - &LocalMem[SGOffset]); - - // half does not have full support for volatile type qualifier - using CVT = std::conditional_t, const T, - const volatile T>; - - multi_ptr mp_cv(mp); - multi_ptr MPL_CV(MPL); - // Add all values in read block - vec v(SG.load(mp)); - vec v_cv(SG.load(mp_cv)); - if (utils::cmp_vec( - v, v_cv)) // Store result only if same for non-cv and cv - SG.store(MPL, v); - vec t(utils::add_vec(SG.load(MPL))); - vec t_cv(utils::add_vec(SG.load(MPL_CV))); - if (utils::cmp_vec( - t, t_cv)) // Store result only if same for non-cv and cv - SG.store(mp, t); - } - if (NdItem.get_global_id(0) == 0) - sgsizeacc[0] = SGsize; - }); - }); - auto acc = syclbuf.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - size_t sg_size = sgsizeacc[0]; - int WGid = -1, SGid = 0; - for (int j = 0; j < (G - (sg_size * N)); j++) { - if (j % L % sg_size == 0) { - SGid++; - } - if (j % L == 0) { - WGid++; - SGid = 0; - } - T ref = 0; - if (SGid % N) { - ref = acc[j - (SGid % N) * sg_size]; - } else { - for (int i = 0; i < N; i++) { - ref += (T)(j + i * sg_size) + 0.25; - } - } - /* There is no defined out-of-range behavior for these functions. 
*/ - if ((SGid + N) * sg_size <= L) { - std::string s("Vector<"); - s += std::string(typeid(ref).name()) + std::string(",") + - std::to_string(N) + std::string(">[") + std::to_string(j) + - std::string("]"); - exit_if_not_equal(acc[j], ref, s.c_str()); - } - } - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} -template void check(queue &Queue) { - const int G = 128, L = 64; - try { - nd_range<1> NdRange(G, L); - buffer syclbuf(G); - buffer sgsizebuf(1); - { - auto acc = syclbuf.template get_access(); - for (int i = 0; i < G; i++) { - acc[i] = i; - acc[i] += 0.1; // Check that floating point types are not casted to int - } - } - - Queue.submit([&](handler &cgh) { - auto acc = syclbuf.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - accessor LocalMem( - {L}, cgh); - cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - if (NdItem.get_global_id(0) == 0) - sgsizeacc[0] = SG.get_max_local_range()[0]; - size_t SGOffset = - SG.get_group_id().get(0) * SG.get_max_local_range().get(0); - size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; - multi_ptr mp(&acc[WGSGoffset]); - multi_ptr MPL( - &LocalMem[SGOffset]); - - // half does not have full support for volatile type qualifier - using CVT = std::conditional_t, const T, - const volatile T>; - - multi_ptr mp_cv(mp); - multi_ptr MPL_CV(MPL); - T s = SG.load(mp) + (T)SG.get_local_id().get(0); - T s_cv = SG.load(mp_cv) + (T)SG.get_local_id().get(0); - if (s == s_cv) // Store result only if same for non-cv and cv - SG.store(MPL, s); - T t = SG.load(MPL) + (T)SG.get_local_id().get(0); - T t_cv = SG.load(MPL_CV) + (T)SG.get_local_id().get(0); - if (t == t_cv) // Store result only if same for non-cv and cv - SG.store(mp, t); - }); - }); - auto acc = syclbuf.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - size_t sg_size = sgsizeacc[0]; - int WGid = -1, SGid = 0; - for (int j = 0; j < 
G; j++) { - if (j % L % sg_size == 0) { - SGid++; - } - if (j % L == 0) { - WGid++; - SGid = 0; - } - std::string s("Scalar<"); - s += std::string(typeid(acc[j]).name()) + std::string(">[") + - std::to_string(j) + std::string("]"); - - exit_if_not_equal(acc[j], (T)(j + 2 * (j % L % sg_size)) + 0.1, - s.c_str()); - } - - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} - int main() { queue Queue; if (Queue.get_device().is_host()) { @@ -268,14 +100,6 @@ int main() { check(Queue); check(Queue); check(Queue); - typedef double aligned_double __attribute__((aligned(16))); - check(Queue); - check(Queue); - check(Queue); - check(Queue); - check(Queue); - check(Queue); - check(Queue); } std::cout << "Test passed." << std::endl; return 0; diff --git a/SYCL/SubGroup/load_store.hpp b/SYCL/SubGroup/load_store.hpp new file mode 100644 index 0000000000..6a9b575d07 --- /dev/null +++ b/SYCL/SubGroup/load_store.hpp @@ -0,0 +1,171 @@ +#include "helper.hpp" +#include + +#include + +template class sycl_subgr; + +using namespace cl::sycl; + +template void check(queue &Queue) { + const int G = 512, L = 256; + + auto sg_sizes = Queue.get_device().get_info(); + size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end()); + + try { + nd_range<1> NdRange(G, L); + buffer syclbuf(G + max_sg_size * N); + buffer sgsizebuf(1); + { + auto acc = syclbuf.template get_access(); + for (int i = 0; i < G; i++) { + acc[i] = i; + acc[i] += 0.25; // Check that floating point types are not casted to int + } + } + Queue.submit([&](handler &cgh) { + auto acc = syclbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + accessor LocalMem( + {L + max_sg_size * N}, cgh); + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + auto SGid = SG.get_group_id().get(0); + auto SGsize = SG.get_max_local_range().get(0); + /* Avoid overlapping data ranges inside and between local groups 
*/ + if (SGid % N == 0 && (SGid + N) * SGsize <= L) { + size_t SGOffset = SGid * SGsize; + size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; + multi_ptr mp( + &acc[WGSGoffset]); + multi_ptr MPL( + &LocalMem[SGOffset]); + + // half does not have full support for volatile type qualifier + using CVT = std::conditional_t, const T, + const volatile T>; + + multi_ptr mp_cv(mp); + multi_ptr MPL_CV(MPL); + // Add all values in read block + vec v(SG.load(mp)); + vec v_cv(SG.load(mp_cv)); + if (utils::cmp_vec( + v, v_cv)) // Store result only if same for non-cv and cv + SG.store(MPL, v); + vec t(utils::add_vec(SG.load(MPL))); + vec t_cv(utils::add_vec(SG.load(MPL_CV))); + if (utils::cmp_vec( + t, t_cv)) // Store result only if same for non-cv and cv + SG.store(mp, t); + } + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SGsize; + }); + }); + auto acc = syclbuf.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + size_t sg_size = sgsizeacc[0]; + int WGid = -1, SGid = 0; + for (int j = 0; j < (G - (sg_size * N)); j++) { + if (j % L % sg_size == 0) { + SGid++; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + T ref = 0; + if (SGid % N) { + ref = acc[j - (SGid % N) * sg_size]; + } else { + for (int i = 0; i < N; i++) { + ref += (T)(j + i * sg_size) + 0.25; + } + } + /* There is no defined out-of-range behavior for these functions. 
*/ + if ((SGid + N) * sg_size <= L) { + std::string s("Vector<"); + s += std::string(typeid(ref).name()) + std::string(",") + + std::to_string(N) + std::string(">[") + std::to_string(j) + + std::string("]"); + exit_if_not_equal(acc[j], ref, s.c_str()); + } + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} +template void check(queue &Queue) { + const int G = 128, L = 64; + try { + nd_range<1> NdRange(G, L); + buffer syclbuf(G); + buffer sgsizebuf(1); + { + auto acc = syclbuf.template get_access(); + for (int i = 0; i < G; i++) { + acc[i] = i; + acc[i] += 0.1; // Check that floating point types are not casted to int + } + } + + Queue.submit([&](handler &cgh) { + auto acc = syclbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + accessor LocalMem( + {L}, cgh); + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + size_t SGOffset = + SG.get_group_id().get(0) * SG.get_max_local_range().get(0); + size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; + multi_ptr mp(&acc[WGSGoffset]); + multi_ptr MPL( + &LocalMem[SGOffset]); + + // half does not have full support for volatile type qualifier + using CVT = std::conditional_t, const T, + const volatile T>; + + multi_ptr mp_cv(mp); + multi_ptr MPL_CV(MPL); + T s = SG.load(mp) + (T)SG.get_local_id().get(0); + T s_cv = SG.load(mp_cv) + (T)SG.get_local_id().get(0); + if (s == s_cv) // Store result only if same for non-cv and cv + SG.store(MPL, s); + T t = SG.load(MPL) + (T)SG.get_local_id().get(0); + T t_cv = SG.load(MPL_CV) + (T)SG.get_local_id().get(0); + if (t == t_cv) // Store result only if same for non-cv and cv + SG.store(mp, t); + }); + }); + auto acc = syclbuf.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + size_t sg_size = sgsizeacc[0]; + int WGid = -1, SGid = 0; + for (int j = 0; j < 
G; j++) { + if (j % L % sg_size == 0) { + SGid++; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + std::string s("Scalar<"); + s += std::string(typeid(acc[j]).name()) + std::string(">[") + + std::to_string(j) + std::string("]"); + + exit_if_not_equal(acc[j], (T)(j + 2 * (j % L % sg_size)) + 0.1, + s.c_str()); + } + + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} \ No newline at end of file diff --git a/SYCL/SubGroup/load_store_aspect-fp64.cpp b/SYCL/SubGroup/load_store_aspect-fp64.cpp new file mode 100644 index 0000000000..ef96ab2b6b --- /dev/null +++ b/SYCL/SubGroup/load_store_aspect-fp64.cpp @@ -0,0 +1,46 @@ +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_SubgroupBlockReadINTEL, __spirv_SubgroupBlockWriteINTEL on +// AMD +// XFAIL: hip_amd +// +//==----- load_store_aspect-fp64.cpp - SYCL sub_group load/store test ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "load_store.hpp" + +using namespace cl::sycl; + +int main() { + queue Queue; + if (Queue.get_device().is_host() or !Queue.get_device().has(sycl::aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + std::string PlatformName = + Queue.get_device().get_platform().get_info(); + auto Vec = Queue.get_device().get_info(); + if (std::find(Vec.begin(), Vec.end(), "cl_intel_subgroups_long") != + std::end(Vec) || + PlatformName.find("CUDA") != std::string::npos) { + typedef double aligned_double __attribute__((aligned(16))); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + } + std::cout << "Test passed." << std::endl; + return 0; +} From 0cd6b543568988a9ae7b04b4df0157d24a3700b4 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 15:39:42 +0800 Subject: [PATCH 12/35] split double tests in USM directory --- SYCL/USM/copy.cpp | 65 ++--------------------- SYCL/USM/copy.hpp | 56 ++++++++++++++++++++ SYCL/USM/copy_aspect-fp64.cpp | 59 +++++++++++++++++++++ SYCL/USM/fill.cpp | 98 ++--------------------------------- SYCL/USM/fill.hpp | 92 ++++++++++++++++++++++++++++++++ SYCL/USM/fill_aspect-fp64.cpp | 44 ++++++++++++++++ 6 files changed, 257 insertions(+), 157 deletions(-) create mode 100644 SYCL/USM/copy.hpp create mode 100644 SYCL/USM/copy_aspect-fp64.cpp create mode 100644 SYCL/USM/fill.hpp create mode 100644 SYCL/USM/fill_aspect-fp64.cpp diff --git a/SYCL/USM/copy.cpp b/SYCL/USM/copy.cpp index 2613326756..c359962c45 100644 --- a/SYCL/USM/copy.cpp +++ b/SYCL/USM/copy.cpp @@ -12,15 +12,11 @@ // RUN: %GPU_RUN_PLACEHOLDER %t1.out // RUN: %ACC_RUN_PLACEHOLDER %t1.out -#include +#include "copy.hpp"; using namespace sycl; using namespace sycl::usm; -template class transfer; - -static constexpr int N = 100; // should be even - struct 
test_struct { short a; int b; @@ -28,67 +24,18 @@ struct test_struct { long long d; half e; float f; - double g; }; bool operator==(const test_struct &lhs, const test_struct &rhs) { return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d && - lhs.e == rhs.e && lhs.f == rhs.f && lhs.g == rhs.g; -} - -template T *regular(queue q, alloc kind) { - return malloc(N, q, kind); -} - -template T *aligned(queue q, alloc kind) { - return aligned_alloc(alignof(long long), N, q, kind); -} - -template void test(queue q, T val, T *src, T *dst, bool dev_dst) { - q.fill(src, val, N).wait(); - - // Use queue::copy for the first half and handler::copy for the second - q.copy(src, dst, N / 2).wait(); - q.submit([&](handler &h) { h.copy(src + N / 2, dst + N / 2, N / 2); }).wait(); - - T *out = dst; - - std::array arr; - if (dev_dst) { // if copied to device, transfer data back to host - buffer buf{arr}; - q.submit([&](handler &h) { - accessor acc{buf, h}; - h.parallel_for>(N, [=](id<1> i) { acc[i] = dst[i]; }); - }); - out = arr.data(); - } - - for (int i = 0; i < N; ++i) { - assert(out[i] == val); - } - - free(src, q); - free(dst, q); -} - -template void runTests(queue q, T val, alloc kind1, alloc kind2) { - bool dev_dst1 = (kind1 == alloc::device); - bool dev_dst2 = (kind2 == alloc::device); - test(q, val, regular(q, kind1), regular(q, kind2), dev_dst2); - test(q, val, regular(q, kind2), regular(q, kind1), dev_dst1); - test(q, val, aligned(q, kind1), aligned(q, kind2), dev_dst2); - test(q, val, aligned(q, kind2), aligned(q, kind1), dev_dst1); - test(q, val, regular(q, kind1), aligned(q, kind2), dev_dst2); - test(q, val, regular(q, kind2), aligned(q, kind1), dev_dst1); - test(q, val, aligned(q, kind1), regular(q, kind2), dev_dst2); - test(q, val, aligned(q, kind2), regular(q, kind1), dev_dst1); + lhs.e == rhs.e && lhs.f == rhs.f; } int main() { queue q; auto dev = q.get_device(); - test_struct test_obj{4, 42, 424, 4242, 4.2f, 4.242f, 4.24242}; + test_struct 
test_obj{4, 42, 424, 4242, 4.2f, 4.242f}; if (dev.has(aspect::usm_host_allocations)) { runTests(q, 4, alloc::host, alloc::host); @@ -97,7 +44,6 @@ int main() { runTests(q, 4242, alloc::host, alloc::host); runTests(q, half(4.2f), alloc::host, alloc::host); runTests(q, 4.242f, alloc::host, alloc::host); - runTests(q, 4.24242, alloc::host, alloc::host); runTests(q, test_obj, alloc::host, alloc::host); } @@ -108,7 +54,6 @@ int main() { runTests(q, 4242, alloc::shared, alloc::shared); runTests(q, half(4.2f), alloc::shared, alloc::shared); runTests(q, 4.242f, alloc::shared, alloc::shared); - runTests(q, 4.24242, alloc::shared, alloc::shared); runTests(q, test_obj, alloc::shared, alloc::shared); } @@ -119,7 +64,6 @@ int main() { runTests(q, 4242, alloc::device, alloc::device); runTests(q, half(4.2f), alloc::device, alloc::device); runTests(q, 4.242f, alloc::device, alloc::device); - runTests(q, 4.24242, alloc::device, alloc::device); runTests(q, test_obj, alloc::device, alloc::device); } @@ -131,7 +75,6 @@ int main() { runTests(q, 4242, alloc::host, alloc::shared); runTests(q, half(4.2f), alloc::host, alloc::shared); runTests(q, 4.242f, alloc::host, alloc::shared); - runTests(q, 4.24242, alloc::host, alloc::shared); runTests(q, test_obj, alloc::host, alloc::shared); } @@ -143,7 +86,6 @@ int main() { runTests(q, 4242, alloc::host, alloc::device); runTests(q, half(4.2f), alloc::host, alloc::device); runTests(q, 4.242f, alloc::host, alloc::device); - runTests(q, 4.24242, alloc::host, alloc::device); runTests(q, test_obj, alloc::host, alloc::device); } @@ -155,7 +97,6 @@ int main() { runTests(q, 4242, alloc::shared, alloc::device); runTests(q, half(4.2f), alloc::shared, alloc::device); runTests(q, 4.242f, alloc::shared, alloc::device); - runTests(q, 4.24242, alloc::shared, alloc::device); runTests(q, test_obj, alloc::shared, alloc::device); } diff --git a/SYCL/USM/copy.hpp b/SYCL/USM/copy.hpp new file mode 100644 index 0000000000..1b0a7b0f15 --- /dev/null +++ 
b/SYCL/USM/copy.hpp @@ -0,0 +1,56 @@ +#include + +using namespace sycl; +using namespace sycl::usm; + +template class transfer; + +static constexpr int N = 100; // should be even + +template T *regular(queue q, alloc kind) { + return malloc(N, q, kind); +} + +template T *aligned(queue q, alloc kind) { + return aligned_alloc(alignof(long long), N, q, kind); +} + +template void test(queue q, T val, T *src, T *dst, bool dev_dst) { + q.fill(src, val, N).wait(); + + // Use queue::copy for the first half and handler::copy for the second + q.copy(src, dst, N / 2).wait(); + q.submit([&](handler &h) { h.copy(src + N / 2, dst + N / 2, N / 2); }).wait(); + + T *out = dst; + + std::array arr; + if (dev_dst) { // if copied to device, transfer data back to host + buffer buf{arr}; + q.submit([&](handler &h) { + accessor acc{buf, h}; + h.parallel_for>(N, [=](id<1> i) { acc[i] = dst[i]; }); + }); + out = arr.data(); + } + + for (int i = 0; i < N; ++i) { + assert(out[i] == val); + } + + free(src, q); + free(dst, q); +} + +template void runTests(queue q, T val, alloc kind1, alloc kind2) { + bool dev_dst1 = (kind1 == alloc::device); + bool dev_dst2 = (kind2 == alloc::device); + test(q, val, regular(q, kind1), regular(q, kind2), dev_dst2); + test(q, val, regular(q, kind2), regular(q, kind1), dev_dst1); + test(q, val, aligned(q, kind1), aligned(q, kind2), dev_dst2); + test(q, val, aligned(q, kind2), aligned(q, kind1), dev_dst1); + test(q, val, regular(q, kind1), aligned(q, kind2), dev_dst2); + test(q, val, regular(q, kind2), aligned(q, kind1), dev_dst1); + test(q, val, aligned(q, kind1), regular(q, kind2), dev_dst2); + test(q, val, aligned(q, kind2), regular(q, kind1), dev_dst1); +} \ No newline at end of file diff --git a/SYCL/USM/copy_aspect-fp64.cpp b/SYCL/USM/copy_aspect-fp64.cpp new file mode 100644 index 0000000000..46f9765ab8 --- /dev/null +++ b/SYCL/USM/copy_aspect-fp64.cpp @@ -0,0 +1,59 @@ +//==---- copy_aspect-fp64.cp - USM copy test 
------------------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out +// RUN: %ACC_RUN_PLACEHOLDER %t1.out + +#include "copy.hpp"; + +using namespace sycl; +using namespace sycl::usm; + +int main() { + queue q; + + if (!q.get_device().has(aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + auto dev = q.get_device(); + + if (dev.has(aspect::usm_host_allocations)) { + runTests(q, 4.24242, alloc::host, alloc::host); + } + + if (dev.has(aspect::usm_shared_allocations)) { + runTests(q, 4.24242, alloc::shared, alloc::shared); + } + + if (dev.has(aspect::usm_device_allocations)) { + runTests(q, 4.24242, alloc::device, alloc::device); + } + + if (dev.has(aspect::usm_host_allocations) && + dev.has(aspect::usm_shared_allocations)) { + runTests(q, 4.24242, alloc::host, alloc::shared); + } + + if (dev.has(aspect::usm_host_allocations) && + dev.has(aspect::usm_device_allocations)) { + runTests(q, 4.24242, alloc::host, alloc::device); + } + + if (dev.has(aspect::usm_shared_allocations) && + dev.has(aspect::usm_device_allocations)) { + runTests(q, 4.24242, alloc::shared, alloc::device); + } + + return 0; +} diff --git a/SYCL/USM/fill.cpp b/SYCL/USM/fill.cpp index c3b96abcad..2ef72638d1 100644 --- a/SYCL/USM/fill.cpp +++ b/SYCL/USM/fill.cpp @@ -12,15 +12,10 @@ // RUN: %GPU_RUN_PLACEHOLDER %t1.out // RUN: %ACC_RUN_PLACEHOLDER %t1.out -#include +#include "fill.hpp" using namespace sycl; -template class usm_device_transfer; -template class usm_aligned_device_transfer; - -static constexpr 
int N = 100; - struct test_struct { short a; int b; @@ -28,104 +23,20 @@ struct test_struct { long long d; sycl::half e; float f; - double g; }; bool operator==(const test_struct &lhs, const test_struct &rhs) { return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d && - lhs.e == rhs.e && lhs.f == rhs.f && lhs.g == rhs.g; -} - -template -void runHostTests(device dev, context ctxt, queue q, T val) { - T *array; - - array = (T *)malloc_host(N * sizeof(T), q); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - for (int i = 0; i < N; ++i) { - assert(array[i] == val); - } - free(array, ctxt); - - array = (T *)aligned_alloc_host(alignof(long long), N * sizeof(T), ctxt); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - for (int i = 0; i < N; ++i) { - assert(array[i] == val); - } - free(array, ctxt); -} - -template -void runSharedTests(device dev, context ctxt, queue q, T val) { - T *array; - - array = (T *)malloc_shared(N * sizeof(T), q); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - for (int i = 0; i < N; ++i) { - assert(array[i] == val); - } - free(array, ctxt); - - array = - (T *)aligned_alloc_shared(alignof(long long), N * sizeof(T), dev, ctxt); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - for (int i = 0; i < N; ++i) { - assert(array[i] == val); - } - free(array, ctxt); + lhs.e == rhs.e && lhs.f == rhs.f; } -template -void runDeviceTests(device dev, context ctxt, queue q, T val) { - T *array; - std::vector out; - out.resize(N); - - array = (T *)malloc_device(N * sizeof(T), q); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - - { - buffer buf{&out[0], range<1>{N}}; - q.submit([&](handler &h) { - auto acc = buf.template get_access(h); - h.parallel_for>( - range<1>(N), [=](id<1> item) { acc[item] = array[item]; }); - }).wait(); - } - - for (int i = 0; i < N; ++i) { - assert(out[i] == val); - } - free(array, ctxt); - - out.clear(); - out.resize(N); - - array = - (T 
*)aligned_alloc_device(alignof(long long), N * sizeof(T), dev, ctxt); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - - { - buffer buf{&out[0], range<1>{N}}; - q.submit([&](handler &h) { - auto acc = buf.template get_access(h); - h.parallel_for>( - range<1>(N), [=](id<1> item) { acc[item] = array[item]; }); - }).wait(); - } - - for (int i = 0; i < N; ++i) { - assert(out[i] == val); - } - free(array, ctxt); -} int main() { queue q; auto dev = q.get_device(); auto ctxt = q.get_context(); - test_struct test_obj{4, 42, 424, 4242, 4.2f, 4.242, 4.24242}; + test_struct test_obj{4, 42, 424, 4242, 4.2f, 4.242}; if (dev.get_info()) { runHostTests(dev, ctxt, q, 4); @@ -134,7 +45,6 @@ int main() { runHostTests(dev, ctxt, q, 4242); runHostTests(dev, ctxt, q, sycl::half(4.2f)); runHostTests(dev, ctxt, q, 4.242f); - runHostTests(dev, ctxt, q, 4.24242); runHostTests(dev, ctxt, q, test_obj); } @@ -145,7 +55,6 @@ int main() { runSharedTests(dev, ctxt, q, 4242); runSharedTests(dev, ctxt, q, sycl::half(4.2f)); runSharedTests(dev, ctxt, q, 4.242f); - runSharedTests(dev, ctxt, q, 4.24242); runSharedTests(dev, ctxt, q, test_obj); } @@ -156,7 +65,6 @@ int main() { runDeviceTests(dev, ctxt, q, 4242); runDeviceTests(dev, ctxt, q, sycl::half(4.2f)); runDeviceTests(dev, ctxt, q, 4.242f); - runDeviceTests(dev, ctxt, q, 4.24242); runDeviceTests(dev, ctxt, q, test_obj); } diff --git a/SYCL/USM/fill.hpp b/SYCL/USM/fill.hpp new file mode 100644 index 0000000000..3734ebcf90 --- /dev/null +++ b/SYCL/USM/fill.hpp @@ -0,0 +1,92 @@ +#include + +using namespace cl::sycl; + +template class usm_device_transfer; +template class usm_aligned_device_transfer; + +static constexpr int N = 100; + +template +void runHostTests(device dev, context ctxt, queue q, T val) { + T *array; + + array = (T *)malloc_host(N * sizeof(T), q); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + for (int i = 0; i < N; ++i) { + assert(array[i] == val); + } + free(array, ctxt); + + array = (T 
*)aligned_alloc_host(alignof(long long), N * sizeof(T), ctxt); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + for (int i = 0; i < N; ++i) { + assert(array[i] == val); + } + free(array, ctxt); +} + +template +void runSharedTests(device dev, context ctxt, queue q, T val) { + T *array; + + array = (T *)malloc_shared(N * sizeof(T), q); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + for (int i = 0; i < N; ++i) { + assert(array[i] == val); + } + free(array, ctxt); + + array = + (T *)aligned_alloc_shared(alignof(long long), N * sizeof(T), dev, ctxt); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + for (int i = 0; i < N; ++i) { + assert(array[i] == val); + } + free(array, ctxt); +} + +template +void runDeviceTests(device dev, context ctxt, queue q, T val) { + T *array; + std::vector out; + out.resize(N); + + array = (T *)malloc_device(N * sizeof(T), q); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + + { + buffer buf{&out[0], range<1>{N}}; + q.submit([&](handler &h) { + auto acc = buf.template get_access(h); + h.parallel_for>( + range<1>(N), [=](id<1> item) { acc[item] = array[item]; }); + }).wait(); + } + + for (int i = 0; i < N; ++i) { + assert(out[i] == val); + } + free(array, ctxt); + + out.clear(); + out.resize(N); + + array = + (T *)aligned_alloc_device(alignof(long long), N * sizeof(T), dev, ctxt); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + + { + buffer buf{&out[0], range<1>{N}}; + q.submit([&](handler &h) { + auto acc = buf.template get_access(h); + h.parallel_for>( + range<1>(N), [=](id<1> item) { acc[item] = array[item]; }); + }).wait(); + } + + for (int i = 0; i < N; ++i) { + assert(out[i] == val); + } + free(array, ctxt); +} diff --git a/SYCL/USM/fill_aspect-fp64.cpp b/SYCL/USM/fill_aspect-fp64.cpp new file mode 100644 index 0000000000..d29ba14fee --- /dev/null +++ b/SYCL/USM/fill_aspect-fp64.cpp @@ -0,0 +1,44 @@ +//==---- fill_aspect-fp64.cpp - USM fill test for double 
type ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %HOST_RUN_PLACEHOLDER %t1.out +// RUN: %CPU_RUN_PLACEHOLDER %t1.out +// RUN: %GPU_RUN_PLACEHOLDER %t1.out +// RUN: %ACC_RUN_PLACEHOLDER %t1.out + +#include "fill.hpp"; + +using namespace cl::sycl; + +int main() { + queue q; + + if (!q.get_device().has(aspect::fp64)) { + std::cout << "Skipping test\n"; + return 0; + } + + auto dev = q.get_device(); + auto ctxt = q.get_context(); + + if (dev.get_info()) { + runHostTests(dev, ctxt, q, 4.24242); + } + + if (dev.get_info()) { + runHostTests(dev, ctxt, q, 4.24242); + } + + if (dev.get_info()) { + runHostTests(dev, ctxt, q, 4.24242); + } + + return 0; +} From 6fd342901c5bd51b3c43615fe4245bb24975b331 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 16 Aug 2022 16:23:32 +0800 Subject: [PATCH 13/35] remove deprecated namespace cl:: --- SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp | 4 ++-- SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp | 2 +- SYCL/ESIMD/api/saturation_smoke.hpp | 2 +- SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp | 2 +- SYCL/ESIMD/api/unary_ops_heavy.hpp | 2 +- SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp | 2 +- SYCL/ESIMD/ext_math.cpp | 2 +- SYCL/ESIMD/ext_math.hpp | 6 +++--- SYCL/ESIMD/ext_math_aspect-fp64.cpp | 2 +- SYCL/SubGroup/barrier.hpp | 4 ++-- SYCL/SubGroup/generic-shuffle.hpp | 2 +- SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp | 2 +- SYCL/SubGroup/load_store.hpp | 4 ++-- SYCL/SubGroup/load_store_aspect-fp64.cpp | 2 +- SYCL/USM/fill.hpp | 2 +- SYCL/USM/fill_aspect-fp64.cpp | 2 +- 16 files changed, 21 insertions(+), 21 deletions(-) diff --git 
a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp index 71078d31c5..5ce313d9e4 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp @@ -1,10 +1,10 @@ #include "../esimd_test_utils.hpp" -#include #include #include +#include -using namespace cl::sycl; +using namespace sycl; using namespace sycl::ext::intel::esimd; template class TestID; diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp index 131728367f..7c44d90eaa 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp @@ -26,7 +26,7 @@ #include "bin_and_cmp_ops_heavy.hpp" -using namespace cl::sycl; +using namespace sycl; using namespace sycl::ext::intel::esimd; int main(void) { diff --git a/SYCL/ESIMD/api/saturation_smoke.hpp b/SYCL/ESIMD/api/saturation_smoke.hpp index 9b11b97150..6ce0b8d81c 100644 --- a/SYCL/ESIMD/api/saturation_smoke.hpp +++ b/SYCL/ESIMD/api/saturation_smoke.hpp @@ -4,7 +4,7 @@ #include #include -using namespace cl::sycl; +using namespace sycl; using namespace sycl::ext::intel::esimd; template struct char_to_int { diff --git a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp index 3a18cbabf4..1de0bd4fd5 100644 --- a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp @@ -16,7 +16,7 @@ #include "saturation_smoke.hpp" -using namespace cl::sycl; +using namespace sycl; using namespace sycl::ext::intel::esimd; // clang-format on diff --git a/SYCL/ESIMD/api/unary_ops_heavy.hpp b/SYCL/ESIMD/api/unary_ops_heavy.hpp index babd26a5cd..49bc7e3273 100644 --- a/SYCL/ESIMD/api/unary_ops_heavy.hpp +++ b/SYCL/ESIMD/api/unary_ops_heavy.hpp @@ -4,7 +4,7 @@ #include #include -using namespace cl::sycl; +using namespace sycl; using namespace sycl::ext::intel::esimd; template class TestID; diff --git 
a/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp index 09f6a09fb4..b5281519aa 100644 --- a/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp @@ -25,7 +25,7 @@ #include "unary_ops_heavy.hpp" -using namespace cl::sycl; +using namespace sycl; using namespace sycl::ext::intel::esimd; int main(void) { diff --git a/SYCL/ESIMD/ext_math.cpp b/SYCL/ESIMD/ext_math.cpp index 3dd65e823f..4e6671f83d 100644 --- a/SYCL/ESIMD/ext_math.cpp +++ b/SYCL/ESIMD/ext_math.cpp @@ -19,7 +19,7 @@ #include "ext_math.hpp" -using namespace cl::sycl; +using namespace sycl; using namespace sycl::ext::intel; // --- The entry point diff --git a/SYCL/ESIMD/ext_math.hpp b/SYCL/ESIMD/ext_math.hpp index 8e44f2317f..4a8e5926fa 100644 --- a/SYCL/ESIMD/ext_math.hpp +++ b/SYCL/ESIMD/ext_math.hpp @@ -7,7 +7,7 @@ #include #include -using namespace cl::sycl; +using namespace sycl; using namespace sycl::ext::intel; // --- Data initialization functions @@ -314,10 +314,10 @@ bool test(queue &Q, const std::string &Name, buffer BufC(C, range<1>(Size)); // number of workgroups - cl::sycl::range<1> GlobalRange{Size / N}; + sycl::range<1> GlobalRange{Size / N}; // threads (workitems) in each workgroup - cl::sycl::range<1> LocalRange{1}; + sycl::range<1> LocalRange{1}; auto E = Q.submit([&](handler &CGH) { auto PA = BufA.template get_access(CGH); diff --git a/SYCL/ESIMD/ext_math_aspect-fp64.cpp b/SYCL/ESIMD/ext_math_aspect-fp64.cpp index 377e648e17..91bbe27e58 100644 --- a/SYCL/ESIMD/ext_math_aspect-fp64.cpp +++ b/SYCL/ESIMD/ext_math_aspect-fp64.cpp @@ -19,7 +19,7 @@ #include "ext_math.hpp" -using namespace cl::sycl; +using namespace sycl; using namespace sycl::ext::intel; // --- The entry point diff --git a/SYCL/SubGroup/barrier.hpp b/SYCL/SubGroup/barrier.hpp index 4d2dad6f50..cccffc2d72 100644 --- a/SYCL/SubGroup/barrier.hpp +++ b/SYCL/SubGroup/barrier.hpp @@ -4,7 +4,7 @@ #include template class sycl_subgr; -using 
namespace cl::sycl; +using namespace sycl; template void check(queue &Queue, size_t G = 240, size_t L = 60) { try { @@ -60,4 +60,4 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) { std::cout << "SYCL exception caught: " << e.what(); exit(1); } -} \ No newline at end of file +} diff --git a/SYCL/SubGroup/generic-shuffle.hpp b/SYCL/SubGroup/generic-shuffle.hpp index 56a4149076..96c6c3783f 100644 --- a/SYCL/SubGroup/generic-shuffle.hpp +++ b/SYCL/SubGroup/generic-shuffle.hpp @@ -5,7 +5,7 @@ #include template class pointer_kernel; -using namespace cl::sycl; +using namespace sycl; template void check_pointer(queue &Queue, size_t G = 256, size_t L = 64) { diff --git a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp index ff219bc91e..97e0e05897 100644 --- a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp +++ b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp @@ -14,7 +14,7 @@ #include "generic-shuffle.hpp" -using namespace cl::sycl; +using namespace sycl; int main() { queue Queue; diff --git a/SYCL/SubGroup/load_store.hpp b/SYCL/SubGroup/load_store.hpp index 6a9b575d07..b52b1d0c05 100644 --- a/SYCL/SubGroup/load_store.hpp +++ b/SYCL/SubGroup/load_store.hpp @@ -5,7 +5,7 @@ template class sycl_subgr; -using namespace cl::sycl; +using namespace sycl; template void check(queue &Queue) { const int G = 512, L = 256; @@ -168,4 +168,4 @@ template void check(queue &Queue) { std::cout << "SYCL exception caught: " << e.what(); exit(1); } -} \ No newline at end of file +} diff --git a/SYCL/SubGroup/load_store_aspect-fp64.cpp b/SYCL/SubGroup/load_store_aspect-fp64.cpp index ef96ab2b6b..d117d2ebae 100644 --- a/SYCL/SubGroup/load_store_aspect-fp64.cpp +++ b/SYCL/SubGroup/load_store_aspect-fp64.cpp @@ -18,7 +18,7 @@ #include "load_store.hpp" -using namespace cl::sycl; +using namespace sycl; int main() { queue Queue; diff --git a/SYCL/USM/fill.hpp b/SYCL/USM/fill.hpp index 3734ebcf90..a28ecbfff2 100644 --- a/SYCL/USM/fill.hpp +++ 
b/SYCL/USM/fill.hpp @@ -1,6 +1,6 @@ #include -using namespace cl::sycl; +using namespace sycl; template class usm_device_transfer; template class usm_aligned_device_transfer; diff --git a/SYCL/USM/fill_aspect-fp64.cpp b/SYCL/USM/fill_aspect-fp64.cpp index d29ba14fee..49d17c2d02 100644 --- a/SYCL/USM/fill_aspect-fp64.cpp +++ b/SYCL/USM/fill_aspect-fp64.cpp @@ -15,7 +15,7 @@ #include "fill.hpp"; -using namespace cl::sycl; +using namespace sycl; int main() { queue q; From 4dd16ed8ce746be9a3b69310f0c2fb8aa6fc1cb9 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Wed, 17 Aug 2022 09:23:11 +0800 Subject: [PATCH 14/35] fix clang-format issue --- SYCL/Basic/buffer/buffer_aspect-fp64.cpp | 5 +-- .../api/bin_and_cmp_ops_heavy_aspect-fp64.cpp | 2 +- .../api/saturation_smoke_aspect-fp64.cpp | 3 +- SYCL/ESIMD/ext_math.hpp | 37 ++++++++----------- SYCL/ESIMD/regression/dgetrf_8x8.cpp | 7 ++-- SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp | 6 ++- SYCL/SubGroup/load_store_aspect-fp64.cpp | 3 +- SYCL/USM/copy_aspect-fp64.cpp | 3 +- SYCL/USM/fill.cpp | 1 - SYCL/USM/fill_aspect-fp64.cpp | 3 +- 10 files changed, 33 insertions(+), 37 deletions(-) diff --git a/SYCL/Basic/buffer/buffer_aspect-fp64.cpp b/SYCL/Basic/buffer/buffer_aspect-fp64.cpp index 55119db44e..aee0fa514c 100644 --- a/SYCL/Basic/buffer/buffer_aspect-fp64.cpp +++ b/SYCL/Basic/buffer/buffer_aspect-fp64.cpp @@ -45,9 +45,8 @@ template void check_set_write_back() { Queue.submit([&](sycl::handler &cgh) { auto Accessor = buf_shrd.template get_access(cgh); - cgh.parallel_for>(r, [=](sycl::id<1> WIid) { - Accessor[WIid] = write_back_result; - }); + cgh.parallel_for>( + r, [=](sycl::id<1> WIid) { Accessor[WIid] = write_back_result; }); }); } // Data is copied back for (size_t i = 0; i < size; i++) { diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp index 7c44d90eaa..3ce6e686d3 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp +++ 
b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp @@ -1,5 +1,5 @@ //==--------------- bin_un_cmp_ops_heavy_aspect-fp64.cpp - DPC++ ESIMD -//on-device test -==// +// on-device test -==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp index 1de0bd4fd5..7d1941983f 100644 --- a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp @@ -1,4 +1,5 @@ -//==---- saturation_smoke_aspect-fp64.cpp - DPC++ ESIMD on-device test -----==// +//==---- saturation_smoke_aspect-fp64.cpp - DPC++ ESIMD on-device test +//-----==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/ESIMD/ext_math.hpp b/SYCL/ESIMD/ext_math.hpp index 4a8e5926fa..3d1278a60c 100644 --- a/SYCL/ESIMD/ext_math.hpp +++ b/SYCL/ESIMD/ext_math.hpp @@ -86,16 +86,11 @@ enum class MathOp { // --- Template functions calculating given math operation on host and device -enum ArgKind { - AllVec, - AllSca, - Sca1Vec2, - Sca2Vec1 -}; +enum ArgKind { AllVec, AllSca, Sca1Vec2, Sca2Vec1 }; -template struct ESIMDf; -template struct BinESIMDf; -template struct SYCLf; +template struct ESIMDf; +template struct BinESIMDf; +template struct SYCLf; template struct HostFunc; @@ -288,8 +283,8 @@ struct BinaryDeviceFunc { template class Kernel, typename InitF = InitNarrow> -bool test(queue &Q, const std::string &Name, - InitF Init = InitNarrow{}, float delta = 0.0f) { +bool test(queue &Q, const std::string &Name, InitF Init = InitNarrow{}, + float delta = 0.0f) { constexpr size_t Size = 1024 * 128; constexpr bool IsBinOp = (Op == MathOp::div_ieee) || (Op == MathOp::pow); @@ -303,9 +298,9 @@ bool test(queue &Q, const std::string &Name, Init(A, B, Size); } const char *kind = 
- std::is_same_v, ESIMDf> - ? "ESIMD" - : "SYCL"; + std::is_same_v, ESIMDf> + ? "ESIMD" + : "SYCL"; std::cout << " " << Name << " test, kind=" << kind << "...\n"; try { @@ -324,12 +319,11 @@ bool test(queue &Q, const std::string &Name, auto PC = BufC.template get_access(CGH); if constexpr (IsBinOp) { auto PB = BufB.template get_access(CGH); - BinaryDeviceFunc F( - PA, PB, PC); + BinaryDeviceFunc F(PA, PB, + PC); CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); } else { - UnaryDeviceFunc F(PA, - PC); + UnaryDeviceFunc F(PA, PC); CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); } }); @@ -418,10 +412,9 @@ template bool testESIMDDivIEEE(queue &Q) { template bool testESIMDPow(queue &Q) { bool Pass = true; - std::cout << "--- TESTING ESIMD pow, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - Pass &= test( - Q, "pow", InitBin{}, 0.1); + std::cout << "--- TESTING ESIMD pow, T=" << typeid(T).name() << ", N = " << N + << "...\n"; + Pass &= test(Q, "pow", InitBin{}, 0.1); return Pass; } diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index 5ef53b70ac..e4c2e04514 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -209,10 +209,9 @@ static float fp_norm1(int64_t m, int64_t n, float *a, int64_t lda) { } static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, float *a_in, - float *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info) { + float *a, int64_t lda, int64_t stride_a, + int64_t *ipiv, int64_t stride_ipiv, + int64_t batch, int64_t *info) { float thresh = 30.0; int fail = 0; int64_t i, j, k, l; diff --git a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp index 97e0e05897..a9dd10debd 100644 --- a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp +++ b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp @@ -4,7 +4,8 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: 
%ACC_RUN_PLACEHOLDER %t.out // -//==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- C++ -*--==// +//==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- +//C++ -*--==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -18,7 +19,8 @@ using namespace sycl; int main() { queue Queue; - if (Queue.get_device().is_host() or !Queue.get_device().has(sycl::aspect::fp64)) { + if (Queue.get_device().is_host() or + !Queue.get_device().has(sycl::aspect::fp64)) { std::cout << "Skipping test\n"; return 0; } diff --git a/SYCL/SubGroup/load_store_aspect-fp64.cpp b/SYCL/SubGroup/load_store_aspect-fp64.cpp index d117d2ebae..329515a4d1 100644 --- a/SYCL/SubGroup/load_store_aspect-fp64.cpp +++ b/SYCL/SubGroup/load_store_aspect-fp64.cpp @@ -22,7 +22,8 @@ using namespace sycl; int main() { queue Queue; - if (Queue.get_device().is_host() or !Queue.get_device().has(sycl::aspect::fp64)) { + if (Queue.get_device().is_host() or + !Queue.get_device().has(sycl::aspect::fp64)) { std::cout << "Skipping test\n"; return 0; } diff --git a/SYCL/USM/copy_aspect-fp64.cpp b/SYCL/USM/copy_aspect-fp64.cpp index 46f9765ab8..e1aea79f2f 100644 --- a/SYCL/USM/copy_aspect-fp64.cpp +++ b/SYCL/USM/copy_aspect-fp64.cpp @@ -1,4 +1,5 @@ -//==---- copy_aspect-fp64.cp - USM copy test ------------------------------------------==// +//==---- copy_aspect-fp64.cp - USM copy test +//------------------------------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/SYCL/USM/fill.cpp b/SYCL/USM/fill.cpp index 2ef72638d1..3a4d5a8fc7 100644 --- a/SYCL/USM/fill.cpp +++ b/SYCL/USM/fill.cpp @@ -30,7 +30,6 @@ bool operator==(const test_struct &lhs, const test_struct &rhs) { lhs.e == rhs.e && lhs.f == rhs.f; } - int main() { queue q; auto dev = q.get_device(); diff --git a/SYCL/USM/fill_aspect-fp64.cpp b/SYCL/USM/fill_aspect-fp64.cpp index 49d17c2d02..acc00c06fa 100644 --- a/SYCL/USM/fill_aspect-fp64.cpp +++ b/SYCL/USM/fill_aspect-fp64.cpp @@ -1,4 +1,5 @@ -//==---- fill_aspect-fp64.cpp - USM fill test for double type ---------------==// +//==---- fill_aspect-fp64.cpp - USM fill test for double type +//---------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. From 1127547fc7e1df93075a26c9ac77ef96e33dfcaa Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Wed, 17 Aug 2022 09:48:13 +0800 Subject: [PATCH 15/35] fix clang-format issue 2 --- SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp index a9dd10debd..6bfac67d4a 100644 --- a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp +++ b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp @@ -5,7 +5,7 @@ // RUN: %ACC_RUN_PLACEHOLDER %t.out // //==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- -//C++ -*--==// +// C++ -*--==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
From bb846ff662c40fa168594ce9c81721f16ab80894 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Wed, 17 Aug 2022 15:01:16 +0800 Subject: [PATCH 16/35] fix 3-way merge conflicts in union_kernel_param.cpp --- SYCL/KernelParams/union_kernel_param.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCL/KernelParams/union_kernel_param.cpp b/SYCL/KernelParams/union_kernel_param.cpp index c3c98dcbb3..2e5095bcc8 100644 --- a/SYCL/KernelParams/union_kernel_param.cpp +++ b/SYCL/KernelParams/union_kernel_param.cpp @@ -25,7 +25,7 @@ int main(int argc, char **argv) { sycl::queue queue; { - sycl::buffer buf(&mydouble, 1); + sycl::buffer buf(&myfloat, 1); queue.submit([&](sycl::handler &cgh) { auto acc = buf.get_access(cgh); cgh.single_task([=]() { acc[0] = x.myfloat; }); From e6dce8dade0f40a49653c741263753f736ce3c74 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 23 Aug 2022 14:54:47 +0800 Subject: [PATCH 17/35] To avoid the large maintainability burden that splitting tests into separate files would add, use the `-DENABLE_FP64` macro to separate 'double'-type code, as follows: ``` // test.cpp // RUN: ... -DENABLE_FP64=false constexpr bool EnableFP64 = ENABLE_FP64; void test() { // non-fp64-case do_smth(); if constexpr (EnableFP64) do_smth64(); } // test-fp64.cpp ; RUN: ... 
-DENABLE_FP64=true \#include ``` --- SYCL/AtomicRef/assignment_atomic64.cpp | 3 + .../assignment_atomic64_aspect-fp64.cpp | 23 +- .../AtomicRef/assignment_atomic64_generic.cpp | 5 +- ...ssignment_atomic64_generic_aspect-fp64.cpp | 23 +- SYCL/Basic/buffer/buffer.cpp | 40 +- SYCL/Basic/buffer/buffer_aspect-fp64.cpp | 56 +- .../specialization_constants.cpp | 29 +- .../specialization_constants_aspect-fp64.cpp | 81 +-- .../specialization_constants_override.cpp | 32 +- ...ization_constants_override_aspect-fp64.cpp | 89 +--- SYCL/DeviceLib/built-ins/nan.cpp | 14 +- SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp | 66 +-- SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp | 234 ++++++++- SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp | 227 -------- .../api/bin_and_cmp_ops_heavy_aspect-fp64.cpp | 34 +- SYCL/ESIMD/api/saturation_smoke.cpp | 166 +++++- SYCL/ESIMD/api/saturation_smoke.hpp | 162 ------ .../api/saturation_smoke_aspect-fp64.cpp | 29 +- SYCL/ESIMD/api/simd_view_select_2d_fp.cpp | 3 + .../simd_view_select_2d_fp_aspect-fp64.cpp | 22 +- SYCL/ESIMD/api/unary_ops_heavy.cpp | 130 ++++- SYCL/ESIMD/api/unary_ops_heavy.hpp | 126 ----- .../ESIMD/api/unary_ops_heavy_aspect-fp64.cpp | 31 +- SYCL/ESIMD/ext_math.cpp | 445 +++++++++++++++- SYCL/ESIMD/ext_math.hpp | 434 ---------------- SYCL/ESIMD/ext_math_aspect-fp64.cpp | 29 +- SYCL/ESIMD/regression/Inputs/dgetrf.hpp | 83 +-- .../regression/Inputs/dgetrf_aspect-fp64.hpp | 487 ------------------ SYCL/ESIMD/regression/dgetrf_8x8.cpp | 76 +-- .../regression/dgetrf_8x8_aspect-fp64.cpp | 303 +---------- SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp | 4 +- .../regression/dgetrf_ref_aspect-fp64.cpp | 2 +- SYCL/GroupAlgorithm/SYCL2020/sort.cpp | 345 ++++++++++++- SYCL/GroupAlgorithm/SYCL2020/sort.hpp | 342 ------------ .../SYCL2020/sort_aspect-fp64.cpp | 35 +- SYCL/InlineAsm/asm_float_add.cpp | 67 --- SYCL/InlineAsm/asm_float_imm_arg.cpp | 60 --- SYCL/KernelParams/union_kernel_param.cpp | 22 +- .../union_kernel_param_aspect-fp64.cpp | 41 +- 
.../commandlist/Inputs/FindPrimesSYCL.cpp | 112 ---- SYCL/SpecConstants/2020/handler-api.cpp | 57 +- .../2020/handler-api_aspect-fp64.cpp | 118 +---- SYCL/SpecConstants/2020/kernel-bundle-api.cpp | 71 ++- .../2020/kernel-bundle-api_aspect-fp64.cpp | 167 +----- SYCL/SubGroup/barrier.cpp | 70 ++- SYCL/SubGroup/barrier.hpp | 63 --- SYCL/SubGroup/barrier_aspect-fp64.cpp | 18 +- SYCL/SubGroup/generic-shuffle.cpp | 207 +++++++- SYCL/SubGroup/generic-shuffle.hpp | 200 ------- SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp | 27 +- SYCL/SubGroup/info.cpp | 24 +- SYCL/SubGroup/load_store.cpp | 180 ++++++- SYCL/SubGroup/load_store.hpp | 171 ------ SYCL/SubGroup/load_store_aspect-fp64.cpp | 32 +- SYCL/USM/copy.cpp | 84 ++- SYCL/USM/copy.hpp | 56 -- SYCL/USM/copy_aspect-fp64.cpp | 50 +- SYCL/USM/fill.cpp | 112 +++- SYCL/USM/fill.hpp | 92 ---- SYCL/USM/fill_aspect-fp64.cpp | 32 +- 60 files changed, 2372 insertions(+), 3971 deletions(-) delete mode 100644 SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp delete mode 100644 SYCL/ESIMD/api/saturation_smoke.hpp delete mode 100644 SYCL/ESIMD/api/unary_ops_heavy.hpp delete mode 100644 SYCL/ESIMD/ext_math.hpp delete mode 100644 SYCL/ESIMD/regression/Inputs/dgetrf_aspect-fp64.hpp delete mode 100644 SYCL/GroupAlgorithm/SYCL2020/sort.hpp delete mode 100644 SYCL/InlineAsm/asm_float_add.cpp delete mode 100644 SYCL/InlineAsm/asm_float_imm_arg.cpp delete mode 100644 SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp delete mode 100644 SYCL/SubGroup/barrier.hpp delete mode 100644 SYCL/SubGroup/generic-shuffle.hpp delete mode 100644 SYCL/SubGroup/load_store.hpp delete mode 100644 SYCL/USM/copy.hpp delete mode 100644 SYCL/USM/fill.hpp diff --git a/SYCL/AtomicRef/assignment_atomic64.cpp b/SYCL/AtomicRef/assignment_atomic64.cpp index 8f0b709653..bc31d0663d 100644 --- a/SYCL/AtomicRef/assignment_atomic64.cpp +++ b/SYCL/AtomicRef/assignment_atomic64.cpp @@ -17,6 +17,9 @@ int main() { } constexpr int N = 32; +#ifdef ENABLE_FP64 + assignment_test(q, N); 
+#endif // Include long tests if they are 64 bits wide if constexpr (sizeof(long) == 8) { diff --git a/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp b/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp index d8bf53da48..a46953ecd2 100644 --- a/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp +++ b/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp @@ -1,5 +1,5 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -8,23 +8,4 @@ // XFAIL: hip // Expected failure because hip does not have atomic64 check implementation -#include "assignment.h" -#include -using namespace sycl; - -int main() { - queue q; - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - assignment_test(q, N); - - std::cout << "Test passed." << std::endl; -} +#include "assignment_atomic64.cpp" diff --git a/SYCL/AtomicRef/assignment_atomic64_generic.cpp b/SYCL/AtomicRef/assignment_atomic64_generic.cpp index cd0a9d3ea8..d3eaab3d2a 100644 --- a/SYCL/AtomicRef/assignment_atomic64_generic.cpp +++ b/SYCL/AtomicRef/assignment_atomic64_generic.cpp @@ -20,7 +20,9 @@ int main() { } constexpr int N = 32; - +#ifdef ENABLE_FP64 + assignment_generic_test(q, N); +#endif // Include long tests if they are 64 bits wide if constexpr (sizeof(long) == 8) { assignment_generic_test(q, N); @@ -37,6 +39,5 @@ int main() { if constexpr (sizeof(char *) == 8) { assignment_generic_test(q, N); } - std::cout << "Test passed." 
<< std::endl; } diff --git a/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp b/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp index 674211a754..9e61572d0e 100644 --- a/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp +++ b/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp @@ -1,5 +1,5 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -8,23 +8,4 @@ // CUDA backend has had no support for the generic address space yet // XFAIL: cuda || hip -#include "assignment.h" -#include -using namespace sycl; - -int main() { - queue q; - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - if (!q.get_device().has(aspect::atomic64)) { - std::cout << "Skipping test\n"; - return 0; - } - - constexpr int N = 32; - assignment_generic_test(q, N); - - std::cout << "Test passed." 
<< std::endl; -} +#include "assignment_atomic64_generic.cpp" diff --git a/SYCL/Basic/buffer/buffer.cpp b/SYCL/Basic/buffer/buffer.cpp index ceedfc754a..6814b10a06 100644 --- a/SYCL/Basic/buffer/buffer.cpp +++ b/SYCL/Basic/buffer/buffer.cpp @@ -25,6 +25,7 @@ int main() { int data = 5; bool failed = false; buffer buf(&data, range<1>(1)); + { int data1[10] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; { @@ -508,16 +509,26 @@ int main() { size_t size = 32; const size_t dims = 1; sycl::range r(size); - std::shared_ptr bool_shrd(new bool[size], [](bool *data) { delete[] data; }); std::shared_ptr int_shrd(new int[size], [](int *data) { delete[] data; }); +#ifdef ENABLE_FP64 + std::shared_ptr double_shrd(new double[size], + [](double *data) { delete[] data; }); +#endif std::vector bool_vector; std::vector int_vector; +#ifdef ENABLE_FP64 + std::vector double_vector; +#endif + bool_vector.reserve(size); int_vector.reserve(size); +#ifdef ENABLE_FP64 + double_vector.reserve(size); +#endif sycl::queue Queue; std::mutex m; @@ -528,30 +539,55 @@ int main() { sycl::buffer buf_int_shrd( int_shrd, r, sycl::property_list{sycl::property::buffer::use_mutex(m)}); +#ifdef ENABLE_FP64 + sycl::buffer buf_double_shrd( + double_shrd, r, + sycl::property_list{sycl::property::buffer::use_mutex(m)}); +#endif m.lock(); std::fill(bool_shrd.get(), (bool_shrd.get() + size), bool()); std::fill(int_shrd.get(), (int_shrd.get() + size), int()); +#ifdef ENABLE_FP64 + std::fill(double_shrd.get(), (double_shrd.get() + size), double()); +#endif m.unlock(); - buf_bool_shrd.set_final_data(bool_vector.begin()); buf_int_shrd.set_final_data(int_vector.begin()); +#ifdef ENABLE_FP64 + buf_double_shrd.set_final_data(double_vector.begin()); +#endif + buf_bool_shrd.set_write_back(true); buf_int_shrd.set_write_back(true); +#ifdef ENABLE_FP64 + buf_double_shrd.set_write_back(true); +#endif Queue.submit([&](sycl::handler &cgh) { auto Accessor_bool = buf_bool_shrd.get_access(cgh); auto Accessor_int = 
buf_int_shrd.get_access(cgh); +#ifdef ENABLE_FP64 + auto Accessor_double = + buf_double_shrd.get_access(cgh); +#endif cgh.parallel_for(r, [=](sycl::id<1> WIid) { Accessor_bool[WIid] = true; Accessor_int[WIid] = 3; +#ifdef ENABLE_FP64 + Accessor_double[WIid] = 7.5; +#endif }); }); } // Data is copied back for (size_t i = 0; i < size; i++) { if (bool_vector[i] != true || int_vector[i] != 3) { +#ifdef ENABLE_FP64 + if (bool_vector[i] != true || int_vector[i] != 3 || + double_vector[i] != 7.5) { +#endif assert(false && "Data was not copied back"); return 1; } diff --git a/SYCL/Basic/buffer/buffer_aspect-fp64.cpp b/SYCL/Basic/buffer/buffer_aspect-fp64.cpp index aee0fa514c..d26a0d4de6 100644 --- a/SYCL/Basic/buffer/buffer_aspect-fp64.cpp +++ b/SYCL/Basic/buffer/buffer_aspect-fp64.cpp @@ -1,5 +1,5 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx %cxx_std_optionc++17 %s -o %t1.out %sycl_options +// RUN: %clangxx %cxx_std_optionc++17 -DENABLE_FP64 %s -o %t1.out %sycl_options // RUN: %HOST_RUN_PLACEHOLDER %t1.out // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t2.out // RUN: %HOST_RUN_PLACEHOLDER %t2.out @@ -15,56 +15,4 @@ // //===----------------------------------------------------------------------===// -#include - -#include -#include - -using namespace sycl; - -template constexpr T write_back_result = T(3); -template <> constexpr double write_back_result = double(7.5); -template class fill_buffer_for_write_back {}; - -template void check_set_write_back() { - size_t size = 32; - sycl::range r(size); - std::shared_ptr shrd(new T[size], [](T *data) { delete[] data; }); - std::vector vector; - vector.reserve(size); - sycl::queue Queue; - std::mutex m; - { - sycl::buffer buf_shrd( - shrd, r, sycl::property_list{sycl::property::buffer::use_mutex(m)}); - m.lock(); - std::fill(shrd.get(), (shrd.get() + size), T()); - m.unlock(); - buf_shrd.set_final_data(vector.begin()); - buf_shrd.set_write_back(true); - Queue.submit([&](sycl::handler &cgh) { - auto Accessor = - 
buf_shrd.template get_access(cgh); - cgh.parallel_for>( - r, [=](sycl::id<1> WIid) { Accessor[WIid] = write_back_result; }); - }); - } // Data is copied back - for (size_t i = 0; i < size; i++) { - if (vector[i] != write_back_result) { - assert(false && "Data was not copied back"); - } - } -} - -int main() { - // Check that data is copied back after forcing write-back using - // set_write_back - queue q; - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - - check_set_write_back(); - return 0; -} \ No newline at end of file +#include "buffer.cpp" diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants.cpp index 6a968e33d9..d1e2de7b03 100644 --- a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants.cpp +++ b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants.cpp @@ -23,7 +23,6 @@ #define HALF 0 // FIXME Spec constants do not support half type yet class SpecializedKernel; - class MyBoolConst; class MyInt8Const; class MyUInt8Const; @@ -35,6 +34,9 @@ class MyInt64Const; class MyUInt64Const; class MyHalfConst; class MyFloatConst; +#ifdef ENABLE_FP64 +class MyDoubleConst; +#endif using namespace sycl; @@ -53,6 +55,9 @@ int64_t int64_ref = rnd() % std::numeric_limits::max(); uint64_t uint64_ref = rnd() % std::numeric_limits::max(); half half_ref = rnd() % std::numeric_limits::max(); float float_ref = rnd() % std::numeric_limits::max(); +#ifdef ENABLE_FP64 +double double_ref = rnd() % std::numeric_limits::max(); +#endif template bool check(const T1 &test, const T2 &ref, std::string type) { @@ -107,9 +112,11 @@ int main(int argc, char **argv) { #endif ext::oneapi::experimental::spec_constant f32 = prog.set_spec_constant(float_ref); - +#ifdef ENABLE_FP64 + ext::oneapi::experimental::spec_constant f64 = + prog.set_spec_constant(double_ref); +#endif prog.build_with_kernel_type(); - bool bool_test = 0; int8_t int8_test 
= 0; uint8_t uint8_test = 0; @@ -121,6 +128,9 @@ int main(int argc, char **argv) { uint64_t uint64_test = 0; half half_test = 0; float float_test = 0; +#ifdef ENABLE_FP64 + double double_test = 0; +#endif { buffer bool_buf(&bool_test, 1); @@ -134,6 +144,9 @@ int main(int argc, char **argv) { buffer uint64_buf(&uint64_test, 1); buffer half_buf(&half_test, 1); buffer float_buf(&float_test, 1); +#ifdef ENABLE_FP64 + buffer double_buf(&double_test, 1); +#endif q.submit([&](handler &cgh) { auto bool_acc = bool_buf.get_access(cgh); @@ -147,6 +160,9 @@ int main(int argc, char **argv) { auto uint64_acc = uint64_buf.get_access(cgh); auto half_acc = half_buf.get_access(cgh); auto float_acc = float_buf.get_access(cgh); +#ifdef ENABLE_FP64 + auto double_acc = double_buf.get_access(cgh); +#endif cgh.single_task(prog.get_kernel(), [=]() { bool_acc[0] = i1.get(); @@ -162,6 +178,9 @@ int main(int argc, char **argv) { half_acc[0] = f16.get(); #endif float_acc[0] = f32.get(); +#ifdef ENABLE_FP64 + double_acc[0] = f64.get(); +#endif }); }); } @@ -189,6 +208,10 @@ int main(int argc, char **argv) { #endif if (!check(float_test, float_ref, "float")) return 1; +#ifdef ENABLE_FP64 + if (!check(double_test, double_ref, "double")) + return 1; +#endif } catch (const exception &e) { std::cout << "an async SYCL exception was caught: " << std::string(e.what()); diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp index 8793abebcf..2cd47178f9 100644 --- a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp +++ b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp @@ -1,5 +1,5 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -D__SYCL_INTERNAL_API %s -o %t.out +// RUN: %clangxx -fsycl -D__SYCL_INTERNAL_API -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: 
%GPU_RUN_PLACEHOLDER %t.out @@ -15,81 +15,4 @@ //===----------------------------------------------------------------------===// // Basic checks for some primitive types -#include -#include -#include -#include - -class SpecializedKernel; - -class MyDoubleConst; - -using namespace sycl; - -unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); -std::mt19937_64 rnd(seed); - -// Fetch a value at runtime. -double double_ref = rnd() % std::numeric_limits::max(); - -template -bool check(const T1 &test, const T2 &ref, std::string type) { - - if (test != ref) { - std::cout << "Test != Reference: " << std::to_string(test) - << " != " << std::to_string(ref) << " for type: " << type << "\n"; - return false; - } - return true; -} - -int main(int argc, char **argv) { - queue q; - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - - std::cout << "check specialization constants API. (seed =" << seed << "\n"; - - auto exception_handler = [&](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } catch (sycl::exception const &e) { - std::cout << "an async SYCL exception was caught: " - << std::string(e.what()); - } - } - }; - try { - auto q = queue(exception_handler); - program prog(q.get_context()); - - // Create specialization constants. 
- ext::oneapi::experimental::spec_constant f64 = - prog.set_spec_constant(double_ref); - - prog.build_with_kernel_type(); - - double double_test = 0; - - { - buffer double_buf(&double_test, 1); - - q.submit([&](handler &cgh) { - auto double_acc = double_buf.get_access(cgh); - cgh.single_task( - prog.get_kernel(), - [=]() { double_acc[0] = f64.get(); }); - }); - } - if (!check(double_test, double_ref, "double")) - return 1; - } catch (const exception &e) { - std::cout << "an async SYCL exception was caught: " - << std::string(e.what()); - return 1; - } - return 0; -} +#include "specialization_constants.cpp" diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override.cpp index 921e474362..8db7b1cefe 100644 --- a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override.cpp +++ b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override.cpp @@ -24,19 +24,27 @@ class SpecializedKernelOverride; class MyBoolConstOverride; class MyUInt32ConstOverride; +#ifdef ENABLE_FP64 +class MyDoubleConstOverride; +#endif using namespace sycl; unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); std::mt19937_64 rnd(seed); - bool bool_ref = true; bool bool_ref_override = false; // Fetch a value at runtime. 
uint32_t uint32_ref = rnd() % std::numeric_limits::max(); +#ifdef ENABLE_FP64 +double double_ref = rnd() % std::numeric_limits::max(); +#endif // Values which override the previous ones uint32_t uint32_ref_override = rnd() % std::numeric_limits::max(); +#ifdef ENABLE_FP64 +double double_ref_override = rnd() % std::numeric_limits::max(); +#endif template bool check(const T1 &test, const T2 &ref, std::string type) { @@ -73,32 +81,54 @@ int main(int argc, char **argv) { prog.set_spec_constant(bool_ref); ext::oneapi::experimental::spec_constant ui32 = prog.set_spec_constant(uint32_ref); +#ifdef ENABLE_FP64 + ext::oneapi::experimental::spec_constant + f64 = prog.set_spec_constant(double_ref); +#endif // Override specialization constants. i1 = prog.set_spec_constant(bool_ref_override); ui32 = prog.set_spec_constant(uint32_ref_override); +#ifdef ENABLE_FP64 + f64 = prog.set_spec_constant(double_ref_override); +#endif prog.build_with_kernel_type(); bool bool_test = true; uint32_t uint32_test = 0; +#ifdef ENABLE_FP64 + double double_test = 0; +#endif { buffer bool_buf(&bool_test, 1); buffer uint32_buf(&uint32_test, 1); +#ifdef ENABLE_FP64 + buffer double_buf(&double_test, 1); +#endif q.submit([&](handler &cgh) { auto bool_acc = bool_buf.get_access(cgh); auto uint32_acc = uint32_buf.get_access(cgh); +#ifdef ENABLE_FP64 + auto double_acc = double_buf.get_access(cgh); +#endif cgh.single_task( prog.get_kernel(), [=]() { bool_acc[0] = i1.get(); uint32_acc[0] = ui32.get(); +#ifdef ENABLE_FP64 + double_acc[0] = f64.get(); +#endif }); }); } check(bool_test, bool_ref_override, "bool"); check(uint32_test, uint32_ref_override, "uint32"); +#ifdef ENABLE_FP64 + check(double_test, double_ref_override, "double"); +#endif } catch (const exception &e) { std::cout << "an async SYCL exception was caught: " diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp 
b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp index d409d04e4d..179b27abec 100644 --- a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp +++ b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp @@ -1,5 +1,5 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -D__SYCL_INTERNAL_API %s -o %t.out +// RUN: %clangxx -fsycl -D__SYCL_INTERNAL_API -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -15,89 +15,4 @@ //===----------------------------------------------------------------------===// // Checks that set_spec_constant can be used twice on the same program -#include -#include -#include -#include - -class SpecializedKernelOverride; - -class MyDoubleConstOverride; - -using namespace sycl; - -unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); -std::mt19937_64 rnd(seed); - -// Fetch a value at runtime. -double double_ref = rnd() % std::numeric_limits::max(); - -// Values which override the previous ones -double double_ref_override = rnd() % std::numeric_limits::max(); - -template -bool check(const T1 &test, const T2 &ref, std::string type) { - - if (test != ref) { - std::cout << "Test != Reference: " << std::to_string(test) - << " != " << std::to_string(ref) << " for type: " << type << "\n"; - return false; - } - return true; -} - -int main(int argc, char **argv) { - queue q; - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - - std::cout << "check specialization constants overriding. 
(seed =" << seed - << "\n"; - - auto exception_handler = [&](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } catch (sycl::exception const &e) { - std::cout << "an async SYCL exception was caught: " - << std::string(e.what()); - } - } - }; - - try { - auto q = queue(exception_handler); - program prog(q.get_context()); - - // Create specialization constants. - ext::oneapi::experimental::spec_constant - f64 = prog.set_spec_constant(double_ref); - - // Override specialization constants. - f64 = prog.set_spec_constant(double_ref_override); - - prog.build_with_kernel_type(); - - double double_test = 0; - - { - buffer double_buf(&double_test, 1); - - q.submit([&](handler &cgh) { - auto double_acc = double_buf.get_access(cgh); - cgh.single_task( - prog.get_kernel(), - [=]() { double_acc[0] = f64.get(); }); - }); - } - check(double_test, double_ref_override, "double"); - - } catch (const exception &e) { - std::cout << "an async SYCL exception was caught: " - << std::string(e.what()); - return 1; - } - return 0; -} +#include "specialization_constants_override.cpp" diff --git a/SYCL/DeviceLib/built-ins/nan.cpp b/SYCL/DeviceLib/built-ins/nan.cpp index d94af0d770..c9e99168f4 100644 --- a/SYCL/DeviceLib/built-ins/nan.cpp +++ b/SYCL/DeviceLib/built-ins/nan.cpp @@ -44,7 +44,12 @@ int main() { test_nan_call(); test_nan_call(); test_nan_call(); - +#ifdef ENABLE_FP64 + test_nan_call(); + test_nan_call(); + test_nan_call(); + test_nan_call(); +#endif s::queue Queue([](sycl::exception_list ExceptionList) { for (std::exception_ptr ExceptionPtr : ExceptionList) { try { @@ -60,6 +65,13 @@ int main() { if (Queue.get_device().has(sycl::aspect::fp16)) check_nan(Queue); #endif + check_nan(Queue); +#ifdef ENABLE_FP64 + if (Queue.get_device().has(sycl::aspect::fp64)) { + check_nan(Queue); + check_nan(Queue); + } +#endif return 0; } diff --git a/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp 
b/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp index 1a568e245d..357e52f864 100644 --- a/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp +++ b/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp @@ -1,70 +1,8 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -D HALF_IS_SUPPORTED %s -o %t_gpu.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include - -#include - -namespace s = cl::sycl; -using namespace std; - -template void test_nan_call() { - static_assert(is_same::value == Expected, ""); -} - -template struct test; - -template void check_nan(s::queue &Queue) { - R Data{0}; - s::vec VData{0}; - { - s::buffer Buf(&Data, s::range<1>(1)); - s::buffer, 1> VBuf(&VData, s::range<1>(1)); - Queue.submit([&](s::handler &Cgh) { - auto Acc = Buf.template get_access(Cgh); - auto VAcc = VBuf.template get_access(Cgh); - Cgh.single_task>([=]() { - Acc[0] = s::nan(T{0}); - VAcc[0] = s::nan(s::vec{0}); - }); - }); - Queue.wait_and_throw(); - } - assert(s::isnan(Data)); - assert(s::all(s::isnan(VData))); -} - -int main() { - queue q; - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - - test_nan_call(); - test_nan_call(); - test_nan_call(); - test_nan_call(); - - s::queue Queue([](cl::sycl::exception_list ExceptionList) { - for (std::exception_ptr ExceptionPtr : ExceptionList) { - try { - std::rethrow_exception(ExceptionPtr); - } catch (cl::sycl::exception &E) { - std::cerr << E.what() << std::endl; - } catch (...) { - std::cerr << "Unknown async exception was caught." 
<< std::endl; - } - } - }); - - check_nan(Queue); - check_nan(Queue); - - return 0; -} +#include "nan.cpp" diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp index c735f071f2..c141d071ea 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp @@ -22,11 +22,235 @@ // larger than certain threshold. Might need to tune the cr0 once this feature // is available in ESIMD. // -#include "bin_and_cmp_ops_heavy.hpp" + +#include "../esimd_test_utils.hpp" + +#include +#include +#include using namespace sycl; using namespace sycl::ext::intel::esimd; +template class TestID; + +// Result type of a scalar binary Op +template +using scalar_comp_t = + std::conditional_t, + typename simd_mask<8>::element_type, + __ESIMD_DNS::computation_type_t>; + +// Result type of a vector binary Op +template +using comp_t = std::conditional_t< + N == 0, scalar_comp_t, + std::conditional_t, simd_mask, + simd<__ESIMD_DNS::computation_type_t, N>>>; + +// Helpers for printing +template auto cast(T val) { return val; } +template <> auto cast(char val) { return (int)val; } +template <> auto cast(unsigned char val) { + return (unsigned int)val; +} +#ifdef __SYCL_DEVICE_ONLY__ +template <> auto cast<_Float16>(_Float16 val) { return (float)val; } +#endif + +// Main test function. +// T1, T2 - operand types, +// VL - vector length, +// OpClass - binary or comparison operations, +// VerifyF and InitF - verification and initialization function types +// (instantiated within the test function), +// Ops - a compile-time sequence of operations to test. 
+// +template class VerifyF, + template class InitF, class Ops> +bool test(Ops ops, queue &q, comp_t epsilon = 0) { + // Log test case info + std::cout << "Testing T1=" << typeid(T1).name() << " T2=" << typeid(T2).name() + << ", VL=" << VL << " ...\n"; + std::cout << "Operations:"; + esimd_test::iterate_ops(ops, [=](OpClass op) { + std::cout << " '" << esimd_test::Op2Str(op) << "'"; + }); + std::cout << "\n"; + + // initialize test data + constexpr int Size = 1024 * 7; + T1 *A = sycl::malloc_shared(Size, q); + T2 *B = sycl::malloc_shared(Size, q); + constexpr int NumOps = (int)Ops::size; + int CSize = NumOps * Size; + using T = comp_t; + // Result array. For each pair of A[i] and B[i] elements it reserves NumOps + // elements to store result of all operations under test applied to the A[i] + // and B[i] + T *C = sycl::malloc_shared(CSize, q); + InitF init; + + for (int i = 0; i < Size; ++i) { + init(A, B, C, i); + } + + // submit the kernel + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for>( + Size / VL, [=](id<1> i) SYCL_ESIMD_KERNEL { + unsigned off = i * VL; + simd va(A + off, vector_aligned_tag{}); + simd vb(B + off, vector_aligned_tag{}); + + // applies each of the input operations to the va and vb vectors, + // then invokes the lambda below, passing the result of the + // operation, its ID and sequential number within the input sequence + esimd_test::apply_ops( + ops, va, vb, + [=](comp_t res, OpClass op, + unsigned op_num) { + unsigned res_off = off * NumOps + op_num * VL; + res.copy_to(C + res_off, vector_aligned_tag{}); + }); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(A, q); + sycl::free(B, q); + sycl::free(C, q); + return false; + } + + int err_cnt = 0; + + // now verify the results using provided verification function type + for (unsigned i = 0; i < Size / VL; ++i) { + unsigned off = i * VL; + + for (int j = 0; j < VL; ++j) { + T1 a = A[off + 
j]; + T2 b = B[off + j]; + + esimd_test::apply_ops( + ops, a, b, [&](T Gold, OpClass op, unsigned op_num) { + unsigned res_off = off * NumOps + op_num * VL; + T Res = C[res_off + j]; + using Tint = esimd_test::int_type_t; + Tint ResBits = *(Tint *)&Res; + Tint GoldBits = *(Tint *)&Gold; + VerifyF verify_f(epsilon); + + if (!verify_f(Gold, Res, op)) { + if (++err_cnt < 10) { + std::cout << " failed at index " << (res_off + j) << ", op " + << esimd_test::Op2Str(op) << ": " << cast(Res) + << "(0x" << std::hex << ResBits << ")" + << " != " << std::dec << cast(Gold) << "(0x" + << std::hex << GoldBits << ") [" << std::dec + << cast(a) << " " << esimd_test::Op2Str(op) << " " + << cast(b) << "]\n"; + } + } + }); + } + } + if (err_cnt > 0) { + auto Size1 = NumOps * Size; + std::cout << " pass rate: " + << ((float)(Size1 - err_cnt) / (float)Size1) * 100.0f << "% (" + << (Size1 - err_cnt) << "/" << Size1 << ")\n"; + } + + free(A, q); + free(B, q); + free(C, q); + std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n"); + return err_cnt == 0; +} + +// Flavours of verification function types. + +template struct verify_strict { + using T = comp_t; + + verify_strict(T) {} + + bool operator()(T res, T gold, OpClass op) { return res == gold; } +}; + +#define EQ(x, y, epsilon) \ + ((x) > (y) ? (x) - (y) <= epsilon : (y) - (x) <= epsilon) + +template struct verify_epsilon { + using T = comp_t; + T epsilon; + verify_epsilon(T epsilon) : epsilon(epsilon) {} + + bool operator()(T res, T gold, OpClass op) { + if constexpr (std::is_same_v) { + if (op == esimd_test::BinaryOp::div) { + return EQ(res, gold, epsilon); + } + } + return res == gold; + } +}; + +template struct verify_n { + using T = comp_t; + int n; + verify_n(int n) : n(n) {} + + bool operator()(T res, T gold, OpClass op) { + using Tint = esimd_test::int_type_t; + Tint res_bits = *(Tint *)&res; + Tint gold_bits = *(Tint *)&gold; + return (abs(gold_bits - res_bits) > n) ? 
false : true; + } +}; + +// Flavours of initialization function types. + +template struct init_default { + using T = comp_t; + + void operator()(T1 *A, T2 *B, T *C, int i) { + A[i] = (i % 3) * 90 + 10; /*10, 100, 190, 10, ...*/ + if constexpr (std::is_unsigned_v) { + B[i] = (i % 3) * 99 + 1 /*1, 100, 199, 1, ...*/; + } else { + B[i] = (i % 4) * 180 - 170; /*-170, 10, 190, 370, -170,...*/ + } + C[i] = 0; + } +}; + +template struct init_for_shift { + using T = comp_t; + + void operator()(T1 *A, T2 *B, T *C, int i) { + if constexpr (std::is_unsigned_v) { + A[i] = (i % 3) + 100; /*100, 101, 102, 100, ...*/ + } else { + A[i] = (i % 4) * 100 - 150; /*-150, -50, 50, 150, -150, ...*/ + } + B[i] = (i % 3); + C[i] = 0; + } +}; + +// shortcuts for less clutter +template using VSf = verify_strict; +template using VEf = verify_epsilon; +template using VNf = verify_n; +template using IDf = init_default; +template using ISf = init_for_shift; + int main(void) { queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); @@ -42,6 +266,10 @@ int main(void) { passed &= test(arith_ops, q, 1); passed &= test(arith_ops, q, 1); passed &= test(arith_ops, q); +#ifdef ENABLE_FP64 + passed &= test(arith_ops, q); + passed &= test(arith_ops, q); +#endif auto int_ops = esimd_test::IntBinaryOpsNoShift; // different data needed for shift @@ -68,6 +296,10 @@ int main(void) { passed &= test(cmp_ops, q, 1); passed &= test(cmp_ops, q, 1); passed &= test(cmp_ops, q); +#ifdef ENABLE_FP64 + passed &= test(cmp_ops, q); + passed &= test(cmp_ops, q); +#endif std::cout << (passed ? "Test PASSED\n" : "Test FAILED\n"); return passed ? 
0 : 1; diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp deleted file mode 100644 index 5ce313d9e4..0000000000 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.hpp +++ /dev/null @@ -1,227 +0,0 @@ -#include "../esimd_test_utils.hpp" - -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::intel::esimd; - -template class TestID; - -// Result type of a scalar binary Op -template -using scalar_comp_t = - std::conditional_t, - typename simd_mask<8>::element_type, - __ESIMD_DNS::computation_type_t>; - -// Result type of a vector binary Op -template -using comp_t = std::conditional_t< - N == 0, scalar_comp_t, - std::conditional_t, simd_mask, - simd<__ESIMD_DNS::computation_type_t, N>>>; - -// Helpers for printing -template auto cast(T val) { return val; } -template <> auto cast(char val) { return (int)val; } -template <> auto cast(unsigned char val) { - return (unsigned int)val; -} -#ifdef __SYCL_DEVICE_ONLY__ -template <> auto cast<_Float16>(_Float16 val) { return (float)val; } -#endif - -// Main test function. -// T1, T2 - operand types, -// VL - vector length, -// OpClass - binary or comparison operations, -// VerifyF and InitF - verification and initialization function types -// (instantiated within the test function), -// Ops - a compile-time sequence of operations to test. 
-// -template class VerifyF, - template class InitF, class Ops> -bool test(Ops ops, queue &q, comp_t epsilon = 0) { - // Log test case info - std::cout << "Testing T1=" << typeid(T1).name() << " T2=" << typeid(T2).name() - << ", VL=" << VL << " ...\n"; - std::cout << "Operations:"; - esimd_test::iterate_ops(ops, [=](OpClass op) { - std::cout << " '" << esimd_test::Op2Str(op) << "'"; - }); - std::cout << "\n"; - - // initialize test data - constexpr int Size = 1024 * 7; - T1 *A = sycl::malloc_shared(Size, q); - T2 *B = sycl::malloc_shared(Size, q); - constexpr int NumOps = (int)Ops::size; - int CSize = NumOps * Size; - using T = comp_t; - // Result array. For each pair of A[i] and B[i] elements it reserves NumOps - // elements to store result of all operations under test applied to the A[i] - // and B[i] - T *C = sycl::malloc_shared(CSize, q); - InitF init; - - for (int i = 0; i < Size; ++i) { - init(A, B, C, i); - } - - // submit the kernel - try { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for>( - Size / VL, [=](id<1> i) SYCL_ESIMD_KERNEL { - unsigned off = i * VL; - simd va(A + off, vector_aligned_tag{}); - simd vb(B + off, vector_aligned_tag{}); - - // applies each of the input operations to the va and vb vectors, - // then invokes the lambda below, passing the result of the - // operation, its ID and sequential number within the input sequence - esimd_test::apply_ops( - ops, va, vb, - [=](comp_t res, OpClass op, - unsigned op_num) { - unsigned res_off = off * NumOps + op_num * VL; - res.copy_to(C + res_off, vector_aligned_tag{}); - }); - }); - }); - e.wait(); - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - sycl::free(A, q); - sycl::free(B, q); - sycl::free(C, q); - return false; - } - - int err_cnt = 0; - - // now verify the results using provided verification function type - for (unsigned i = 0; i < Size / VL; ++i) { - unsigned off = i * VL; - - for (int j = 0; j < VL; ++j) { - T1 a = A[off + 
j]; - T2 b = B[off + j]; - - esimd_test::apply_ops( - ops, a, b, [&](T Gold, OpClass op, unsigned op_num) { - unsigned res_off = off * NumOps + op_num * VL; - T Res = C[res_off + j]; - using Tint = esimd_test::int_type_t; - Tint ResBits = *(Tint *)&Res; - Tint GoldBits = *(Tint *)&Gold; - VerifyF verify_f(epsilon); - - if (!verify_f(Gold, Res, op)) { - if (++err_cnt < 10) { - std::cout << " failed at index " << (res_off + j) << ", op " - << esimd_test::Op2Str(op) << ": " << cast(Res) - << "(0x" << std::hex << ResBits << ")" - << " != " << std::dec << cast(Gold) << "(0x" - << std::hex << GoldBits << ") [" << std::dec - << cast(a) << " " << esimd_test::Op2Str(op) << " " - << cast(b) << "]\n"; - } - } - }); - } - } - if (err_cnt > 0) { - auto Size1 = NumOps * Size; - std::cout << " pass rate: " - << ((float)(Size1 - err_cnt) / (float)Size1) * 100.0f << "% (" - << (Size1 - err_cnt) << "/" << Size1 << ")\n"; - } - - free(A, q); - free(B, q); - free(C, q); - std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n"); - return err_cnt == 0; -} - -// Flavours of verification function types. - -template struct verify_strict { - using T = comp_t; - - verify_strict(T) {} - - bool operator()(T res, T gold, OpClass op) { return res == gold; } -}; - -#define EQ(x, y, epsilon) \ - ((x) > (y) ? (x) - (y) <= epsilon : (y) - (x) <= epsilon) - -template struct verify_epsilon { - using T = comp_t; - T epsilon; - verify_epsilon(T epsilon) : epsilon(epsilon) {} - - bool operator()(T res, T gold, OpClass op) { - if constexpr (std::is_same_v) { - if (op == esimd_test::BinaryOp::div) { - return EQ(res, gold, epsilon); - } - } - return res == gold; - } -}; - -template struct verify_n { - using T = comp_t; - int n; - verify_n(int n) : n(n) {} - - bool operator()(T res, T gold, OpClass op) { - using Tint = esimd_test::int_type_t; - Tint res_bits = *(Tint *)&res; - Tint gold_bits = *(Tint *)&gold; - return (abs(gold_bits - res_bits) > n) ? 
false : true; - } -}; - -// Flavours of initialization function types. - -template struct init_default { - using T = comp_t; - - void operator()(T1 *A, T2 *B, T *C, int i) { - A[i] = (i % 3) * 90 + 10; /*10, 100, 190, 10, ...*/ - if constexpr (std::is_unsigned_v) { - B[i] = (i % 3) * 99 + 1 /*1, 100, 199, 1, ...*/; - } else { - B[i] = (i % 4) * 180 - 170; /*-170, 10, 190, 370, -170,...*/ - } - C[i] = 0; - } -}; - -template struct init_for_shift { - using T = comp_t; - - void operator()(T1 *A, T2 *B, T *C, int i) { - if constexpr (std::is_unsigned_v) { - A[i] = (i % 3) + 100; /*100, 101, 102, 100, ...*/ - } else { - A[i] = (i % 4) * 100 - 150; /*-150, -50, 50, 150, -150, ...*/ - } - B[i] = (i % 3); - C[i] = 0; - } -}; - -// shortcuts for less clutter -template using VSf = verify_strict; -template using VEf = verify_epsilon; -template using VNf = verify_n; -template using IDf = init_default; -template using ISf = init_for_shift; diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp index 3ce6e686d3..4aee26383f 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp @@ -1,5 +1,5 @@ //==--------------- bin_un_cmp_ops_heavy_aspect-fp64.cpp - DPC++ ESIMD -// on-device test -==// +//on-device test -==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,7 +10,7 @@ // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'half' type // XFAIL: esimd_emulator -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -DENABLE_FP64 %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // Tests various binary operations applied to simd objects. @@ -24,32 +24,4 @@ // is available in ESIMD. 
// -#include "bin_and_cmp_ops_heavy.hpp" - -using namespace sycl; -using namespace sycl::ext::intel::esimd; - -int main(void) { - queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - - auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; - bool passed = true; - using BinOp = esimd_test::BinaryOp; - - auto arith_ops = esimd_test::ArithBinaryOps; - passed &= test(arith_ops, q); - passed &= test(arith_ops, q); - - using CmpOp = esimd_test::CmpOp; - auto cmp_ops = esimd_test::CmpOps; - passed &= test(cmp_ops, q); - passed &= test(cmp_ops, q); - - std::cout << (passed ? "Test PASSED\n" : "Test FAILED\n"); - return passed ? 0 : 1; -} +#include "bin_and_cmp_ops_heavy.cpp" diff --git a/SYCL/ESIMD/api/saturation_smoke.cpp b/SYCL/ESIMD/api/saturation_smoke.cpp index caad4c2a09..789224963f 100644 --- a/SYCL/ESIMD/api/saturation_smoke.cpp +++ b/SYCL/ESIMD/api/saturation_smoke.cpp @@ -14,11 +14,169 @@ // // The test checks main functionality of esimd::saturate function. 
-#include "saturation_smoke.hpp" +#include "../esimd_test_utils.hpp" + +#include +#include +#include using namespace sycl; using namespace sycl::ext::intel::esimd; +template struct char_to_int { + using type = typename std::conditional< + sizeof(T) == 1, + typename std::conditional::value, int, unsigned>::type, + T>::type; +}; + +template bool verify(T *data_arr, T *gold_arr, int N) { + int err_cnt = 0; + + for (unsigned i = 0; i < N; ++i) { + T val = data_arr[i]; + T gold = gold_arr[i]; + + if (val != gold) { + if (++err_cnt < 10) { + using T1 = typename char_to_int::type; + std::cout << " failed at index " << i << ": " << (T1)val + << " != " << (T1)gold << " (gold)\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " << ((float)(N - err_cnt) / (float)N) * 100.0f + << "% (" << (N - err_cnt) << "/" << N << ")\n"; + } + return err_cnt == 0; +} + +template struct DataMgr { + From *src; + To *dst; + To *gold; + static inline constexpr int N = Nx; + + DataMgr(From (&&src_data)[N], To (&&gold_data)[N]) { + src = new From[N]; + dst = new To[N]; + gold = new To[N]; + + for (int i = 0; i < N; i++) { + src[i] = src_data[i]; + dst[i] = (To)2; // 0, 1 can be results of saturation, so use 2 + gold[i] = gold_data[i]; + } + } + + ~DataMgr() { + delete[] src; + delete[] dst; + delete[] gold; + } +}; + +template class Mgr> +bool test(queue q) { + std::cout << "Testing " << typeid(From).name() << " -> " << typeid(To).name() + << "\n"; + + Mgr dm; + constexpr int N = Mgr::N; + + try { + sycl::buffer src_buf(dm.src, N); + sycl::buffer dst_buf(dm.dst, N); + + auto e = q.submit([&](handler &cgh) { + auto src_acc = src_buf.template get_access(cgh); + auto dst_acc = dst_buf.template get_access(cgh); + + cgh.single_task([=]() SYCL_ESIMD_KERNEL { + simd x(src_acc, 0); + simd y = saturate(x); + y.copy_to(dst_acc, 0); + }); + }); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return false; // not success + } + return 
verify(dm.dst, dm.gold, N); +} + +// clang-format off +template struct FpToInt : public DataMgr { + static_assert( + (std::is_floating_point_v || std::is_same_v) && + std::is_integral_v); + static inline constexpr int N = 2; + + FpToInt() : DataMgr( + // need this trick with -127 + 130 because INT_MAX is not accurately + // representable with float, and compiler warns: + // implicit conversion from 'int' to 'const float' changes value from + // 2147483647 to 2147483648 + // INT_MAX-127 is accurately representable with float. Use +130 to exceed + // representable range to actually test saturation. + // Test data: + { (From)std::numeric_limits::min() - 10, + (From)(std::numeric_limits::max()-127) + 130 }, + // Gold data (saturated test data): + { std::numeric_limits::min(), + std::numeric_limits::max() }) + {} +}; + +template +struct UIntToSameOrNarrowAnyInt : public DataMgr { + static_assert(std::is_integral_v && std::is_integral_v && + !std::is_signed_v && (sizeof(From) >= sizeof(To))); + static inline constexpr int N = 1; + + UIntToSameOrNarrowAnyInt() : DataMgr( + { (From)((From)std::numeric_limits::max() + (From)10) }, + { (To)std::numeric_limits::max() }) + {} +}; + +template +struct IntToWiderUInt : public DataMgr { + static_assert(std::is_signed_v && !std::is_signed_v && + (sizeof(From) < sizeof(To))); + static inline constexpr int N = 1; + + IntToWiderUInt() : DataMgr( + { (From)-1 }, + { (To)0 }) + {} +}; + +template +struct SIntToNarrowAnyInt : public DataMgr { + static_assert(std::is_integral_v && std::is_signed_v && + std::is_integral_v && (sizeof(From) > sizeof(To))); + static inline constexpr int N = 2; + + SIntToNarrowAnyInt() : DataMgr( + { (From)std::numeric_limits::max() + 10, + (From)std::numeric_limits::min() - 10 }, + { (To)std::numeric_limits::max(), + (To)std::numeric_limits::min() }) + {} +}; + +template struct FpToFp : public DataMgr { + static_assert((std::is_floating_point_v || std::is_same_v)); + static inline constexpr int N = 5; + + 
FpToFp() : DataMgr( + { (From)-10, (From)0, (From)0.5, (From)1, (From)10 }, + { (To)0, (To)0, (To)((From)0.5), (To)1, (To)1 }) + {} +}; + // clang-format on int main(int argc, char **argv) { @@ -30,6 +188,9 @@ int main(int argc, char **argv) { passed &= test(q); passed &= test(q); passed &= test(q); +#ifdef ENABLE_FP64 + passed &= test(q); +#endif passed &= test(q); passed &= test(q); @@ -46,6 +207,9 @@ int main(int argc, char **argv) { passed &= test(q); passed &= test(q); +#ifdef ENABLE_FP64 + passed &= test(q); +#endif std::cout << (passed ? "Test passed\n" : "Test FAILED\n"); return passed ? 0 : 1; diff --git a/SYCL/ESIMD/api/saturation_smoke.hpp b/SYCL/ESIMD/api/saturation_smoke.hpp deleted file mode 100644 index 6ce0b8d81c..0000000000 --- a/SYCL/ESIMD/api/saturation_smoke.hpp +++ /dev/null @@ -1,162 +0,0 @@ -#include "../esimd_test_utils.hpp" - -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::intel::esimd; - -template struct char_to_int { - using type = typename std::conditional< - sizeof(T) == 1, - typename std::conditional::value, int, unsigned>::type, - T>::type; -}; - -template bool verify(T *data_arr, T *gold_arr, int N) { - int err_cnt = 0; - - for (unsigned i = 0; i < N; ++i) { - T val = data_arr[i]; - T gold = gold_arr[i]; - - if (val != gold) { - if (++err_cnt < 10) { - using T1 = typename char_to_int::type; - std::cout << " failed at index " << i << ": " << (T1)val - << " != " << (T1)gold << " (gold)\n"; - } - } - } - if (err_cnt > 0) { - std::cout << " pass rate: " << ((float)(N - err_cnt) / (float)N) * 100.0f - << "% (" << (N - err_cnt) << "/" << N << ")\n"; - } - return err_cnt == 0; -} - -template struct DataMgr { - From *src; - To *dst; - To *gold; - static inline constexpr int N = Nx; - - DataMgr(From (&&src_data)[N], To (&&gold_data)[N]) { - src = new From[N]; - dst = new To[N]; - gold = new To[N]; - - for (int i = 0; i < N; i++) { - src[i] = src_data[i]; - dst[i] = (To)2; // 0, 1 can be results of 
saturation, so use 2 - gold[i] = gold_data[i]; - } - } - - ~DataMgr() { - delete[] src; - delete[] dst; - delete[] gold; - } -}; - -template class Mgr> -bool test(queue q) { - std::cout << "Testing " << typeid(From).name() << " -> " << typeid(To).name() - << "\n"; - - Mgr dm; - constexpr int N = Mgr::N; - - try { - sycl::buffer src_buf(dm.src, N); - sycl::buffer dst_buf(dm.dst, N); - - auto e = q.submit([&](handler &cgh) { - auto src_acc = src_buf.template get_access(cgh); - auto dst_acc = dst_buf.template get_access(cgh); - - cgh.single_task([=]() SYCL_ESIMD_KERNEL { - simd x(src_acc, 0); - simd y = saturate(x); - y.copy_to(dst_acc, 0); - }); - }); - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - return false; // not success - } - return verify(dm.dst, dm.gold, N); -} - -// clang-format off -template struct FpToInt : public DataMgr { - static_assert( - (std::is_floating_point_v || std::is_same_v) && - std::is_integral_v); - static inline constexpr int N = 2; - - FpToInt() : DataMgr( - // need this trick with -127 + 130 because INT_MAX is not accurately - // representable with float, and compiler warns: - // implicit conversion from 'int' to 'const float' changes value from - // 2147483647 to 2147483648 - // INT_MAX-127 is accurately representable with float. Use +130 to exceed - // representable range to actually test saturation. 
- // Test data: - { (From)std::numeric_limits::min() - 10, - (From)(std::numeric_limits::max()-127) + 130 }, - // Gold data (saturated test data): - { std::numeric_limits::min(), - std::numeric_limits::max() }) - {} -}; - -template -struct UIntToSameOrNarrowAnyInt : public DataMgr { - static_assert(std::is_integral_v && std::is_integral_v && - !std::is_signed_v && (sizeof(From) >= sizeof(To))); - static inline constexpr int N = 1; - - UIntToSameOrNarrowAnyInt() : DataMgr( - { (From)((From)std::numeric_limits::max() + (From)10) }, - { (To)std::numeric_limits::max() }) - {} -}; - -template -struct IntToWiderUInt : public DataMgr { - static_assert(std::is_signed_v && !std::is_signed_v && - (sizeof(From) < sizeof(To))); - static inline constexpr int N = 1; - - IntToWiderUInt() : DataMgr( - { (From)-1 }, - { (To)0 }) - {} -}; - -template -struct SIntToNarrowAnyInt : public DataMgr { - static_assert(std::is_integral_v && std::is_signed_v && - std::is_integral_v && (sizeof(From) > sizeof(To))); - static inline constexpr int N = 2; - - SIntToNarrowAnyInt() : DataMgr( - { (From)std::numeric_limits::max() + 10, - (From)std::numeric_limits::min() - 10 }, - { (To)std::numeric_limits::max(), - (To)std::numeric_limits::min() }) - {} -}; - -template struct FpToFp : public DataMgr { - static_assert((std::is_floating_point_v || std::is_same_v)); - static inline constexpr int N = 5; - - FpToFp() : DataMgr( - { (From)-10, (From)0, (From)0.5, (From)1, (From)10 }, - { (To)0, (To)0, (To)((From)0.5), (To)1, (To)1 }) - {} -}; diff --git a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp index 7d1941983f..6bf6fb4734 100644 --- a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp @@ -10,34 +10,9 @@ // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'half' type // XFAIL: esimd_emulator -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -DENABLE_FP64 %s 
-o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // // The test checks main functionality of esimd::saturate function. -#include "saturation_smoke.hpp" - -using namespace sycl; -using namespace sycl::ext::intel::esimd; - -// clang-format on - -int main(int argc, char **argv) { - queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); - if (!q.get_device().has(sycl::aspect::fp64) { - std::cout << "Skipping test\n"; - return 0; - } - - auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; - - bool passed = true; - - passed &= test(q); - - passed &= test(q); - - std::cout << (passed ? "Test passed\n" : "Test FAILED\n"); - return passed ? 0 : 1; -} +#include "saturation_smoke.cpp" diff --git a/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp b/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp index db122c5db4..75c92e4863 100644 --- a/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp +++ b/SYCL/ESIMD/api/simd_view_select_2d_fp.cpp @@ -25,6 +25,9 @@ int main(int argc, char **argv) { bool passed = true; passed &= test(q); passed &= test(q); +#ifdef ENABLE_FP64 + passed &= test(q); +#endif std::cout << (passed ? "=== Test passed\n" : "=== Test FAILED\n"); return passed ? 0 : 1; diff --git a/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp b/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp index bc3ece4dbe..7730fa6787 100644 --- a/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp @@ -9,28 +9,10 @@ // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'single_task()' method // XFAIL: esimd_emulator -// RUN: %clangxx -fsycl %s -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %clangxx -fsycl -DENABLE_FP64 %s -fsycl-device-code-split=per_kernel -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // // Smoke test for 2D region select API which can be used to represent 2D tiles. // Tests FP types. 
-#include "simd_view_select_2d.hpp" - -int main(int argc, char **argv) { - queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); - if (!q.get_device().has(sycl::aspect::fp64) { - std::cout << "Skipping test\n"; - return 0; - } - - auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; - - bool passed = true; - - passed &= test(q); - - std::cout << (passed ? "=== Test passed\n" : "=== Test FAILED\n"); - return passed ? 0 : 1; -} +#include "simd_view_select_2d.cpp" diff --git a/SYCL/ESIMD/api/unary_ops_heavy.cpp b/SYCL/ESIMD/api/unary_ops_heavy.cpp index a64f61f3e3..2b734d5372 100644 --- a/SYCL/ESIMD/api/unary_ops_heavy.cpp +++ b/SYCL/ESIMD/api/unary_ops_heavy.cpp @@ -23,11 +23,133 @@ // is available in ESIMD. // -#include "unary_ops_heavy.hpp" +#include "../esimd_test_utils.hpp" + +#include +#include +#include using namespace sycl; using namespace sycl::ext::intel::esimd; +template class TestID; + +// Helpers for printing +template auto cast(T val) { return val; } +template <> auto cast(char val) { return (int)val; } +template <> auto cast(unsigned char val) { + return (unsigned int)val; +} +#ifdef __SYCL_DEVICE_ONLY__ +template <> auto cast<_Float16>(_Float16 val) { return (float)val; } +#endif + +// Main test function. +// T - operand type, +// VL - vector length, +// Ops - a compile-time sequence of operations to test. 
+// +template class SimdT = simd> +bool test(Ops ops, queue &q) { + using OpClass = esimd_test::UnaryOp; + // Log test case info + std::cout << "Testing T=" << typeid(T).name() << ", VL=" << VL << " ...\n"; + std::cout << "Operations:"; + esimd_test::iterate_ops(ops, [=](OpClass op) { + std::cout << " '" << esimd_test::Op2Str(op) << "'"; + }); + std::cout << "\n"; + + // initialize test data + constexpr int Size = 1024 * 7; + T *A = sycl::malloc_shared(Size, q); + constexpr int NumOps = (int)Ops::size; + int CSize = NumOps * Size; + T *C = sycl::malloc_shared(CSize, q); + + for (int i = 0; i < Size; ++i) { + if constexpr (std::is_unsigned_v) { + A[i] = i; + } else { + A[i] = i - Size / 2; + } + C[i] = 0; + } + + // submit the kernel + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for>( + Size / VL, [=](id<1> i) SYCL_ESIMD_KERNEL { + unsigned off = i * VL; + SimdT va(A + off); + // applies each of the input operations to the va, + // then invokes the lambda below, passing the result of the + // operation, its ID and sequential number within the input sequence + esimd_test::apply_unary_ops( + ops, va, [=](SimdT res, OpClass op, unsigned op_num) { + unsigned res_off = off * NumOps + op_num * VL; + res.copy_to(C + res_off); + }); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(A, q); + sycl::free(C, q); + return false; + } + + int err_cnt = 0; + + // now verify the results using provided verification function type + for (unsigned i = 0; i < Size / VL; ++i) { + unsigned off = i * VL; + + for (int j = 0; j < VL; ++j) { + T a = A[off + j]; + + esimd_test::apply_unary_ops( + ops, a, [&](T Gold, OpClass op, unsigned op_num) { + unsigned res_off = off * NumOps + op_num * VL; + T Res = C[res_off + j]; + using Tint = esimd_test::int_type_t; + Tint ResBits = *(Tint *)&Res; + Tint GoldBits = *(Tint *)&Gold; + // allow 1 bit discrepancy for half on modifying op + int delta = 
((int)op >= (int)OpClass::minus_minus_pref) && + ((int)op <= (int)OpClass::plus_plus_inf) && + std::is_same_v + ? 1 + : 0; + + if ((Gold != Res) && (abs(ResBits - GoldBits) > delta)) { + if (++err_cnt < 10) { + std::cout << " failed at index " << (res_off + j) << ", op " + << esimd_test::Op2Str(op) << ": " << cast(Res) + << "(0x" << std::hex << ResBits << ")" + << " != " << cast(Gold) << "(0x" << std::hex + << GoldBits << ") [" << esimd_test::Op2Str(op) << " " + << std::dec << cast(a) << "]\n"; + } + } + }); + } + } + if (err_cnt > 0) { + auto Size1 = NumOps * Size; + std::cout << " pass rate: " + << ((float)(Size1 - err_cnt) / (float)Size1) * 100.0f << "% (" + << (Size1 - err_cnt) << "/" << Size1 << ")\n"; + } + + free(A, q); + free(C, q); + std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n"); + return err_cnt == 0; +} + int main(void) { queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); @@ -50,6 +172,9 @@ int main(void) { passed &= test(mod_ops, q); passed &= test(mod_ops, q); passed &= test(mod_ops, q); +#ifdef ENABLE_FP64 + passed &= test(mod_ops, q); +#endif auto singed_ops = esimd_test::OpSeq{}; passed &= test(singed_ops, q); @@ -58,6 +183,9 @@ int main(void) { passed &= test(singed_ops, q); passed &= test(singed_ops, q); passed &= test(singed_ops, q); +#ifdef ENABLE_FP64 + passed &= test(singed_ops, q); +#endif auto bit_ops = esimd_test::OpSeq{}; passed &= test(bit_ops, q); diff --git a/SYCL/ESIMD/api/unary_ops_heavy.hpp b/SYCL/ESIMD/api/unary_ops_heavy.hpp deleted file mode 100644 index 49bc7e3273..0000000000 --- a/SYCL/ESIMD/api/unary_ops_heavy.hpp +++ /dev/null @@ -1,126 +0,0 @@ -#include "../esimd_test_utils.hpp" - -#include -#include -#include - -using namespace sycl; -using namespace sycl::ext::intel::esimd; - -template class TestID; - -// Helpers for printing -template auto cast(T val) { return val; } -template <> auto cast(char val) { return (int)val; } -template <> auto cast(unsigned char val) { - return (unsigned 
int)val; -} -#ifdef __SYCL_DEVICE_ONLY__ -template <> auto cast<_Float16>(_Float16 val) { return (float)val; } -#endif - -// Main test function. -// T - operand type, -// VL - vector length, -// Ops - a compile-time sequence of operations to test. -// -template class SimdT = simd> -bool test(Ops ops, queue &q) { - using OpClass = esimd_test::UnaryOp; - // Log test case info - std::cout << "Testing T=" << typeid(T).name() << ", VL=" << VL << " ...\n"; - std::cout << "Operations:"; - esimd_test::iterate_ops(ops, [=](OpClass op) { - std::cout << " '" << esimd_test::Op2Str(op) << "'"; - }); - std::cout << "\n"; - - // initialize test data - constexpr int Size = 1024 * 7; - T *A = sycl::malloc_shared(Size, q); - constexpr int NumOps = (int)Ops::size; - int CSize = NumOps * Size; - T *C = sycl::malloc_shared(CSize, q); - - for (int i = 0; i < Size; ++i) { - if constexpr (std::is_unsigned_v) { - A[i] = i; - } else { - A[i] = i - Size / 2; - } - C[i] = 0; - } - - // submit the kernel - try { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for>( - Size / VL, [=](id<1> i) SYCL_ESIMD_KERNEL { - unsigned off = i * VL; - SimdT va(A + off); - // applies each of the input operations to the va, - // then invokes the lambda below, passing the result of the - // operation, its ID and sequential number within the input sequence - esimd_test::apply_unary_ops( - ops, va, [=](SimdT res, OpClass op, unsigned op_num) { - unsigned res_off = off * NumOps + op_num * VL; - res.copy_to(C + res_off); - }); - }); - }); - e.wait(); - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - sycl::free(A, q); - sycl::free(C, q); - return false; - } - - int err_cnt = 0; - - // now verify the results using provided verification function type - for (unsigned i = 0; i < Size / VL; ++i) { - unsigned off = i * VL; - - for (int j = 0; j < VL; ++j) { - T a = A[off + j]; - - esimd_test::apply_unary_ops( - ops, a, [&](T Gold, OpClass op, unsigned op_num) { - 
unsigned res_off = off * NumOps + op_num * VL; - T Res = C[res_off + j]; - using Tint = esimd_test::int_type_t; - Tint ResBits = *(Tint *)&Res; - Tint GoldBits = *(Tint *)&Gold; - // allow 1 bit discrepancy for half on modifying op - int delta = ((int)op >= (int)OpClass::minus_minus_pref) && - ((int)op <= (int)OpClass::plus_plus_inf) && - std::is_same_v - ? 1 - : 0; - - if ((Gold != Res) && (abs(ResBits - GoldBits) > delta)) { - if (++err_cnt < 10) { - std::cout << " failed at index " << (res_off + j) << ", op " - << esimd_test::Op2Str(op) << ": " << cast(Res) - << "(0x" << std::hex << ResBits << ")" - << " != " << cast(Gold) << "(0x" << std::hex - << GoldBits << ") [" << esimd_test::Op2Str(op) << " " - << std::dec << cast(a) << "]\n"; - } - } - }); - } - } - if (err_cnt > 0) { - auto Size1 = NumOps * Size; - std::cout << " pass rate: " - << ((float)(Size1 - err_cnt) / (float)Size1) * 100.0f << "% (" - << (Size1 - err_cnt) << "/" << Size1 << ")\n"; - } - - free(A, q); - free(C, q); - std::cout << (err_cnt > 0 ? " FAILED\n" : " Passed\n"); - return err_cnt == 0; -} diff --git a/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp index b5281519aa..e277201fc1 100644 --- a/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'half' type // XFAIL: esimd_emulator -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -DENABLE_FP64 %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // Tests various unary operations applied to simd objects. @@ -23,31 +23,4 @@ // is available in ESIMD. 
// -#include "unary_ops_heavy.hpp" - -using namespace sycl; -using namespace sycl::ext::intel::esimd; - -int main(void) { - queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); - if (!q.get_device().has(sycl::aspect::fp64) { - std::cout << "Skipping test\n"; - return 0; - } - - auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() << "\n"; - bool passed = true; - using UnOp = esimd_test::UnaryOp; - - auto mod_ops = - esimd_test::OpSeq{}; - passed &= test(mod_ops, q); - - auto singed_ops = esimd_test::OpSeq{}; - passed &= test(singed_ops, q); - - std::cout << (passed ? "Test passed\n" : "Test FAILED\n"); - return passed ? 0 : 1; -} +#include "unary_ops_heavy.cpp" diff --git a/SYCL/ESIMD/ext_math.cpp b/SYCL/ESIMD/ext_math.cpp index 4e6671f83d..c8b057857f 100644 --- a/SYCL/ESIMD/ext_math.cpp +++ b/SYCL/ESIMD/ext_math.cpp @@ -17,11 +17,448 @@ // - math function - sin, cos, ..., div_ieee, pow // - SYCL vs ESIMD APIs -#include "ext_math.hpp" +#include "esimd_test_utils.hpp" + +#include +#include +#include + +#include +#include using namespace sycl; using namespace sycl::ext::intel; +// --- Data initialization functions + +// Initialization data for trigonometric functions' input. +// H/w supports only limited range of sin/cos arguments with decent accuracy: +// absolute error <= 0.0008 for the range of +/- 32767*pi (+/- 102941). 
+ +constexpr int accuracy_limit = 32767 * 3.14 - 1; + +template struct InitTrig { + void operator()(T *In, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In[I] = (I + 1) % accuracy_limit; + Out[I] = (T)0; + } + } +}; + +template struct InitWide { + void operator()(T *In, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In[I] = I + 1.0; + Out[I] = (T)0; + } + } +}; + +template struct InitNarrow { + void operator()(T *In, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In[I] = 2.0f + 16.0f * ((T)I / (T)(Size - 1)); // in [2..18] range + Out[I] = (T)0; + } + } +}; + +template struct InitInRange0_5 { + void operator()(T *In, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In[I] = 5.0f * ((T)I / (T)(Size - 1)); // in [0..5] range + Out[I] = (T)0; + } + } +}; + +template struct InitBin { + void operator()(T *In1, T *In2, T *Out, size_t Size) const { + for (auto I = 0; I < Size; ++I) { + In1[I] = I % 17 + 1; + In2[I] = 4.0f * ((T)I / (T)(Size - 1)); // in [0..4] range + Out[I] = (T)0; + } + } +}; + +// --- Math operation identification + +enum class MathOp { + sin, + cos, + exp, + sqrt, + sqrt_ieee, + inv, + log, + rsqrt, + floor, + ceil, + trunc, + exp2, + log2, + div_ieee, + pow +}; + +// --- Template functions calculating given math operation on host and device + +enum ArgKind { + AllVec, + AllSca, + Sca1Vec2, + Sca2Vec1 +}; + +template struct ESIMDf; +template struct BinESIMDf; +template struct SYCLf; + +template struct HostFunc; + +#define DEFINE_HOST_OP(Op, HostOp) \ + template struct HostFunc { \ + T operator()(T X) { return HostOp; } \ + }; + +DEFINE_HOST_OP(sin, std::sin(X)); +DEFINE_HOST_OP(cos, std::cos(X)); +DEFINE_HOST_OP(exp, std::exp(X)); +DEFINE_HOST_OP(log, std::log(X)); +DEFINE_HOST_OP(inv, 1.0f / X); +DEFINE_HOST_OP(sqrt, std::sqrt(X)); +DEFINE_HOST_OP(sqrt_ieee, std::sqrt(X)); +DEFINE_HOST_OP(rsqrt, 1.0f / std::sqrt(X)); +DEFINE_HOST_OP(floor, std::floor(X)); 
+DEFINE_HOST_OP(ceil, std::ceil(X)); +DEFINE_HOST_OP(trunc, std::trunc(X)); +DEFINE_HOST_OP(exp2, std::exp2(X)); +DEFINE_HOST_OP(log2, std::log2(X)); + +#define DEFINE_HOST_BIN_OP(Op, HostOp) \ + template struct HostFunc { \ + T operator()(T X, T Y) { return HostOp; } \ + }; + +DEFINE_HOST_BIN_OP(div_ieee, X / Y); +DEFINE_HOST_BIN_OP(pow, std::pow(X, Y)); + +// --- Specializations per each extended math operation + +#define DEFINE_ESIMD_DEVICE_OP(Op) \ + template struct ESIMDf { \ + esimd::simd \ + operator()(esimd::simd X) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X); \ + } \ + }; \ + template struct ESIMDf { \ + esimd::simd operator()(T X) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X); \ + } \ + }; + +DEFINE_ESIMD_DEVICE_OP(sin); +DEFINE_ESIMD_DEVICE_OP(cos); +DEFINE_ESIMD_DEVICE_OP(exp); +DEFINE_ESIMD_DEVICE_OP(log); +DEFINE_ESIMD_DEVICE_OP(inv); +DEFINE_ESIMD_DEVICE_OP(sqrt); +DEFINE_ESIMD_DEVICE_OP(sqrt_ieee); +DEFINE_ESIMD_DEVICE_OP(rsqrt); +DEFINE_ESIMD_DEVICE_OP(floor); +DEFINE_ESIMD_DEVICE_OP(ceil); +DEFINE_ESIMD_DEVICE_OP(trunc); +DEFINE_ESIMD_DEVICE_OP(exp2); +DEFINE_ESIMD_DEVICE_OP(log2); + +#define DEFINE_ESIMD_DEVICE_BIN_OP(Op) \ + template struct BinESIMDf { \ + esimd::simd operator()(T X, T Y) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X, Y); \ + } \ + }; \ + template struct BinESIMDf { \ + esimd::simd \ + operator()(esimd::simd X, \ + esimd::simd Y) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X, Y); \ + } \ + }; \ + template struct BinESIMDf { \ + esimd::simd \ + operator()(T X, esimd::simd Y) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X, Y); \ + } \ + }; \ + template struct BinESIMDf { \ + esimd::simd operator()(esimd::simd X, \ + T Y) const SYCL_ESIMD_FUNCTION { \ + return esimd::Op(X, Y); \ + } \ + }; + +DEFINE_ESIMD_DEVICE_BIN_OP(div_ieee); +DEFINE_ESIMD_DEVICE_BIN_OP(pow); + +#define DEFINE_SYCL_DEVICE_OP(Op) \ + template struct SYCLf { \ + esimd::simd \ + operator()(esimd::simd X) const SYCL_ESIMD_FUNCTION { \ + 
/* T must be float for SYCL, so not a template parameter for sycl::Op*/ \ + return sycl::Op(X); \ + } \ + }; \ + template struct SYCLf { \ + esimd::simd operator()(T X) const SYCL_ESIMD_FUNCTION { \ + return sycl::Op(X); \ + } \ + }; + +DEFINE_SYCL_DEVICE_OP(sin); +DEFINE_SYCL_DEVICE_OP(cos); +DEFINE_SYCL_DEVICE_OP(exp); +DEFINE_SYCL_DEVICE_OP(log); + +// --- Generic kernel calculating an extended math operation on array elements + +template class Kernel, typename AccIn, + typename AccOut> +struct UnaryDeviceFunc { + AccIn In; + AccOut Out; + + UnaryDeviceFunc(AccIn &In, AccOut &Out) : In(In), Out(Out) {} + + void operator()(id<1> I) const SYCL_ESIMD_KERNEL { + unsigned int Offset = I * N * sizeof(T); + esimd::simd Vx; + Vx.copy_from(In, Offset); + + if (I.get(0) % 2 == 0) { + for (int J = 0; J < N; J++) { + Kernel DevF{}; + T Val = Vx[J]; + esimd::simd V = DevF(Val); // scalar arg + Vx[J] = V[J]; + } + } else { + Kernel DevF{}; + Vx = DevF(Vx); // vector arg + } + Vx.copy_to(Out, Offset); + }; +}; + +template class Kernel, typename AccIn, + typename AccOut> +struct BinaryDeviceFunc { + AccIn In1; + AccIn In2; + AccOut Out; + + BinaryDeviceFunc(AccIn &In1, AccIn &In2, AccOut &Out) + : In1(In1), In2(In2), Out(Out) {} + + void operator()(id<1> I) const SYCL_ESIMD_KERNEL { + unsigned int Offset = I * N * sizeof(T); + esimd::simd V1(In1, Offset); + esimd::simd V2(In2, Offset); + esimd::simd V; + + if (I.get(0) % 2 == 0) { + int Ind = 0; + { + Kernel DevF{}; + T Val2 = V2[Ind]; + esimd::simd Vv = DevF(V1[Ind], Val2); // both arguments are scalar + V[Ind] = Vv[Ind]; + } + Ind++; + { + Kernel DevF{}; + T Val1 = V1[Ind]; + esimd::simd Vv = DevF(Val1, V2); // scalar, vector + V[Ind] = Vv[Ind]; + } + Ind++; + { + for (int J = Ind; J < N; ++J) { + Kernel DevF{}; + T Val2 = V2[J]; + esimd::simd Vv = DevF(V1, Val2); // scalar 2nd arg + V[J] = Vv[J]; + } + } + } else { + Kernel DevF{}; + V = DevF(V1, V2); // vec 2nd arg + } + V.copy_to(Out, Offset); + }; +}; + +// --- Generic 
test function for an extended math operation + +template class Kernel, + typename InitF = InitNarrow> +bool test(queue &Q, const std::string &Name, + InitF Init = InitNarrow{}, float delta = 0.0f) { + + constexpr size_t Size = 1024 * 128; + constexpr bool IsBinOp = (Op == MathOp::div_ieee) || (Op == MathOp::pow); + + T *A = new T[Size]; + T *B = new T[Size]; + T *C = new T[Size]; + if constexpr (IsBinOp) { + Init(A, B, C, Size); + } else { + Init(A, B, Size); + } + const char *kind = + std::is_same_v, ESIMDf> + ? "ESIMD" + : "SYCL"; + std::cout << " " << Name << " test, kind=" << kind << "...\n"; + + try { + buffer BufA(A, range<1>(Size)); + buffer BufB(B, range<1>(Size)); + buffer BufC(C, range<1>(Size)); + + // number of workgroups + sycl::range<1> GlobalRange{Size / N}; + + // threads (workitems) in each workgroup + sycl::range<1> LocalRange{1}; + + auto E = Q.submit([&](handler &CGH) { + auto PA = BufA.template get_access(CGH); + auto PC = BufC.template get_access(CGH); + if constexpr (IsBinOp) { + auto PB = BufB.template get_access(CGH); + BinaryDeviceFunc F( + PA, PB, PC); + CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); + } else { + UnaryDeviceFunc F(PA, + PC); + CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); + } + }); + E.wait(); + } catch (sycl::exception &Exc) { + std::cout << " *** ERROR. SYCL exception caught: << " << Exc.what() + << "\n"; + return false; + } + + int ErrCnt = 0; + + for (unsigned I = 0; I < Size; ++I) { + T Gold; + + if constexpr (IsBinOp) { + Gold = HostFunc{}((T)A[I], (T)B[I]); + } else { + Gold = HostFunc{}((T)A[I]); + } + T Test = C[I]; + + if (delta == 0.0f) { + delta = sizeof(T) > 2 ? 
0.0001 : 0.01; + } + + if (abs(Test - Gold) > delta) { + if (++ErrCnt < 10) { + std::cout << " failed at index " << I << ", " << Test + << " != " << Gold << " (gold)\n"; + } + } + } + delete[] A; + delete[] B; + delete[] C; + + if (ErrCnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - ErrCnt) / (float)Size) * 100.0f << "% (" + << (Size - ErrCnt) << "/" << Size << ")\n"; + } + + std::cout << (ErrCnt > 0 ? " FAILED\n" : " Passed\n"); + return ErrCnt == 0; +} + +// --- Tests all extended math operations with given vector length + +template bool testESIMD(queue &Q) { + bool Pass = true; + + std::cout << "--- TESTING ESIMD functions, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + + Pass &= test(Q, "sqrt", InitWide{}); + Pass &= test(Q, "inv"); + Pass &= test(Q, "rsqrt"); + Pass &= test(Q, "sin", InitTrig{}); + Pass &= test(Q, "cos", InitTrig{}); + Pass &= test(Q, "exp", InitInRange0_5{}); + Pass &= test(Q, "log", InitWide{}); + Pass &= test(Q, "exp2", InitInRange0_5{}); + Pass &= test(Q, "log2", InitWide{}); + Pass &= test(Q, "floor", InitWide{}); + Pass &= test(Q, "ceil", InitWide{}); + Pass &= test(Q, "trunc", InitWide{}); + return Pass; +} + +template bool testESIMDSqrtIEEE(queue &Q) { + bool Pass = true; + std::cout << "--- TESTING ESIMD sqrt_ieee, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + Pass &= test(Q, "sqrt_ieee", InitWide{}); + return Pass; +} + +template bool testESIMDDivIEEE(queue &Q) { + bool Pass = true; + std::cout << "--- TESTING ESIMD div_ieee, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + Pass &= test(Q, "div_ieee", InitBin{}); + return Pass; +} + +template bool testESIMDPow(queue &Q) { + bool Pass = true; + std::cout << "--- TESTING ESIMD pow, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + Pass &= test( + Q, "pow", InitBin{}, 0.1); + return Pass; +} + +template bool testSYCL(queue &Q) { + bool Pass = true; + // TODO SYCL currently supports only these 4 functions, extend the test when + // more 
are available. + std::cout << "--- TESTING SYCL functions, T=" << typeid(T).name() + << ", N = " << N << "...\n"; + // SYCL functions will have good accuracy for any argument, unlike bare h/w + // ESIMD versions, so init with "wide" data set. + Pass &= test(Q, "sin", InitWide{}); + Pass &= test(Q, "cos", InitWide{}); + Pass &= test(Q, "exp", InitInRange0_5{}); + Pass &= test(Q, "log", InitWide{}); + return Pass; +} + // --- The entry point int main(void) { @@ -36,6 +473,12 @@ int main(void) { Pass &= testSYCL(Q); Pass &= testESIMDPow(Q); Pass &= testESIMDPow(Q); +#ifdef ENABLE_FP64 + Pass &= testESIMDSqrtIEEE(Q); + Pass &= testESIMDSqrtIEEE(Q); + Pass &= testESIMDDivIEEE(Q); + Pass &= testESIMDDivIEEE(Q); +#endif std::cout << (Pass ? "Test Passed\n" : "Test FAILED\n"); return Pass ? 0 : 1; } diff --git a/SYCL/ESIMD/ext_math.hpp b/SYCL/ESIMD/ext_math.hpp deleted file mode 100644 index 3d1278a60c..0000000000 --- a/SYCL/ESIMD/ext_math.hpp +++ /dev/null @@ -1,434 +0,0 @@ -#include "esimd_test_utils.hpp" - -#include -#include -#include - -#include -#include - -using namespace sycl; -using namespace sycl::ext::intel; - -// --- Data initialization functions - -// Initialization data for trigonometric functions' input. -// H/w supports only limited range of sin/cos arguments with decent accuracy: -// absolute error <= 0.0008 for the range of +/- 32767*pi (+/- 102941). 
- -constexpr int accuracy_limit = 32767 * 3.14 - 1; - -template struct InitTrig { - void operator()(T *In, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In[I] = (I + 1) % accuracy_limit; - Out[I] = (T)0; - } - } -}; - -template struct InitWide { - void operator()(T *In, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In[I] = I + 1.0; - Out[I] = (T)0; - } - } -}; - -template struct InitNarrow { - void operator()(T *In, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In[I] = 2.0f + 16.0f * ((T)I / (T)(Size - 1)); // in [2..18] range - Out[I] = (T)0; - } - } -}; - -template struct InitInRange0_5 { - void operator()(T *In, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In[I] = 5.0f * ((T)I / (T)(Size - 1)); // in [0..5] range - Out[I] = (T)0; - } - } -}; - -template struct InitBin { - void operator()(T *In1, T *In2, T *Out, size_t Size) const { - for (auto I = 0; I < Size; ++I) { - In1[I] = I % 17 + 1; - In2[I] = 4.0f * ((T)I / (T)(Size - 1)); // in [0..4] range - Out[I] = (T)0; - } - } -}; - -// --- Math operation identification - -enum class MathOp { - sin, - cos, - exp, - sqrt, - sqrt_ieee, - inv, - log, - rsqrt, - floor, - ceil, - trunc, - exp2, - log2, - div_ieee, - pow -}; - -// --- Template functions calculating given math operation on host and device - -enum ArgKind { AllVec, AllSca, Sca1Vec2, Sca2Vec1 }; - -template struct ESIMDf; -template struct BinESIMDf; -template struct SYCLf; - -template struct HostFunc; - -#define DEFINE_HOST_OP(Op, HostOp) \ - template struct HostFunc { \ - T operator()(T X) { return HostOp; } \ - }; - -DEFINE_HOST_OP(sin, std::sin(X)); -DEFINE_HOST_OP(cos, std::cos(X)); -DEFINE_HOST_OP(exp, std::exp(X)); -DEFINE_HOST_OP(log, std::log(X)); -DEFINE_HOST_OP(inv, 1.0f / X); -DEFINE_HOST_OP(sqrt, std::sqrt(X)); -DEFINE_HOST_OP(sqrt_ieee, std::sqrt(X)); -DEFINE_HOST_OP(rsqrt, 1.0f / std::sqrt(X)); -DEFINE_HOST_OP(floor, std::floor(X)); -DEFINE_HOST_OP(ceil, 
std::ceil(X)); -DEFINE_HOST_OP(trunc, std::trunc(X)); -DEFINE_HOST_OP(exp2, std::exp2(X)); -DEFINE_HOST_OP(log2, std::log2(X)); - -#define DEFINE_HOST_BIN_OP(Op, HostOp) \ - template struct HostFunc { \ - T operator()(T X, T Y) { return HostOp; } \ - }; - -DEFINE_HOST_BIN_OP(div_ieee, X / Y); -DEFINE_HOST_BIN_OP(pow, std::pow(X, Y)); - -// --- Specializations per each extended math operation - -#define DEFINE_ESIMD_DEVICE_OP(Op) \ - template struct ESIMDf { \ - esimd::simd \ - operator()(esimd::simd X) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X); \ - } \ - }; \ - template struct ESIMDf { \ - esimd::simd operator()(T X) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X); \ - } \ - }; - -DEFINE_ESIMD_DEVICE_OP(sin); -DEFINE_ESIMD_DEVICE_OP(cos); -DEFINE_ESIMD_DEVICE_OP(exp); -DEFINE_ESIMD_DEVICE_OP(log); -DEFINE_ESIMD_DEVICE_OP(inv); -DEFINE_ESIMD_DEVICE_OP(sqrt); -DEFINE_ESIMD_DEVICE_OP(sqrt_ieee); -DEFINE_ESIMD_DEVICE_OP(rsqrt); -DEFINE_ESIMD_DEVICE_OP(floor); -DEFINE_ESIMD_DEVICE_OP(ceil); -DEFINE_ESIMD_DEVICE_OP(trunc); -DEFINE_ESIMD_DEVICE_OP(exp2); -DEFINE_ESIMD_DEVICE_OP(log2); - -#define DEFINE_ESIMD_DEVICE_BIN_OP(Op) \ - template struct BinESIMDf { \ - esimd::simd operator()(T X, T Y) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X, Y); \ - } \ - }; \ - template struct BinESIMDf { \ - esimd::simd \ - operator()(esimd::simd X, \ - esimd::simd Y) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X, Y); \ - } \ - }; \ - template struct BinESIMDf { \ - esimd::simd \ - operator()(T X, esimd::simd Y) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X, Y); \ - } \ - }; \ - template struct BinESIMDf { \ - esimd::simd operator()(esimd::simd X, \ - T Y) const SYCL_ESIMD_FUNCTION { \ - return esimd::Op(X, Y); \ - } \ - }; - -DEFINE_ESIMD_DEVICE_BIN_OP(div_ieee); -DEFINE_ESIMD_DEVICE_BIN_OP(pow); - -#define DEFINE_SYCL_DEVICE_OP(Op) \ - template struct SYCLf { \ - esimd::simd \ - operator()(esimd::simd X) const SYCL_ESIMD_FUNCTION { \ - /* T must be float for 
SYCL, so not a template parameter for sycl::Op*/ \ - return sycl::Op(X); \ - } \ - }; \ - template struct SYCLf { \ - esimd::simd operator()(T X) const SYCL_ESIMD_FUNCTION { \ - return sycl::Op(X); \ - } \ - }; - -DEFINE_SYCL_DEVICE_OP(sin); -DEFINE_SYCL_DEVICE_OP(cos); -DEFINE_SYCL_DEVICE_OP(exp); -DEFINE_SYCL_DEVICE_OP(log); - -// --- Generic kernel calculating an extended math operation on array elements - -template class Kernel, typename AccIn, - typename AccOut> -struct UnaryDeviceFunc { - AccIn In; - AccOut Out; - - UnaryDeviceFunc(AccIn &In, AccOut &Out) : In(In), Out(Out) {} - - void operator()(id<1> I) const SYCL_ESIMD_KERNEL { - unsigned int Offset = I * N * sizeof(T); - esimd::simd Vx; - Vx.copy_from(In, Offset); - - if (I.get(0) % 2 == 0) { - for (int J = 0; J < N; J++) { - Kernel DevF{}; - T Val = Vx[J]; - esimd::simd V = DevF(Val); // scalar arg - Vx[J] = V[J]; - } - } else { - Kernel DevF{}; - Vx = DevF(Vx); // vector arg - } - Vx.copy_to(Out, Offset); - }; -}; - -template class Kernel, typename AccIn, - typename AccOut> -struct BinaryDeviceFunc { - AccIn In1; - AccIn In2; - AccOut Out; - - BinaryDeviceFunc(AccIn &In1, AccIn &In2, AccOut &Out) - : In1(In1), In2(In2), Out(Out) {} - - void operator()(id<1> I) const SYCL_ESIMD_KERNEL { - unsigned int Offset = I * N * sizeof(T); - esimd::simd V1(In1, Offset); - esimd::simd V2(In2, Offset); - esimd::simd V; - - if (I.get(0) % 2 == 0) { - int Ind = 0; - { - Kernel DevF{}; - T Val2 = V2[Ind]; - esimd::simd Vv = DevF(V1[Ind], Val2); // both arguments are scalar - V[Ind] = Vv[Ind]; - } - Ind++; - { - Kernel DevF{}; - T Val1 = V1[Ind]; - esimd::simd Vv = DevF(Val1, V2); // scalar, vector - V[Ind] = Vv[Ind]; - } - Ind++; - { - for (int J = Ind; J < N; ++J) { - Kernel DevF{}; - T Val2 = V2[J]; - esimd::simd Vv = DevF(V1, Val2); // scalar 2nd arg - V[J] = Vv[J]; - } - } - } else { - Kernel DevF{}; - V = DevF(V1, V2); // vec 2nd arg - } - V.copy_to(Out, Offset); - }; -}; - -// --- Generic test function for an 
extended math operation - -template class Kernel, - typename InitF = InitNarrow> -bool test(queue &Q, const std::string &Name, InitF Init = InitNarrow{}, - float delta = 0.0f) { - - constexpr size_t Size = 1024 * 128; - constexpr bool IsBinOp = (Op == MathOp::div_ieee) || (Op == MathOp::pow); - - T *A = new T[Size]; - T *B = new T[Size]; - T *C = new T[Size]; - if constexpr (IsBinOp) { - Init(A, B, C, Size); - } else { - Init(A, B, Size); - } - const char *kind = - std::is_same_v, ESIMDf> - ? "ESIMD" - : "SYCL"; - std::cout << " " << Name << " test, kind=" << kind << "...\n"; - - try { - buffer BufA(A, range<1>(Size)); - buffer BufB(B, range<1>(Size)); - buffer BufC(C, range<1>(Size)); - - // number of workgroups - sycl::range<1> GlobalRange{Size / N}; - - // threads (workitems) in each workgroup - sycl::range<1> LocalRange{1}; - - auto E = Q.submit([&](handler &CGH) { - auto PA = BufA.template get_access(CGH); - auto PC = BufC.template get_access(CGH); - if constexpr (IsBinOp) { - auto PB = BufB.template get_access(CGH); - BinaryDeviceFunc F(PA, PB, - PC); - CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); - } else { - UnaryDeviceFunc F(PA, PC); - CGH.parallel_for(nd_range<1>{GlobalRange, LocalRange}, F); - } - }); - E.wait(); - } catch (sycl::exception &Exc) { - std::cout << " *** ERROR. SYCL exception caught: << " << Exc.what() - << "\n"; - return false; - } - - int ErrCnt = 0; - - for (unsigned I = 0; I < Size; ++I) { - T Gold; - - if constexpr (IsBinOp) { - Gold = HostFunc{}((T)A[I], (T)B[I]); - } else { - Gold = HostFunc{}((T)A[I]); - } - T Test = C[I]; - - if (delta == 0.0f) { - delta = sizeof(T) > 2 ? 
0.0001 : 0.01; - } - - if (abs(Test - Gold) > delta) { - if (++ErrCnt < 10) { - std::cout << " failed at index " << I << ", " << Test - << " != " << Gold << " (gold)\n"; - } - } - } - delete[] A; - delete[] B; - delete[] C; - - if (ErrCnt > 0) { - std::cout << " pass rate: " - << ((float)(Size - ErrCnt) / (float)Size) * 100.0f << "% (" - << (Size - ErrCnt) << "/" << Size << ")\n"; - } - - std::cout << (ErrCnt > 0 ? " FAILED\n" : " Passed\n"); - return ErrCnt == 0; -} - -// --- Tests all extended math operations with given vector length - -template bool testESIMD(queue &Q) { - bool Pass = true; - - std::cout << "--- TESTING ESIMD functions, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - - Pass &= test(Q, "sqrt", InitWide{}); - Pass &= test(Q, "inv"); - Pass &= test(Q, "rsqrt"); - Pass &= test(Q, "sin", InitTrig{}); - Pass &= test(Q, "cos", InitTrig{}); - Pass &= test(Q, "exp", InitInRange0_5{}); - Pass &= test(Q, "log", InitWide{}); - Pass &= test(Q, "exp2", InitInRange0_5{}); - Pass &= test(Q, "log2", InitWide{}); - Pass &= test(Q, "floor", InitWide{}); - Pass &= test(Q, "ceil", InitWide{}); - Pass &= test(Q, "trunc", InitWide{}); - return Pass; -} - -template bool testESIMDSqrtIEEE(queue &Q) { - bool Pass = true; - std::cout << "--- TESTING ESIMD sqrt_ieee, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - Pass &= test(Q, "sqrt_ieee", InitWide{}); - return Pass; -} - -template bool testESIMDDivIEEE(queue &Q) { - bool Pass = true; - std::cout << "--- TESTING ESIMD div_ieee, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - Pass &= test(Q, "div_ieee", InitBin{}); - return Pass; -} - -template bool testESIMDPow(queue &Q) { - bool Pass = true; - std::cout << "--- TESTING ESIMD pow, T=" << typeid(T).name() << ", N = " << N - << "...\n"; - Pass &= test(Q, "pow", InitBin{}, 0.1); - return Pass; -} - -template bool testSYCL(queue &Q) { - bool Pass = true; - // TODO SYCL currently supports only these 4 functions, extend the test when - // more are 
available. - std::cout << "--- TESTING SYCL functions, T=" << typeid(T).name() - << ", N = " << N << "...\n"; - // SYCL functions will have good accuracy for any argument, unlike bare h/w - // ESIMD versions, so init with "wide" data set. - Pass &= test(Q, "sin", InitWide{}); - Pass &= test(Q, "cos", InitWide{}); - Pass &= test(Q, "exp", InitInRange0_5{}); - Pass &= test(Q, "log", InitWide{}); - return Pass; -} diff --git a/SYCL/ESIMD/ext_math_aspect-fp64.cpp b/SYCL/ESIMD/ext_math_aspect-fp64.cpp index 91bbe27e58..f489cd9ff9 100644 --- a/SYCL/ESIMD/ext_math_aspect-fp64.cpp +++ b/SYCL/ESIMD/ext_math_aspect-fp64.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'half' type // XFAIL: esimd_emulator -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -DENABLE_FP64 %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // This test checks extended math operations. Combinations of @@ -17,29 +17,4 @@ // - math function - sin, cos, ..., div_ieee, pow // - SYCL vs ESIMD APIs -#include "ext_math.hpp" - -using namespace sycl; -using namespace sycl::ext::intel; - -// --- The entry point - -int main(void) { - queue Q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); - if (!Q.get_device().has(sycl::aspect::fp64) { - std::cout << "Skipping test\n"; - return 0; - } - auto Dev = Q.get_device(); - std::cout << "Running on " << Dev.get_info() << "\n"; - bool Pass = true; - - // Not support IEEE-conformant sqrt operations for single precision data. - Pass &= testESIMDSqrtIEEE(Q); - Pass &= testESIMDDivIEEE(Q); - - Pass &= testESIMDSqrtIEEE(Q); - Pass &= testESIMDDivIEEE(Q); - std::cout << (Pass ? "Test Passed\n" : "Test FAILED\n"); - return Pass ? 
0 : 1; -} +#include "ext_math.cpp" diff --git a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp index 6b099b38b0..e8e09ad26d 100644 --- a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp +++ b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp @@ -18,10 +18,16 @@ #include #include +#ifdef ENABLE_FP64 +typedef double fptype; +#else +typedef float fptype; +#endif + #define ABS(x) ((x) >= 0 ? (x) : -(x)) #define MIN(x, y) ((x) <= (y) ? (x) : (y)) #define MAX(x, y) ((x) >= (y) ? (x) : (y)) -#define FP_RAND ((double)rand() / (double)RAND_MAX) +#define FP_RAND ((fptype)rand() / (fptype)RAND_MAX) #define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) #define OUT(text, ...) OUTN(text "\n", ##__VA_ARGS__) @@ -47,7 +53,7 @@ using namespace sycl; using namespace std; using namespace sycl::ext::intel::esimd; -ESIMD_PRIVATE ESIMD_REGISTER(192) simd GRF; +ESIMD_PRIVATE ESIMD_REGISTER(256) simd GRF; #define V(x, w, i) (x).template select(i) #define V1(x, i) V(x, 1, i) @@ -69,7 +75,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { V1(mask, k) = 0; if (ak0[k] != 0.0) { // scal - double temp = 1.0 / ak0[k]; + fptype temp = 1.0 / ak0[k]; ak0.merge(ak0 * temp, mask); for (int i = 8 + K & (-8); i < M; i += 8) { V8(ak, i) *= temp; @@ -98,7 +104,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { V1(mask, k) = 0; if (ak0[k] != 0.0) { // scal - double temp = 1.0 / ak0[k]; + fptype temp = 1.0 / ak0[k]; ak0.merge(ak0 * temp, mask); for (int i = 16 + (K & (-8)) + kk; i < M; i += 8) { V8(ak, i) *= temp; @@ -129,7 +135,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { V1(mask, k) = 0; if (ak0[k] != 0.0) { // scal - double temp = 1.0 / ak0[k]; + fptype temp = 1.0 / ak0[k]; ak0.merge(ak0 * temp, mask); for (int i = 8 + K + kk; i < M; i += 8) { V8(ak, i) *= temp; @@ -160,15 +166,15 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { // L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], // 
T=A[K:M,K:K+N]) - panel to be updated template -ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { +ESIMD_INLINE void dgetrfnp_left_step(fptype *a, int64_t lda, int64_t *info) { auto p1 = V(GRF, M * N, 0); - double *a1; + fptype *a1; int i, j, k; // load P1 for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd data; + simd data; data.copy_from(a1 + i); V8(p1, j * M + i) = data; } @@ -178,10 +184,10 @@ ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { // (gemm) update T=T-L*U for (int kk = 0; kk < K; kk += 8) { simd_mask<8> mask = 1; - simd a0k, aik; + simd a0k, aik; for (k = 0; k < 8 && kk + k < K; k++) { V1(mask, k) = 0; - simd data; + simd data; data.copy_from(a + kk + (kk + k) * lda); V8(a0k, 0) = data; for (j = 0; j < N; j++) { @@ -193,7 +199,7 @@ ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { } for (k = 0; k < 8 && kk + k < K; k++) { for (i = kk + 8; i < M; i += 8) { - simd data; + simd data; data.copy_from(a + i + (kk + k) * lda); V8(aik, 0) = data; for (j = 0; j < N; j++) { @@ -212,19 +218,19 @@ ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { // store P1 for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd vals = V8(p1, j * M + i); + simd vals = V8(p1, j * M + i); vals.copy_to(a1 + i); } } #endif // !USE_REF -ESIMD_INLINE void dgetrfnp_esimd(int64_t m, int64_t n, double *a, int64_t lda, +ESIMD_INLINE void dgetrfnp_esimd(int64_t m, int64_t n, fptype *a, int64_t lda, int64_t *ipiv, int64_t *info) { *info = 0; #if defined(USE_REF) int i, j, k; for (k = 0; k < MIN(m, n); k++) { - double temp = a[k + k * lda]; + fptype temp = a[k + k * lda]; if (!(*info) && temp == 0.0) *info = k + 1; // scal @@ -298,7 +304,7 @@ ESIMD_INLINE void dgetrfnp_esimd(int64_t m, int64_t n, double *a, int64_t lda, #endif // defined(USE_REF) } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double 
*a, int64_t lda, +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info) { @@ -309,11 +315,11 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, CHECK(status = device.is_gpu(), !status); - double *a_gpu; + fptype *a_gpu; int64_t *ipiv_gpu; int64_t *info_gpu; - CHECK(a_gpu = static_cast( - malloc_shared(stride_a * batch * sizeof(double), device, context)), + CHECK(a_gpu = static_cast( + malloc_shared(stride_a * batch * sizeof(fptype), device, context)), !a_gpu); CHECK(ipiv_gpu = static_cast(malloc_shared( stride_ipiv * batch * sizeof(int64_t), device, context)), @@ -322,7 +328,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, malloc_shared(batch * sizeof(int64_t), device, context)), !info_gpu); - memcpy(a_gpu, a, stride_a * batch * sizeof(double)); + memcpy(a_gpu, a, stride_a * batch * sizeof(fptype)); sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, sycl::range<1>{1}); @@ -344,7 +350,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, return; } - memcpy(a, a_gpu, stride_a * batch * sizeof(double)); + memcpy(a, a_gpu, stride_a * batch * sizeof(fptype)); memcpy(ipiv, ipiv_gpu, stride_ipiv * batch * sizeof(int64_t)); memcpy(info, info_gpu, batch * sizeof(int64_t)); @@ -353,14 +359,14 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, free(info_gpu, context); } -static void fp_init(int64_t m, int64_t n, double *a, int64_t lda) { +static void fp_init(int64_t m, int64_t n, fptype *a, int64_t lda) { int64_t i, j; for (j = 0; j < n; j++) for (i = 0; i < m; i++) a[i + j * lda] = 2.0 * FP_RAND - 1.0; } -static void fp_copy(int64_t m, int64_t n, double *a, int64_t lda, double *b, +static void fp_copy(int64_t m, int64_t n, fptype *a, int64_t lda, fptype *b, int64_t ldb) { int64_t i, j; for (j = 0; j < n; j++) @@ -368,8 +374,8 @@ static void 
fp_copy(int64_t m, int64_t n, double *a, int64_t lda, double *b, b[i + j * ldb] = a[i + j * lda]; } -static double fp_norm1(int64_t m, int64_t n, double *a, int64_t lda) { - double sum, value = 0.0; +static fptype fp_norm1(int64_t m, int64_t n, fptype *a, int64_t lda) { + fptype sum, value = 0.0; int64_t i, j; for (j = 0; j < n; j++) { sum = 0.0; @@ -381,28 +387,32 @@ static double fp_norm1(int64_t m, int64_t n, double *a, int64_t lda) { return value; } -static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, - double *a, int64_t lda, +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, fptype *a_in, + fptype *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info) { - double thresh = 30.0; + fptype thresh = 30.0; int fail = 0; int64_t i, j, k, l; char label[1024]; +#ifdef ENABLE_FP64 unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; - double res = 0.0, nrm = 0.0, ulp = *(double *)prec_b; - double *w = (double *)malloc(sizeof(double) * MAX(m * n, 1)); +#else + unsigned char prec_b[] = {0, 0, 0xb0, 0x3c}; +#endif + fptype res = 0.0, nrm = 0.0, ulp = *(fptype *)prec_b; + fptype *w = (fptype *)malloc(sizeof(fptype) * MAX(m * n, 1)); sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); for (k = 0; k < batch; k++) { /* info == 0 */ - CHECK_AND_REPORT("info == 0", label, info[k] != 0, (double)info[k], fail); + CHECK_AND_REPORT("info == 0", label, info[k] != 0, (fptype)info[k], fail); if (m > 0 && n > 0) { /* | L U - A | / ( |A| n ulp ) */ - memset(w, 0, sizeof(double) * m * n); + memset(w, 0, sizeof(fptype) * m * n); if (m < n) { for (j = 0; j < n; j++) for (i = 0; i <= j; i++) @@ -429,7 +439,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, w[i + j * m] -= a_in[k * stride_a + i + j * lda]; res = fp_norm1(m, n, w, m); nrm = fp_norm1(m, n, &a_in[k * stride_a], lda); - nrm *= (double)n * ulp; + nrm *= (fptype)n * ulp; res /= nrm > 0.0 ? 
nrm : ulp; CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, FAILED(res, thresh), res, fail); @@ -440,7 +450,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, return fail; } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info); @@ -458,10 +468,10 @@ int main(int argc, char *argv[]) { int64_t a_count = MAX(stride_a * batch, 1); int64_t ipiv_count = MAX(stride_ipiv * batch, 1); int64_t info_count = MAX(batch, 1); - double *a = NULL, *a_copy = NULL; + fptype *a = NULL, *a_copy = NULL; int64_t *ipiv = NULL, *info = NULL; - CHECK(a = (double *)malloc(sizeof(double) * a_count), !a); - CHECK(a_copy = (double *)malloc(sizeof(double) * a_count), !a_copy); + CHECK(a = (fptype *)malloc(sizeof(fptype) * a_count), !a); + CHECK(a_copy = (fptype *)malloc(sizeof(fptype) * a_count), !a_copy); CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) * ipiv_count), !ipiv); CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); @@ -486,3 +496,4 @@ int main(int argc, char *argv[]) { } return exit_status; } + diff --git a/SYCL/ESIMD/regression/Inputs/dgetrf_aspect-fp64.hpp b/SYCL/ESIMD/regression/Inputs/dgetrf_aspect-fp64.hpp deleted file mode 100644 index f2586f471b..0000000000 --- a/SYCL/ESIMD/regression/Inputs/dgetrf_aspect-fp64.hpp +++ /dev/null @@ -1,487 +0,0 @@ -//==-------------- dgetrf.hpp - DPC++ ESIMD on-device test ----------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// This test checks the correctness of ESIMD program for batched LU -// decomposition without pivoting. 
The program contains multiple branches -// corresponding to LU input sizes; all internal functions are inlined. -// -#include -#include -#include -#include -#include -#include - -#define ABS(x) ((x) >= 0 ? (x) : -(x)) -#define MIN(x, y) ((x) <= (y) ? (x) : (y)) -#define MAX(x, y) ((x) >= (y) ? (x) : (y)) -#define FP_RAND ((double)rand() / (double)RAND_MAX) - -#define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) -#define OUT(text, ...) OUTN(text "\n", ##__VA_ARGS__) - -#define CHECK(cmd, status) \ - do { \ - cmd; \ - if (status) { \ - OUT(#cmd " status: %d", status); \ - exit(1); \ - } \ - } while (0) -#define FAILED(res, thresh) ((res) > (thresh) || (res) != (res)) -#define CHECK_AND_REPORT(test_desc, test_id, fail_cond, res, fail_cnt) \ - do { \ - if (fail_cond) \ - fail_cnt++; \ - OUT("Test (%s): " test_desc ". Result: %f. %s", test_id, res, \ - (fail_cond) ? "FAILED" : "PASSED"); \ - } while (0) - -using namespace cl::sycl; -using namespace std; -using namespace sycl::ext::intel::esimd; - -ESIMD_PRIVATE ESIMD_REGISTER(256) simd GRF; - -#define V(x, w, i) (x).template select(i) -#define V1(x, i) V(x, 1, i) -#define V8(x, i) V(x, 8, i) -#define BCAST8(x, i) (x).template replicate_w<8, 1>(i) - -template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { - auto a = V(GRF, M * N, 0); - - if (K % 8) { - simd_mask<8> mask = 1; - for (int k = 0; k < K % 8; k++) - V1(mask, k) = 0; - - for (int k = K % 8; k < 8 && k < K + N; k++) { - auto ak = V(a, M, (k - K % 8) * M); - auto ak0 = V8(ak, K & (-8)); - - V1(mask, k) = 0; - if (ak0[k] != 0.0) { - // scal - double temp = 1.0 / ak0[k]; - ak0.merge(ak0 * temp, mask); - for (int i = 8 + K & (-8); i < M; i += 8) { - V8(ak, i) *= temp; - } - - // update - for (int j = k - K % 8 + 1; j < N; j++) { - auto aj = V(a, M, j * M); - auto aj0 = V8(aj, K & (-8)); - auto temp = BCAST8(aj0, k); - aj0.merge(aj0 - temp * ak0, aj0, mask); - for (int i = 8 + K & (-8); i < M; i += 8) { - V8(aj, i) -= temp * V8(ak, i); - } - } - } else if 
(*info == 0) { - *info = K + k - K % 8 + 1; - } - } - for (int kk = 0; kk < N + K % 8 - 8; kk += 8) { - mask = 1; - for (int k = 0; k < 8 && kk + k < N + K % 8 - 8; k++) { - auto ak = V(a, M, (kk + k + 8 - K % 8) * M); - auto ak0 = V8(ak, kk + (K & (-8)) + 8); - - V1(mask, k) = 0; - if (ak0[k] != 0.0) { - // scal - double temp = 1.0 / ak0[k]; - ak0.merge(ak0 * temp, mask); - for (int i = 16 + (K & (-8)) + kk; i < M; i += 8) { - V8(ak, i) *= temp; - } - - // update - for (int j = kk + k + 8 - K % 8 + 1; j < N; j++) { - auto aj = V(a, M, j * M); - auto aj0 = V8(aj, kk + (K & (-8)) + 8); - auto temp = BCAST8(aj0, k); - aj0.merge(aj0 - temp * ak0, aj0, mask); - for (int i = 16 + (K & (-8)) + kk; i < M; i += 8) { - V8(aj, i) -= temp * V8(ak, i); - } - } - } else if (*info == 0) { - *info = K + kk + k + 8 - K % 8 + 1; - } - } - } - } else { - for (int kk = 0; kk < N; kk += 8) { - simd_mask<8> mask = 1; - for (int k = 0; k < 8 && kk + k < N; k++) { - auto ak = V(a, M, (kk + k) * M); - auto ak0 = V8(ak, kk + K); - - V1(mask, k) = 0; - if (ak0[k] != 0.0) { - // scal - double temp = 1.0 / ak0[k]; - ak0.merge(ak0 * temp, mask); - for (int i = 8 + K + kk; i < M; i += 8) { - V8(ak, i) *= temp; - } - - // update - for (int j = kk + k + 1; j < N; j++) { - auto aj = V(a, M, j * M); - auto aj0 = V8(aj, kk + K); - auto temp = BCAST8(aj0, k); - aj0.merge(aj0 - temp * ak0, aj0, mask); - for (int i = 8 + K + kk; i < M; i += 8) { - V8(aj, i) -= temp * V8(ak, i); - } - } - } else if (*info == 0) { - *info = K + kk + k + 1; - } - } - } - } -} - -#ifndef USE_REF -// A left-looking algorithm step -// M, N - a panel size to be updated and factorized (M * N <= 64 * 6), must fit -// into GRF K - an update rank P0=A[0:M,0:K] = column(F=A[0:K,0:K], -// L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], -// T=A[K:M,K:K+N]) - panel to be updated -template -ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { - auto p1 = V(GRF, M * N, 0); - double 
*a1; - int i, j, k; - - // load P1 - for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) - for (i = 0; i < M; i += 8) { - simd data; - data.copy_from(a1 + i); - V8(p1, j * M + i) = data; - } - - if (K > 0) { - // (trsm) solve F*X=U for X, X overwrites U - // (gemm) update T=T-L*U - for (int kk = 0; kk < K; kk += 8) { - simd_mask<8> mask = 1; - simd a0k, aik; - for (k = 0; k < 8 && kk + k < K; k++) { - V1(mask, k) = 0; - simd data; - data.copy_from(a + kk + (kk + k) * lda); - V8(a0k, 0) = data; - for (j = 0; j < N; j++) { - auto aj = V(p1, M, j * M); - auto aj0 = V8(aj, kk); - auto temp = BCAST8(aj0, k); - aj0.merge(aj0 - temp * a0k, aj0, mask); - } - } - for (k = 0; k < 8 && kk + k < K; k++) { - for (i = kk + 8; i < M; i += 8) { - simd data; - data.copy_from(a + i + (kk + k) * lda); - V8(aik, 0) = data; - for (j = 0; j < N; j++) { - auto aj = V(p1, M, j * M); - auto aj0 = V8(aj, kk); - auto temp = BCAST8(aj0, k); - V8(aj, i) -= temp * aik; - } - } - } - } - } - // (getrf) factorize T=P*L*U - dgetrfnp_panel(info); - - // store P1 - for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) - for (i = 0; i < M; i += 8) { - simd vals = V8(p1, j * M + i); - vals.copy_to(a1 + i); - } -} -#endif // !USE_REF - -ESIMD_INLINE void dgetrfnp_esimd(int64_t m, int64_t n, double *a, int64_t lda, - int64_t *ipiv, int64_t *info) { - *info = 0; -#if defined(USE_REF) - int i, j, k; - for (k = 0; k < MIN(m, n); k++) { - double temp = a[k + k * lda]; - if (!(*info) && temp == 0.0) - *info = k + 1; - // scal - temp = 1.0 / temp; - for (i = k + 1; i < m; i++) { - a[i + k * lda] *= temp; - } - // update - for (j = k + 1; j < n; j++) { - temp = a[k + j * lda]; - for (i = k + 1; i < m; i++) { - a[i + j * lda] -= temp * a[i + k * lda]; - } - } - } -#else // defined(USE_REF) - if (m == 8) { - if (n == 8) - dgetrfnp_left_step<8, 8, 0>(a, lda, info); - } else if (m == 16) { - if (n == 8) - dgetrfnp_left_step<16, 8, 0>(a, lda, info); - else if (n == 16) - dgetrfnp_left_step<16, 16, 0>(a, lda, info); 
- } else if (m == 32) { - if (n == 8) - dgetrfnp_left_step<32, 8, 0>(a, lda, info); - else if (n == 12) - dgetrfnp_left_step<32, 12, 0>(a, lda, info); - else if (n == 16) { - dgetrfnp_left_step<32, 8, 0>(a, lda, info); - dgetrfnp_left_step<32, 8, 8>(a, lda, info); - } else if (n == 24) { - dgetrfnp_left_step<32, 8, 0>(a, lda, info); - dgetrfnp_left_step<32, 8, 8>(a, lda, info); - dgetrfnp_left_step<32, 8, 16>(a, lda, info); - } else if (n == 32) { - dgetrfnp_left_step<32, 8, 0>(a, lda, info); - dgetrfnp_left_step<32, 8, 8>(a, lda, info); - dgetrfnp_left_step<32, 8, 16>(a, lda, info); - dgetrfnp_left_step<32, 8, 24>(a, lda, info); - } - } else if (m == 64) { - if (n == 6) - dgetrfnp_left_step<64, 6, 0>(a, lda, info); - else if (n == 16) { - dgetrfnp_left_step<64, 6, 0>(a, lda, info); - dgetrfnp_left_step<64, 6, 6>(a, lda, info); - dgetrfnp_left_step<64, 4, 12>(a, lda, info); - } else if (n == 32) { - dgetrfnp_left_step<64, 6, 0>(a, lda, info); - dgetrfnp_left_step<64, 6, 6>(a, lda, info); - dgetrfnp_left_step<64, 6, 12>(a, lda, info); - dgetrfnp_left_step<64, 6, 18>(a, lda, info); - dgetrfnp_left_step<64, 6, 24>(a, lda, info); - dgetrfnp_left_step<64, 2, 30>(a, lda, info); - } else if (n == 64) { - dgetrfnp_left_step<64, 6, 0>(a, lda, info); - dgetrfnp_left_step<64, 6, 6>(a, lda, info); - dgetrfnp_left_step<64, 6, 12>(a, lda, info); - dgetrfnp_left_step<64, 6, 18>(a, lda, info); - dgetrfnp_left_step<64, 6, 24>(a, lda, info); - dgetrfnp_left_step<64, 6, 30>(a, lda, info); - dgetrfnp_left_step<64, 6, 36>(a, lda, info); - dgetrfnp_left_step<64, 6, 42>(a, lda, info); - dgetrfnp_left_step<64, 6, 48>(a, lda, info); - dgetrfnp_left_step<64, 6, 54>(a, lda, info); - dgetrfnp_left_step<64, 4, 60>(a, lda, info); - } - } -#endif // defined(USE_REF) -} - -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info) { - queue queue((gpu_selector())); - auto device = 
queue.get_device(); - auto context = queue.get_context(); - int status; - - CHECK(status = device.is_gpu(), !status); - - double *a_gpu; - int64_t *ipiv_gpu; - int64_t *info_gpu; - CHECK(a_gpu = static_cast( - malloc_shared(stride_a * batch * sizeof(double), device, context)), - !a_gpu); - CHECK(ipiv_gpu = static_cast(malloc_shared( - stride_ipiv * batch * sizeof(int64_t), device, context)), - !ipiv_gpu); - CHECK(info_gpu = static_cast( - malloc_shared(batch * sizeof(int64_t), device, context)), - !info_gpu); - - memcpy(a_gpu, a, stride_a * batch * sizeof(double)); - - sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, - sycl::range<1>{1}); - try { - auto event = queue.submit([&](handler &cgh) { - cgh.parallel_for( - range, [=](nd_item<1> id) SYCL_ESIMD_KERNEL { - int i = id.get_global_id(0); - dgetrfnp_esimd(m, n, &a_gpu[i * stride_a], lda, - &ipiv_gpu[i * stride_ipiv], &info_gpu[i]); - }); - }); - event.wait(); - } catch (const sycl::exception &e) { - std::cout << "*** EXCEPTION caught: " << e.what() << "\n"; - free(a_gpu, context); - free(ipiv_gpu, context); - free(info_gpu, context); - return; - } - - memcpy(a, a_gpu, stride_a * batch * sizeof(double)); - memcpy(ipiv, ipiv_gpu, stride_ipiv * batch * sizeof(int64_t)); - memcpy(info, info_gpu, batch * sizeof(int64_t)); - - free(a_gpu, context); - free(ipiv_gpu, context); - free(info_gpu, context); -} - -static void fp_init(int64_t m, int64_t n, double *a, int64_t lda) { - int64_t i, j; - for (j = 0; j < n; j++) - for (i = 0; i < m; i++) - a[i + j * lda] = 2.0 * FP_RAND - 1.0; -} - -static void fp_copy(int64_t m, int64_t n, double *a, int64_t lda, double *b, - int64_t ldb) { - int64_t i, j; - for (j = 0; j < n; j++) - for (i = 0; i < m; i++) - b[i + j * ldb] = a[i + j * lda]; -} - -static double fp_norm1(int64_t m, int64_t n, double *a, int64_t lda) { - double sum, value = 0.0; - int64_t i, j; - for (j = 0; j < n; j++) { - sum = 0.0; - for (i = 0; i < m; i++) - sum += ABS(a[i + j * lda]); - if (value < 
sum) - value = sum; - } - return value; -} - -static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, - double *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info) { - double thresh = 30.0; - int fail = 0; - int64_t i, j, k, l; - char label[1024]; - unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; - double res = 0.0, nrm = 0.0, ulp = *(double *)prec_b; - double *w = (double *)malloc(sizeof(double) * MAX(m * n, 1)); - - sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); - - for (k = 0; k < batch; k++) { - /* info == 0 */ - CHECK_AND_REPORT("info == 0", label, info[k] != 0, (double)info[k], fail); - - if (m > 0 && n > 0) { - /* | L U - A | / ( |A| n ulp ) */ - memset(w, 0, sizeof(double) * m * n); - if (m < n) { - for (j = 0; j < n; j++) - for (i = 0; i <= j; i++) - w[i + j * m] = a[i + j * lda + k * stride_a]; - for (i = m - 1; i >= 0; i--) - for (j = 0; j < n; j++) - for (l = 0; l < i; l++) - w[i + j * m] += a[i + l * lda + k * stride_a] * w[l + j * m]; - } else { - for (j = 0; j < n; j++) - for (i = j; i < m; i++) - w[i + j * m] = a[i + j * lda + k * stride_a]; - for (j = 0; j < n; j++) - w[j + j * m] = 1.0; - for (j = n - 1; j >= 0; j--) - for (i = 0; i < m; i++) { - w[i + j * m] *= a[j + j * lda + k * stride_a]; - for (l = 0; l < j; l++) - w[i + j * m] += w[i + l * m] * a[l + j * lda + k * stride_a]; - } - } - for (j = 0; j < n; j++) - for (i = 0; i < m; i++) - w[i + j * m] -= a_in[k * stride_a + i + j * lda]; - res = fp_norm1(m, n, w, m); - nrm = fp_norm1(m, n, &a_in[k * stride_a], lda); - nrm *= (double)n * ulp; - res /= nrm > 0.0 ? 
nrm : ulp; - CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, - FAILED(res, thresh), res, fail); - } - } - - free(w); - return fail; -} - -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info); - -int main(int argc, char *argv[]) { - int exit_status = 0; - int64_t m = 64, n = 64, lda = 64; - int64_t stride_a = lda * n, stride_ipiv = n; - - srand(1); - - for (int i = 1; i < argc; i++) { - int64_t batch = (int64_t)atoi(argv[i]); - batch = MAX(batch, 0); - int64_t a_count = MAX(stride_a * batch, 1); - int64_t ipiv_count = MAX(stride_ipiv * batch, 1); - int64_t info_count = MAX(batch, 1); - double *a = NULL, *a_copy = NULL; - int64_t *ipiv = NULL, *info = NULL; - CHECK(a = (double *)malloc(sizeof(double) * a_count), !a); - CHECK(a_copy = (double *)malloc(sizeof(double) * a_count), !a_copy); - CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) * ipiv_count), !ipiv); - CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); - - /* Initialize input data */ - for (int64_t k = 0; k < batch; k++) { - fp_init(m, n, &a_copy[k * stride_a], lda); - fp_copy(m, n, &a_copy[k * stride_a], lda, &a[k * stride_a], lda); - } - - /* Run the tested function */ - dgetrfnp_batch_strided_c(m, n, a, lda, stride_a, ipiv, stride_ipiv, batch, - info); - - /* Check that the computation completed successfully */ - exit_status += dgetrfnp_batch_strided_check(m, n, a_copy, a, lda, stride_a, - ipiv, stride_ipiv, batch, info); - - free(a); - free(a_copy); - free(ipiv); - free(info); - } - return exit_status; -} diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index e4c2e04514..a088516d28 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -20,10 +20,16 @@ #include #include +#ifdef ENABLE_FP64 +typedef double fptype; +#else +typedef float fptype; +#endif + #define ABS(x) ((x) >= 0 ? 
(x) : -(x)) #define MIN(x, y) ((x) <= (y) ? (x) : (y)) #define MAX(x, y) ((x) >= (y) ? (x) : (y)) -#define FP_RAND ((float)rand() / (float)RAND_MAX) +#define FP_RAND ((fptype)rand() / (fptype)RAND_MAX) #define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) #define OUT(text, ...) OUTN(text "\n", ##__VA_ARGS__) @@ -49,7 +55,7 @@ using namespace sycl; using namespace std; using namespace sycl::ext::intel::esimd; -ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; +ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; #define V(x, w, i) (x).template select(i) #define V1(x, i) V(x, 1, i) @@ -67,7 +73,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { V1(mask, k) = 0; if (ak0[k] != 0.0) { // scal - float temp = 1.0 / ak0[k]; + fptype temp = 1.0 / ak0[k]; ak0.merge(ak0 * temp, mask); for (int i = 8 + K + kk; i < M; i += 8) { V8(ak, i) *= temp; @@ -96,15 +102,15 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { // L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], // T=A[K:M,K:K+N]) - panel to be updated template -ESIMD_INLINE void dgetrfnp_left_step(float *a, int64_t lda, int64_t *info) { +ESIMD_INLINE void dgetrfnp_left_step(fptype *a, int64_t lda, int64_t *info) { auto p1 = V(GRF, M * N, 0); - float *a1; + fptype *a1; int i, j, k; // load P1 for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd data; + simd data; data.copy_from(a1 + i); V8(p1, j * M + i) = data; } @@ -114,18 +120,18 @@ ESIMD_INLINE void dgetrfnp_left_step(float *a, int64_t lda, int64_t *info) { // store P1 for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd vals = V8(p1, j * M + i); + simd vals = V8(p1, j * M + i); vals.copy_to(a1 + i); } } -ESIMD_INLINE void dgetrfnp_esimd_8x8(float *a, int64_t lda, int64_t *ipiv, +ESIMD_INLINE void dgetrfnp_esimd_8x8(fptype *a, int64_t lda, int64_t *ipiv, int64_t *info) { *info = 0; dgetrfnp_left_step<8, 8, 0>(a, lda, info); } -void 
dgetrfnp_batch_strided_c(int64_t m, int64_t n, float *a, int64_t lda, +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info) { @@ -136,11 +142,11 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, float *a, int64_t lda, CHECK(status = device.is_gpu(), !status); - float *a_gpu; + fptype *a_gpu; int64_t *ipiv_gpu; int64_t *info_gpu; - CHECK(a_gpu = static_cast( - malloc_shared(stride_a * batch * sizeof(float), device, context)), + CHECK(a_gpu = static_cast( + malloc_shared(stride_a * batch * sizeof(fptype), device, context)), !a_gpu); CHECK(ipiv_gpu = static_cast(malloc_shared( stride_ipiv * batch * sizeof(int64_t), device, context)), @@ -149,7 +155,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, float *a, int64_t lda, malloc_shared(batch * sizeof(int64_t), device, context)), !info_gpu); - memcpy(a_gpu, a, stride_a * batch * sizeof(float)); + memcpy(a_gpu, a, stride_a * batch * sizeof(fptype)); sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, sycl::range<1>{1}); @@ -171,7 +177,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, float *a, int64_t lda, return; } - memcpy(a, a_gpu, stride_a * batch * sizeof(float)); + memcpy(a, a_gpu, stride_a * batch * sizeof(fptype)); memcpy(ipiv, ipiv_gpu, stride_ipiv * batch * sizeof(int64_t)); memcpy(info, info_gpu, batch * sizeof(int64_t)); @@ -180,14 +186,14 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, float *a, int64_t lda, free(info_gpu, context); } -static void fp_init(int64_t m, int64_t n, float *a, int64_t lda) { +static void fp_init(int64_t m, int64_t n, fptype *a, int64_t lda) { int64_t i, j; for (j = 0; j < n; j++) for (i = 0; i < m; i++) a[i + j * lda] = 2.0 * FP_RAND - 1.0; } -static void fp_copy(int64_t m, int64_t n, float *a, int64_t lda, float *b, +static void fp_copy(int64_t m, int64_t n, fptype *a, int64_t lda, fptype *b, int64_t ldb) { int64_t i, j; for (j = 0; j < n; 
j++) @@ -195,8 +201,8 @@ static void fp_copy(int64_t m, int64_t n, float *a, int64_t lda, float *b, b[i + j * ldb] = a[i + j * lda]; } -static float fp_norm1(int64_t m, int64_t n, float *a, int64_t lda) { - float sum, value = 0.0; +static fptype fp_norm1(int64_t m, int64_t n, fptype *a, int64_t lda) { + fptype sum, value = 0.0; int64_t i, j; for (j = 0; j < n; j++) { sum = 0.0; @@ -208,27 +214,32 @@ static float fp_norm1(int64_t m, int64_t n, float *a, int64_t lda) { return value; } -static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, float *a_in, - float *a, int64_t lda, int64_t stride_a, - int64_t *ipiv, int64_t stride_ipiv, - int64_t batch, int64_t *info) { - float thresh = 30.0; +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, fptype *a_in, + fptype *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info) { + fptype thresh = 30.0; int fail = 0; int64_t i, j, k, l; char label[1024]; +#ifdef ENABLE_FP64 + unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; +#else unsigned char prec_b[] = {0, 0, 0xb0, 0x3c}; - float res = 0.0, nrm = 0.0, ulp = *(float *)prec_b; - float *w = (float *)malloc(sizeof(float) * MAX(m * n, 1)); +#endif + fptype res = 0.0, nrm = 0.0, ulp = *(fptype *)prec_b; + fptype *w = (fptype *)malloc(sizeof(fptype) * MAX(m * n, 1)); sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); for (k = 0; k < batch; k++) { /* info == 0 */ - CHECK_AND_REPORT("info == 0", label, info[k] != 0, (float)info[k], fail); + CHECK_AND_REPORT("info == 0", label, info[k] != 0, (fptype)info[k], fail); if (m > 0 && n > 0) { /* | L U - A | / ( |A| n ulp ) */ - memset(w, 0, sizeof(float) * m * n); + memset(w, 0, sizeof(fptype) * m * n); if (m < n) { for (j = 0; j < n; j++) for (i = 0; i <= j; i++) @@ -255,7 +266,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, float *a_in, w[i + j * m] -= a_in[k * stride_a + i + j * lda]; res = fp_norm1(m, n, w, m); nrm = 
fp_norm1(m, n, &a_in[k * stride_a], lda); - nrm *= (float)n * ulp; + nrm *= (fptype)n * ulp; res /= nrm > 0.0 ? nrm : ulp; CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, FAILED(res, thresh), res, fail); @@ -266,7 +277,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, float *a_in, return fail; } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, float *a, int64_t lda, +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info); @@ -284,10 +295,10 @@ int main(int argc, char *argv[]) { int64_t a_count = MAX(stride_a * batch, 1); int64_t ipiv_count = MAX(stride_ipiv * batch, 1); int64_t info_count = MAX(batch, 1); - float *a = NULL, *a_copy = NULL; + fptype *a = NULL, *a_copy = NULL; int64_t *ipiv = NULL, *info = NULL; - CHECK(a = (float *)malloc(sizeof(float) * a_count), !a); - CHECK(a_copy = (float *)malloc(sizeof(float) * a_count), !a_copy); + CHECK(a = (fptype *)malloc(sizeof(fptype) * a_count), !a); + CHECK(a_copy = (fptype *)malloc(sizeof(fptype) * a_count), !a_copy); CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) * ipiv_count), !ipiv); CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); @@ -312,3 +323,4 @@ int main(int argc, char *argv[]) { } return exit_status; } + diff --git a/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp index 4b695a4877..9b2a382019 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp @@ -7,308 +7,9 @@ //===----------------------------------------------------------------------===// // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip -// RUN: %clangxx -fsycl %s -I%S/.. -o %t.out +// RUN: %clangxx -fsycl -DENABLE_FP64 %s -I%S/.. -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out 1 // // Reduced version of dgetrf.cpp - M = 8, N = 8, single batch. 
// -#include -#include -#include -#include -#include -#include - -#define ABS(x) ((x) >= 0 ? (x) : -(x)) -#define MIN(x, y) ((x) <= (y) ? (x) : (y)) -#define MAX(x, y) ((x) >= (y) ? (x) : (y)) -#define FP_RAND ((double)rand() / (double)RAND_MAX) - -#define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) -#define OUT(text, ...) OUTN(text "\n", ##__VA_ARGS__) - -#define CHECK(cmd, status) \ - do { \ - cmd; \ - if (status) { \ - OUT(#cmd " status: %d", status); \ - exit(1); \ - } \ - } while (0) -#define FAILED(res, thresh) ((res) > (thresh) || (res) != (res)) -#define CHECK_AND_REPORT(test_desc, test_id, fail_cond, res, fail_cnt) \ - do { \ - if (fail_cond) \ - fail_cnt++; \ - OUT("Test (%s): " test_desc ". Result: %f. %s", test_id, res, \ - (fail_cond) ? "FAILED" : "PASSED"); \ - } while (0) - -using namespace cl::sycl; -using namespace std; -using namespace sycl::ext::intel::esimd; - -ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; - -#define V(x, w, i) (x).template select(i) -#define V1(x, i) V(x, 1, i) -#define V8(x, i) V(x, 8, i) -#define BCAST8(x, i) (x).template replicate_w<8, 1>(i) - -template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { - auto a = V(GRF, M * N, 0); - for (int kk = 0; kk < N; kk += 8) { - simd_mask<8> mask = 1; - for (int k = 0; k < 8 && kk + k < N; k++) { - auto ak = V(a, M, (kk + k) * M); - auto ak0 = V8(ak, kk + K); - - V1(mask, k) = 0; - if (ak0[k] != 0.0) { - // scal - double temp = 1.0 / ak0[k]; - ak0.merge(ak0 * temp, mask); - for (int i = 8 + K + kk; i < M; i += 8) { - V8(ak, i) *= temp; - } - - // update - for (int j = kk + k + 1; j < N; j++) { - auto aj = V(a, M, j * M); - auto aj0 = V8(aj, kk + K); - auto temp = BCAST8(aj0, k); - aj0.merge(aj0 - temp * ak0, aj0, mask); - for (int i = 8 + K + kk; i < M; i += 8) { - V8(aj, i) -= temp * V8(ak, i); - } - } - } else if (*info == 0) { - *info = K + kk + k + 1; - } - } - } -} - -// A left-looking algorithm step -// M, N - a panel size to be updated and factorized (M * N <= 64 * 6), 
must fit -// into GRF K - an update rank P0=A[0:M,0:K] = column(F=A[0:K,0:K], -// L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], -// T=A[K:M,K:K+N]) - panel to be updated -template -ESIMD_INLINE void dgetrfnp_left_step(double *a, int64_t lda, int64_t *info) { - auto p1 = V(GRF, M * N, 0); - double *a1; - int i, j, k; - - // load P1 - for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) - for (i = 0; i < M; i += 8) { - simd data; - data.copy_from(a1 + i); - V8(p1, j * M + i) = data; - } - // (getrf) factorize T=P*L*U - dgetrfnp_panel(info); - - // store P1 - for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) - for (i = 0; i < M; i += 8) { - simd vals = V8(p1, j * M + i); - vals.copy_to(a1 + i); - } -} - -ESIMD_INLINE void dgetrfnp_esimd_8x8(double *a, int64_t lda, int64_t *ipiv, - int64_t *info) { - *info = 0; - dgetrfnp_left_step<8, 8, 0>(a, lda, info); -} - -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info) { - queue queue((gpu_selector())); - auto device = queue.get_device(); - auto context = queue.get_context(); - int status; - - CHECK(status = device.is_gpu(), !status); - - double *a_gpu; - int64_t *ipiv_gpu; - int64_t *info_gpu; - CHECK(a_gpu = static_cast( - malloc_shared(stride_a * batch * sizeof(double), device, context)), - !a_gpu); - CHECK(ipiv_gpu = static_cast(malloc_shared( - stride_ipiv * batch * sizeof(int64_t), device, context)), - !ipiv_gpu); - CHECK(info_gpu = static_cast( - malloc_shared(batch * sizeof(int64_t), device, context)), - !info_gpu); - - memcpy(a_gpu, a, stride_a * batch * sizeof(double)); - - sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, - sycl::range<1>{1}); - try { - auto event = queue.submit([&](handler &cgh) { - cgh.parallel_for( - range, [=](nd_item<1> id) SYCL_ESIMD_KERNEL { - int i = id.get_global_id(0); - dgetrfnp_esimd_8x8(&a_gpu[i * stride_a], lda, - &ipiv_gpu[i * 
stride_ipiv], &info_gpu[i]); - }); - }); - event.wait(); - } catch (const sycl::exception &e) { - std::cout << "*** EXCEPTION caught: " << e.what() << "\n"; - free(a_gpu, context); - free(ipiv_gpu, context); - free(info_gpu, context); - return; - } - - memcpy(a, a_gpu, stride_a * batch * sizeof(double)); - memcpy(ipiv, ipiv_gpu, stride_ipiv * batch * sizeof(int64_t)); - memcpy(info, info_gpu, batch * sizeof(int64_t)); - - free(a_gpu, context); - free(ipiv_gpu, context); - free(info_gpu, context); -} - -static void fp_init(int64_t m, int64_t n, double *a, int64_t lda) { - int64_t i, j; - for (j = 0; j < n; j++) - for (i = 0; i < m; i++) - a[i + j * lda] = 2.0 * FP_RAND - 1.0; -} - -static void fp_copy(int64_t m, int64_t n, double *a, int64_t lda, double *b, - int64_t ldb) { - int64_t i, j; - for (j = 0; j < n; j++) - for (i = 0; i < m; i++) - b[i + j * ldb] = a[i + j * lda]; -} - -static double fp_norm1(int64_t m, int64_t n, double *a, int64_t lda) { - double sum, value = 0.0; - int64_t i, j; - for (j = 0; j < n; j++) { - sum = 0.0; - for (i = 0; i < m; i++) - sum += ABS(a[i + j * lda]); - if (value < sum) - value = sum; - } - return value; -} - -static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, double *a_in, - double *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info) { - double thresh = 30.0; - int fail = 0; - int64_t i, j, k, l; - char label[1024]; - unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; - double res = 0.0, nrm = 0.0, ulp = *(double *)prec_b; - double *w = (double *)malloc(sizeof(double) * MAX(m * n, 1)); - - sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); - - for (k = 0; k < batch; k++) { - /* info == 0 */ - CHECK_AND_REPORT("info == 0", label, info[k] != 0, (double)info[k], fail); - - if (m > 0 && n > 0) { - /* | L U - A | / ( |A| n ulp ) */ - memset(w, 0, sizeof(double) * m * n); - if (m < n) { - for (j = 0; j < n; j++) - for (i = 0; i <= j; i++) - w[i + 
j * m] = a[i + j * lda + k * stride_a]; - for (i = m - 1; i >= 0; i--) - for (j = 0; j < n; j++) - for (l = 0; l < i; l++) - w[i + j * m] += a[i + l * lda + k * stride_a] * w[l + j * m]; - } else { - for (j = 0; j < n; j++) - for (i = j; i < m; i++) - w[i + j * m] = a[i + j * lda + k * stride_a]; - for (j = 0; j < n; j++) - w[j + j * m] = 1.0; - for (j = n - 1; j >= 0; j--) - for (i = 0; i < m; i++) { - w[i + j * m] *= a[j + j * lda + k * stride_a]; - for (l = 0; l < j; l++) - w[i + j * m] += w[i + l * m] * a[l + j * lda + k * stride_a]; - } - } - for (j = 0; j < n; j++) - for (i = 0; i < m; i++) - w[i + j * m] -= a_in[k * stride_a + i + j * lda]; - res = fp_norm1(m, n, w, m); - nrm = fp_norm1(m, n, &a_in[k * stride_a], lda); - nrm *= (double)n * ulp; - res /= nrm > 0.0 ? nrm : ulp; - CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, - FAILED(res, thresh), res, fail); - } - } - - free(w); - return fail; -} - -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, double *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info); - -int main(int argc, char *argv[]) { - int exit_status = 0; - constexpr int64_t m = 8, n = 8, lda = 8; - int64_t stride_a = lda * n, stride_ipiv = n; - - srand(1); - - for (int i = 1; i < argc; i++) { - int64_t batch = (int64_t)atoi(argv[i]); - batch = MAX(batch, 0); - int64_t a_count = MAX(stride_a * batch, 1); - int64_t ipiv_count = MAX(stride_ipiv * batch, 1); - int64_t info_count = MAX(batch, 1); - double *a = NULL, *a_copy = NULL; - int64_t *ipiv = NULL, *info = NULL; - CHECK(a = (double *)malloc(sizeof(double) * a_count), !a); - CHECK(a_copy = (double *)malloc(sizeof(double) * a_count), !a_copy); - CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) * ipiv_count), !ipiv); - CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); - - /* Initialize input data */ - for (int64_t k = 0; k < batch; k++) { - fp_init(m, n, &a_copy[k * stride_a], lda); - fp_copy(m, n, &a_copy[k * 
stride_a], lda, &a[k * stride_a], lda); - } - - /* Run the tested function */ - dgetrfnp_batch_strided_c(m, n, a, lda, stride_a, ipiv, stride_ipiv, batch, - info); - - /* Check that the computation completed successfully */ - exit_status += dgetrfnp_batch_strided_check(m, n, a_copy, a, lda, stride_a, - ipiv, stride_ipiv, batch, info); - - free(a); - free(a_copy); - free(ipiv); - free(info); - } - return exit_status; -} +#include "dgetrf_8x8.cpp" diff --git a/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp index c3dfece33c..2350f9dbff 100644 --- a/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp +++ b/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp @@ -8,7 +8,7 @@ // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip -// RUN: %clangxx -fsycl %s -I%S/.. -o %t.out +// RUN: %clangxx -fsycl -DENABLE_FP64 %s -I%S/.. -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out 3 2 1 // // This test checks the correctness of ESIMD program for batched LU @@ -16,4 +16,4 @@ // corresponding to LU input sizes; all internal functions are inlined. // -#include "Inputs/dgetrf_aspect-fp64.hpp" +#include "Inputs/dgetrf.hpp" diff --git a/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp index 5b11ff3beb..83d152d6dc 100644 --- a/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp +++ b/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp @@ -8,7 +8,7 @@ // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip -// RUN: %clangxx -fsycl -DUSE_REF %s -I%S/.. -o %t.ref.out +// RUN: %clangxx -fsycl -DUSE_REF -DENABLE_FP64 %s -I%S/.. 
-o %t.ref.out // RUN: %GPU_RUN_PLACEHOLDER %t.ref.out 3 2 1 // // This test checks the correctness of ESIMD program for batched LU diff --git a/SYCL/GroupAlgorithm/SYCL2020/sort.cpp b/SYCL/GroupAlgorithm/SYCL2020/sort.cpp index b7fe34ed3f..56eaf4e963 100644 --- a/SYCL/GroupAlgorithm/SYCL2020/sort.cpp +++ b/SYCL/GroupAlgorithm/SYCL2020/sort.cpp @@ -3,10 +3,349 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "sort.hpp" +#include "support.h" +#include + +#include +#include +#include +#include namespace oneapi_exp = sycl::ext::oneapi::experimental; +auto async_handler_ = [](sycl::exception_list ex_list) { + for (auto &ex : ex_list) { + try { + std::rethrow_exception(ex); + } catch (sycl::exception &ex) { + std::cerr << ex.what() << std::endl; + std::exit(EXIT_FAILURE); + } + } +}; + +constexpr uint32_t items_per_work_item = 4; + +struct CustomType { + int x; +}; + +struct CustomFunctor { + bool operator()(const CustomType &lhs, const CustomType &rhs) const { + return lhs.x < rhs.x; + } +}; + +template bool check(T lhs, T rhs, float epsilon) { + return sycl::abs(lhs - rhs) > epsilon; +} +bool check(CustomType lhs, CustomType rhs, float epsilon) { + return sycl::abs(lhs.x - rhs.x) > epsilon; +} + +template +bool verify(T *expected, T *got, std::size_t n, float epsilon) { + for (std::size_t i = 0; i < n; ++i) { + if (check(expected[i], got[i], epsilon)) { + return false; + } + } + return true; +} + +// forward declared classes to name kernels +template class sort_over_group_kernel_name; +template class joint_sort_kernel_name; +template class custom_sorter_kernel_name; + +// this class is needed to pass dimension value to aforementioned classes +template class int_wrapper; + +// custom sorter +template struct bubble_sorter { + Compare comp; + size_t idx; + + template + void operator()(Group g, Ptr begin, Ptr end) { + size_t n = end - begin; + if (idx == 0) + for (size_t i = 0; i < n; ++i) + for (size_t j = i + 1; j < n; ++j) + if 
(comp(begin[j], begin[i])) + std::swap(begin[i], begin[j]); + } +}; + +template sycl::range get_range(const std::size_t local); + +template <> sycl::range<1> get_range<1>(const std::size_t local) { + return sycl::range<1>(local); +} + +template <> sycl::range<2> get_range<2>(const std::size_t local) { + return sycl::range<2>(local, 1); +} + +template <> sycl::range<3> get_range<3>(const std::size_t local) { + return sycl::range<3>(local, 1, 1); +} + +template +int test_sort_over_group(sycl::queue &q, std::size_t local, + sycl::buffer &bufI1, Compare comp, int test_case) { + auto n = bufI1.size(); + if (n > local) + return -1; + + sycl::range local_range = get_range(local); + + std::size_t local_memory_size = + oneapi_exp::default_sorter<>::memory_required( + sycl::memory_scope::work_group, local_range); + + if (local_memory_size > + q.get_device().template get_info()) + std::cout << "local_memory_size = " << local_memory_size << ", available = " + << q.get_device() + .template get_info() + << std::endl; + q.submit([&](sycl::handler &h) { + auto aI1 = sycl::accessor(bufI1, h); + sycl::accessor + scratch({local_memory_size}, h); + + h.parallel_for, T, Compare>>( + sycl::nd_range(local_range, local_range), + [=](sycl::nd_item id) { + scratch[0] = std::byte{}; + auto local_id = id.get_local_linear_id(); + switch (test_case) { + case 0: + if constexpr (std::is_same_v> && + !std::is_same_v) + aI1[local_id] = oneapi_exp::sort_over_group( + oneapi_exp::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + aI1[local_id]); + break; + case 1: + aI1[local_id] = oneapi_exp::sort_over_group( + oneapi_exp::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + aI1[local_id], comp); + break; + case 2: + aI1[local_id] = oneapi_exp::sort_over_group( + id.get_group(), aI1[local_id], + oneapi_exp::default_sorter( + sycl::span{&scratch[0], local_memory_size})); + break; + } + }); + }).wait_and_throw(); + return 1; 
+} + +template +int test_joint_sort(sycl::queue &q, std::size_t n_items, std::size_t local, + sycl::buffer &bufI1, Compare comp, int test_case) { + auto n = bufI1.size(); + auto n_groups = (n - 1) / n_items + 1; + + std::size_t local_memory_size = + oneapi_exp::default_sorter<>::memory_required( + sycl::memory_scope::work_group, n); + if (local_memory_size > + q.get_device().template get_info()) + std::cout << "local_memory_size = " << local_memory_size << ", available = " + << q.get_device() + .template get_info() + << std::endl; + q.submit([&](sycl::handler &h) { + auto aI1 = sycl::accessor(bufI1, h); + sycl::accessor + scratch({local_memory_size}, h); + + h.parallel_for>( + sycl::nd_range<1>{{n_groups * local}, {local}}, + [=](sycl::nd_item<1> id) { + auto group_id = id.get_group(0); + auto ptr_keys = &aI1[group_id * n_items]; + // Replacing the line above with the line below also works + // auto ptr_keys = aI1.get_pointer() + group_id * n_items; + + scratch[0] = std::byte{}; + switch (test_case) { + case 0: + if constexpr (std::is_same_v> && + !std::is_same_v) + oneapi_exp::joint_sort( + oneapi_exp::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + ptr_keys, + ptr_keys + sycl::min(n_items, n - group_id * n_items)); + break; + case 1: + oneapi_exp::joint_sort( + oneapi_exp::group_with_scratchpad( + id.get_group(), + sycl::span{&scratch[0], local_memory_size}), + ptr_keys, + ptr_keys + sycl::min(n_items, n - group_id * n_items), comp); + break; + case 2: + oneapi_exp::joint_sort( + id.get_group(), ptr_keys, + ptr_keys + sycl::min(n_items, n - group_id * n_items), + oneapi_exp::default_sorter( + sycl::span{&scratch[0], local_memory_size})); + break; + } + }); + }).wait_and_throw(); + return n_groups; +} + +template +int test_custom_sorter(sycl::queue &q, sycl::buffer &bufI1, Compare comp) { + std::size_t local = 4; + auto n = bufI1.size(); + if (n > local) + return -1; + local = std::min(local, n); + + 
q.submit([&](sycl::handler &h) { + auto aI1 = sycl::accessor(bufI1, h); + + h.parallel_for>( + sycl::nd_range<2>({local, 1}, {local, 1}), [=](sycl::nd_item<2> id) { + auto ptr = aI1.get_pointer(); + + oneapi_exp::joint_sort( + id.get_group(), ptr, ptr + n, + bubble_sorter{comp, id.get_local_linear_id()}); + }); + }).wait_and_throw(); + return 1; +} + +template +void run_sort(sycl::queue &q, std::vector &in, std::size_t size, + Compare comp, int test_case, int sort_case) { + std::vector in2(in.begin(), in.begin() + size); + std::vector expected(in.begin(), in.begin() + size); + constexpr size_t work_size_limit = 4; + std::size_t local = std::min( + work_size_limit, + q.get_device() + .template get_info()); + local = std::min(local, size); + auto n_items = items_per_work_item * local; + + int n_groups = 1; + { // scope to destruct buffers + sycl::buffer bufKeys(in2.data(), size); + { + switch (sort_case) { + case 0: + // this case is just to check the compilation + n_groups = test_sort_over_group<1>(q, local, bufKeys, comp, test_case); + + n_groups = test_sort_over_group<2>(q, local, bufKeys, comp, test_case); + break; + case 1: + n_groups = test_joint_sort(q, n_items, local, bufKeys, comp, test_case); + break; + case 2: + n_groups = test_custom_sorter(q, bufKeys, comp); + break; + } + } + } + + // check results + for (int i_group = 0; i_group < n_groups; ++i_group) { + std::sort(expected.begin() + i_group * n_items, + expected.begin() + std::min((i_group + 1) * n_items, size), comp); + } + if (n_groups != -1 && + (test_case != 0 || + test_case == 0 && std::is_same_v> && + !std::is_same_v)&&!verify(expected.data(), in2.data(), + size, 0.001f)) { + std::cerr << "Verification failed \n"; + exit(1); + } +} + +template struct test_sort_cases { + template + void operator()(sycl::queue &q, std::size_t dataSize, Compare comp, + Generator generate) { + std::vector stationaryData(dataSize); + // fill data + for (std::size_t i = 0; i < dataSize; ++i) + stationaryData[i] = 
generate(i); + + // run test + for (int test_case = 0; test_case < 3; ++test_case) { + for (int sort_case = 0; sort_case < 3; ++sort_case) { + run_sort(q, stationaryData, dataSize, comp, test_case, sort_case); + } + } + } +}; + +void test_custom_type(sycl::queue &q, std::size_t dataSize) { + std::vector stationaryData(dataSize, CustomType{0}); + // fill data + for (std::size_t i = 0; i < dataSize; ++i) + stationaryData[i] = CustomType{int(i)}; + + // run test + for (int test_case = 0; test_case < 1; ++test_case) { + for (int sort_case = 0; sort_case < 3; ++sort_case) { + run_sort(q, stationaryData, dataSize, CustomFunctor{}, test_case, + sort_case); + } + } +} + +template +void test_sort_by_comp(sycl::queue &q, std::size_t dataSize) { + std::default_random_engine generator; + std::normal_distribution distribution((10.0), (2.0)); + + T max_size = std::numeric_limits::max(); + std::size_t to_fill = dataSize; + if (dataSize > max_size) + to_fill = max_size; + + // reversed order + test_sort_cases()(q, to_fill, Compare{}, + [to_fill](std::size_t i) { return T(to_fill - i - 1); }); + // filled by 1 + test_sort_cases()(q, dataSize, Compare{}, + [](std::size_t) { return T(1); }); + // random distribution + test_sort_cases()(q, dataSize, Compare{}, + [&distribution, &generator](std::size_t) { + return T(distribution(generator)); + }); +} + +template +void test_sort_by_type(sycl::queue &q, std::size_t dataSize) { + test_sort_by_comp>(q, dataSize); + test_sort_by_comp>(q, dataSize); +} + int main(int argc, char *argv[]) { sycl::queue q(sycl::default_selector{}, async_handler_); if (!isSupportedDevice(q.get_device())) { @@ -24,8 +363,10 @@ int main(int argc, char *argv[]) { test_sort_by_type(q, sizes[i]); test_sort_by_type(q, sizes[i]); test_sort_by_type(q, sizes[i]); - test_custom_type(q, sizes[i]); +#ifdef ENABLE_FP64 + test_sort_by_type(q, sizes[i]); +#endif } std::cout << "Test passed." 
<< std::endl; } diff --git a/SYCL/GroupAlgorithm/SYCL2020/sort.hpp b/SYCL/GroupAlgorithm/SYCL2020/sort.hpp deleted file mode 100644 index c6fce86fff..0000000000 --- a/SYCL/GroupAlgorithm/SYCL2020/sort.hpp +++ /dev/null @@ -1,342 +0,0 @@ -#include "support.h" -#include - -#include -#include -#include -#include - -namespace oneapi_exp = sycl::ext::oneapi::experimental; - -auto async_handler_ = [](sycl::exception_list ex_list) { - for (auto &ex : ex_list) { - try { - std::rethrow_exception(ex); - } catch (sycl::exception &ex) { - std::cerr << ex.what() << std::endl; - std::exit(EXIT_FAILURE); - } - } -}; - -constexpr uint32_t items_per_work_item = 4; - -struct CustomType { - int x; -}; - -struct CustomFunctor { - bool operator()(const CustomType &lhs, const CustomType &rhs) const { - return lhs.x < rhs.x; - } -}; - -template bool check(T lhs, T rhs, float epsilon) { - return sycl::abs(lhs - rhs) > epsilon; -} -bool check(CustomType lhs, CustomType rhs, float epsilon) { - return sycl::abs(lhs.x - rhs.x) > epsilon; -} - -template -bool verify(T *expected, T *got, std::size_t n, float epsilon) { - for (std::size_t i = 0; i < n; ++i) { - if (check(expected[i], got[i], epsilon)) { - return false; - } - } - return true; -} - -// forward declared classes to name kernels -template class sort_over_group_kernel_name; -template class joint_sort_kernel_name; -template class custom_sorter_kernel_name; - -// this class is needed to pass dimension value to aforementioned classes -template class int_wrapper; - -// custom sorter -template struct bubble_sorter { - Compare comp; - size_t idx; - - template - void operator()(Group g, Ptr begin, Ptr end) { - size_t n = end - begin; - if (idx == 0) - for (size_t i = 0; i < n; ++i) - for (size_t j = i + 1; j < n; ++j) - if (comp(begin[j], begin[i])) - std::swap(begin[i], begin[j]); - } -}; - -template sycl::range get_range(const std::size_t local); - -template <> sycl::range<1> get_range<1>(const std::size_t local) { - return 
sycl::range<1>(local); -} - -template <> sycl::range<2> get_range<2>(const std::size_t local) { - return sycl::range<2>(local, 1); -} - -template <> sycl::range<3> get_range<3>(const std::size_t local) { - return sycl::range<3>(local, 1, 1); -} - -template -int test_sort_over_group(sycl::queue &q, std::size_t local, - sycl::buffer &bufI1, Compare comp, int test_case) { - auto n = bufI1.size(); - if (n > local) - return -1; - - sycl::range local_range = get_range(local); - - std::size_t local_memory_size = - oneapi_exp::default_sorter<>::memory_required( - sycl::memory_scope::work_group, local_range); - - if (local_memory_size > - q.get_device().template get_info()) - std::cout << "local_memory_size = " << local_memory_size << ", available = " - << q.get_device() - .template get_info() - << std::endl; - q.submit([&](sycl::handler &h) { - auto aI1 = sycl::accessor(bufI1, h); - sycl::accessor - scratch({local_memory_size}, h); - - h.parallel_for, T, Compare>>( - sycl::nd_range(local_range, local_range), - [=](sycl::nd_item id) { - scratch[0] = std::byte{}; - auto local_id = id.get_local_linear_id(); - switch (test_case) { - case 0: - if constexpr (std::is_same_v> && - !std::is_same_v) - aI1[local_id] = oneapi_exp::sort_over_group( - oneapi_exp::group_with_scratchpad( - id.get_group(), - sycl::span{&scratch[0], local_memory_size}), - aI1[local_id]); - break; - case 1: - aI1[local_id] = oneapi_exp::sort_over_group( - oneapi_exp::group_with_scratchpad( - id.get_group(), - sycl::span{&scratch[0], local_memory_size}), - aI1[local_id], comp); - break; - case 2: - aI1[local_id] = oneapi_exp::sort_over_group( - id.get_group(), aI1[local_id], - oneapi_exp::default_sorter( - sycl::span{&scratch[0], local_memory_size})); - break; - } - }); - }).wait_and_throw(); - return 1; -} - -template -int test_joint_sort(sycl::queue &q, std::size_t n_items, std::size_t local, - sycl::buffer &bufI1, Compare comp, int test_case) { - auto n = bufI1.size(); - auto n_groups = (n - 1) / n_items + 
1; - - std::size_t local_memory_size = - oneapi_exp::default_sorter<>::memory_required( - sycl::memory_scope::work_group, n); - if (local_memory_size > - q.get_device().template get_info()) - std::cout << "local_memory_size = " << local_memory_size << ", available = " - << q.get_device() - .template get_info() - << std::endl; - q.submit([&](sycl::handler &h) { - auto aI1 = sycl::accessor(bufI1, h); - sycl::accessor - scratch({local_memory_size}, h); - - h.parallel_for>( - sycl::nd_range<1>{{n_groups * local}, {local}}, - [=](sycl::nd_item<1> id) { - auto group_id = id.get_group(0); - auto ptr_keys = &aI1[group_id * n_items]; - // Replacing the line above with the line below also works - // auto ptr_keys = aI1.get_pointer() + group_id * n_items; - - scratch[0] = std::byte{}; - switch (test_case) { - case 0: - if constexpr (std::is_same_v> && - !std::is_same_v) - oneapi_exp::joint_sort( - oneapi_exp::group_with_scratchpad( - id.get_group(), - sycl::span{&scratch[0], local_memory_size}), - ptr_keys, - ptr_keys + sycl::min(n_items, n - group_id * n_items)); - break; - case 1: - oneapi_exp::joint_sort( - oneapi_exp::group_with_scratchpad( - id.get_group(), - sycl::span{&scratch[0], local_memory_size}), - ptr_keys, - ptr_keys + sycl::min(n_items, n - group_id * n_items), comp); - break; - case 2: - oneapi_exp::joint_sort( - id.get_group(), ptr_keys, - ptr_keys + sycl::min(n_items, n - group_id * n_items), - oneapi_exp::default_sorter( - sycl::span{&scratch[0], local_memory_size})); - break; - } - }); - }).wait_and_throw(); - return n_groups; -} - -template -int test_custom_sorter(sycl::queue &q, sycl::buffer &bufI1, Compare comp) { - std::size_t local = 4; - auto n = bufI1.size(); - if (n > local) - return -1; - local = std::min(local, n); - - q.submit([&](sycl::handler &h) { - auto aI1 = sycl::accessor(bufI1, h); - - h.parallel_for>( - sycl::nd_range<2>({local, 1}, {local, 1}), [=](sycl::nd_item<2> id) { - auto ptr = aI1.get_pointer(); - - oneapi_exp::joint_sort( - 
id.get_group(), ptr, ptr + n, - bubble_sorter{comp, id.get_local_linear_id()}); - }); - }).wait_and_throw(); - return 1; -} - -template -void run_sort(sycl::queue &q, std::vector &in, std::size_t size, - Compare comp, int test_case, int sort_case) { - std::vector in2(in.begin(), in.begin() + size); - std::vector expected(in.begin(), in.begin() + size); - constexpr size_t work_size_limit = 4; - std::size_t local = std::min( - work_size_limit, - q.get_device() - .template get_info()); - local = std::min(local, size); - auto n_items = items_per_work_item * local; - - int n_groups = 1; - { // scope to destruct buffers - sycl::buffer bufKeys(in2.data(), size); - { - switch (sort_case) { - case 0: - // this case is just to check the compilation - n_groups = test_sort_over_group<1>(q, local, bufKeys, comp, test_case); - - n_groups = test_sort_over_group<2>(q, local, bufKeys, comp, test_case); - break; - case 1: - n_groups = test_joint_sort(q, n_items, local, bufKeys, comp, test_case); - break; - case 2: - n_groups = test_custom_sorter(q, bufKeys, comp); - break; - } - } - } - - // check results - for (int i_group = 0; i_group < n_groups; ++i_group) { - std::sort(expected.begin() + i_group * n_items, - expected.begin() + std::min((i_group + 1) * n_items, size), comp); - } - if (n_groups != -1 && - (test_case != 0 || - test_case == 0 && std::is_same_v> && - !std::is_same_v)&&!verify(expected.data(), in2.data(), - size, 0.001f)) { - std::cerr << "Verification failed \n"; - exit(1); - } -} - -template struct test_sort_cases { - template - void operator()(sycl::queue &q, std::size_t dataSize, Compare comp, - Generator generate) { - std::vector stationaryData(dataSize); - // fill data - for (std::size_t i = 0; i < dataSize; ++i) - stationaryData[i] = generate(i); - - // run test - for (int test_case = 0; test_case < 3; ++test_case) { - for (int sort_case = 0; sort_case < 3; ++sort_case) { - run_sort(q, stationaryData, dataSize, comp, test_case, sort_case); - } - } - } -}; - 
-void test_custom_type(sycl::queue &q, std::size_t dataSize) { - std::vector stationaryData(dataSize, CustomType{0}); - // fill data - for (std::size_t i = 0; i < dataSize; ++i) - stationaryData[i] = CustomType{int(i)}; - - // run test - for (int test_case = 0; test_case < 1; ++test_case) { - for (int sort_case = 0; sort_case < 3; ++sort_case) { - run_sort(q, stationaryData, dataSize, CustomFunctor{}, test_case, - sort_case); - } - } -} - -template -void test_sort_by_comp(sycl::queue &q, std::size_t dataSize) { - std::default_random_engine generator; - std::normal_distribution distribution((10.0), (2.0)); - - T max_size = std::numeric_limits::max(); - std::size_t to_fill = dataSize; - if (dataSize > max_size) - to_fill = max_size; - - // reversed order - test_sort_cases()(q, to_fill, Compare{}, - [to_fill](std::size_t i) { return T(to_fill - i - 1); }); - // filled by 1 - test_sort_cases()(q, dataSize, Compare{}, - [](std::size_t) { return T(1); }); - // random distribution - test_sort_cases()(q, dataSize, Compare{}, - [&distribution, &generator](std::size_t) { - return T(distribution(generator)); - }); -} - -template -void test_sort_by_type(sycl::queue &q, std::size_t dataSize) { - test_sort_by_comp>(q, dataSize); - test_sort_by_comp>(q, dataSize); -} \ No newline at end of file diff --git a/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp b/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp index 1925b84605..a5374b5339 100644 --- a/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp +++ b/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp @@ -1,38 +1,7 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -I . 
-o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -#include "sort.hpp" - -namespace oneapi_exp = sycl::ext::oneapi::experimental; - -int main(int argc, char *argv[]) { - sycl::queue q(sycl::default_selector{}, async_handler_); - - if (!q.get_device().has(sycl::aspect::fp64) { - std::cout << "Skipping test\n"; - return 0; - } - - if (!isSupportedDevice(q.get_device())) { - std::cout << "Skipping test\n"; - return 0; - } - - std::vector sizes{1, 12, 32}; - - for (int i = 0; i < sizes.size(); ++i) { - test_sort_by_type(q, sizes[i]); - test_sort_by_type(q, sizes[i]); - test_sort_by_type(q, sizes[i]); - test_sort_by_type(q, sizes[i]); - test_sort_by_type(q, sizes[i]); - test_sort_by_type(q, sizes[i]); - test_sort_by_type(q, sizes[i]); - - test_custom_type(q, sizes[i]); - } - std::cout << "Test passed." << std::endl; -} +#include "sort.cpp" diff --git a/SYCL/InlineAsm/asm_float_add.cpp b/SYCL/InlineAsm/asm_float_add.cpp deleted file mode 100644 index f633743449..0000000000 --- a/SYCL/InlineAsm/asm_float_add.cpp +++ /dev/null @@ -1,67 +0,0 @@ -// UNSUPPORTED: cuda || hip_nvidia -// REQUIRES: gpu,linux -// RUN: %clangxx -fsycl %s -o %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out - -#include "include/asmhelper.h" -#include -#include -#include -#include - -using dataType = sycl::cl_float; - -template -struct KernelFunctor : WithInputBuffers, WithOutputBuffer { - KernelFunctor(const std::vector &input1, const std::vector &input2) - : WithInputBuffers(input1, input2), WithOutputBuffer( - input1.size()) {} - - void operator()(sycl::handler &cgh) { - auto A = - this->getInputBuffer(0).template get_access( - cgh); - auto B = - this->getInputBuffer(1).template get_access( - cgh); - auto C = - this->getOutputBuffer().template get_access( - cgh); - - cgh.parallel_for>( - sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] { -#if defined(__SYCL_DEVICE_ONLY__) - asm("add 
(M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" - : "=rw"(C[wiID]) - : "rw"(A[wiID]), "rw"(B[wiID])); -#else - C[wiID] = A[wiID] + B[wiID]; -#endif - }); - } -}; - -int main() { - std::vector inputA(DEFAULT_PROBLEM_SIZE), - inputB(DEFAULT_PROBLEM_SIZE); - for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { - inputA[i] = (float)1 / std::pow(2, i); - inputB[i] = (float)2 / std::pow(2, i); - } - - KernelFunctor<> f(inputA, inputB); - if (!launchInlineASMTest(f)) - return 0; - - auto &C = f.getOutputBufferData(); - for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { - if (C[i] != inputA[i] + inputB[i]) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << inputA[i] + inputB[i] << "\n"; - return 1; - } - } - - return 0; -} diff --git a/SYCL/InlineAsm/asm_float_imm_arg.cpp b/SYCL/InlineAsm/asm_float_imm_arg.cpp deleted file mode 100644 index d2fb47000f..0000000000 --- a/SYCL/InlineAsm/asm_float_imm_arg.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// UNSUPPORTED: cuda || hip_nvidia -// REQUIRES: gpu,linux -// RUN: %clangxx -fsycl %s -o %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out - -#include "include/asmhelper.h" -#include -#include -#include -#include - -constexpr float IMM_ARGUMENT = 0.5; -using dataType = sycl::cl_float; - -template -struct KernelFunctor : WithInputBuffers, WithOutputBuffer { - KernelFunctor(const std::vector &input) - : WithInputBuffers(input), WithOutputBuffer(input.size()) {} - - void operator()(sycl::handler &cgh) { - auto A = - this->getInputBuffer(0).template get_access( - cgh); - auto B = - this->getOutputBuffer().template get_access( - cgh); - - cgh.parallel_for>( - sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] { -#if defined(__SYCL_DEVICE_ONLY__) - asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" - : "=rw"(B[wiID]) - : "rw"(A[wiID]), "i"(IMM_ARGUMENT)); -#else - B[wiID] = A[wiID] * IMM_ARGUMENT; -#endif - }); - } -}; - -int main() { - std::vector input(DEFAULT_PROBLEM_SIZE); 
- for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) - input[i] = (float)1 / std::pow(2, i); - - KernelFunctor<> f(input); - if (!launchInlineASMTest(f)) - return 0; - - auto &B = f.getOutputBufferData(); - for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { - if (B[i] != input[i] * IMM_ARGUMENT) { - std::cerr << "At index: " << i << ". "; - std::cerr << B[i] << " != " << input[i] * IMM_ARGUMENT << "\n"; - return 1; - } - } - return 0; -} diff --git a/SYCL/KernelParams/union_kernel_param.cpp b/SYCL/KernelParams/union_kernel_param.cpp index 2e5095bcc8..fee2fbd4c6 100644 --- a/SYCL/KernelParams/union_kernel_param.cpp +++ b/SYCL/KernelParams/union_kernel_param.cpp @@ -9,31 +9,37 @@ #include #include +#ifdef ENABLE_FP64 +typedef double fptype; +#else +typedef float fptype; +#endif + union TestUnion { public: int myint; char mychar; - float myfloat; + fptype mytype; - TestUnion() { myfloat = 0.0f; }; + TestUnion() { mytype = 0.0; }; }; int main(int argc, char **argv) { TestUnion x; - x.myfloat = 5.0f; - float myfloat = 0.0f; + x.mytype = 5.0; + fptype mytype = 0.0; sycl::queue queue; { - sycl::buffer buf(&myfloat, 1); + sycl::buffer buf(&mytype, 1); queue.submit([&](sycl::handler &cgh) { auto acc = buf.get_access(cgh); - cgh.single_task([=]() { acc[0] = x.myfloat; }); + cgh.single_task([=]() { acc[0] = x.mytype; }); }); } - if (myfloat != 5.0f) { - printf("FAILED\nmyfloat = %d\n", myfloat); + if (mytype != 5.0) { + printf("FAILED\nmytype = %d\n", mytype); return 1; } return 0; diff --git a/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp b/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp index b8715fd992..ba6e71b12e 100644 --- a/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp +++ b/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp @@ -1,46 +1,9 @@ // This test checks kernel execution with union type as kernel parameters. 
// REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out - -#include -#include - -union TestUnion { -public: - int myint; - char mychar; - double mydouble; - - TestUnion() { mydouble = 0.0; }; -}; - -int main(int argc, char **argv) { - TestUnion x; - x.mydouble = 5.0; - double mydouble = 0.0; - - cl::sycl::queue queue; - if (!queue.get_device().has(sycl::aspect::fp64) { - std::cout << "Skipping test\n"; - return 0; - } - - { - cl::sycl::buffer buf(&mydouble, 1); - queue.submit([&](cl::sycl::handler &cgh) { - auto acc = buf.get_access(cgh); - cgh.single_task([=]() { acc[0] = x.mydouble; }); - }); - } - - if (mydouble != 5.0) { - printf("FAILED\nmydouble = %d\n", mydouble); - return 1; - } - return 0; -} +#include "union_kernel_param.cpp" diff --git a/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp b/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp deleted file mode 100644 index c7ff46673d..0000000000 --- a/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp +++ /dev/null @@ -1,112 +0,0 @@ -#include "FindPrimesSYCL.h" - -#include -#include -#include -#include -#include - -#include -constexpr sycl::access::mode sycl_read = sycl::access::mode::read; -constexpr sycl::access::mode sycl_write = sycl::access::mode::write; - -using namespace std; - -/* This is the class used to name the kernel for the runtime. - * This must be done when the kernel is expressed as a lambda. 
*/ -class FindPrimeI; - -float find_prime_s(work *w) { - - // need this for the lambda capture and sycl queue submit - auto &VRI = w->VRI; - auto N = w->VRI.size(); - auto niter = w->niter; - auto nitems = w->nitems; - - sycl::range<1> numOfItems{nitems}; - sycl::buffer bufferR(VRI.data(), N); - - auto start = std::chrono::high_resolution_clock::now(); - - sycl::event event; - -#ifdef __SYCL_DEVICE_ONLY__ -#define CONSTANT __attribute__((opencl_constant)) -#else -#define CONSTANT -#endif - - if (w->queueLock) { - w->queueLock->lock(); - } - event = w->deviceQueue->submit([&](sycl::handler &cgh) { - auto accessorR = bufferR.template get_access(cgh); - sycl::stream cout(1024, 256, cgh); - - auto k2 = [=](sycl::item<1> item) { - size_t maxstride = 1 + N / nitems; - for (size_t istride = 0; istride < maxstride; ++istride) { - unsigned int number = istride * nitems + item.get_linear_id(); - if (number < N) { - for (size_t i = 0; i < niter; ++i) { - bool is_prime = !(number % 2 == 0); - const int upper_bound = sycl::sqrt(1.0f * number) + 1; - int k = 3; - while (k < upper_bound && is_prime) { - is_prime = !(number % k == 0); - k += 2; // don't have to test even numbers - } - accessorR[number] = is_prime; - } - } else { - break; - } - } - }; - cgh.parallel_for(numOfItems, k2); - }); - if (w->queueLock) { - w->queueLock->unlock(); - } - - // deviceQueue.wait(); - try { - event.wait_and_throw(); - } catch (sycl::exception const &e) { - std::cout << "Caught asynchronous SYCL exception:\n" - << e.what() << std::endl; - } - - auto stop = std::chrono::high_resolution_clock::now(); - - auto submit_time = - event.get_profiling_info(); - auto start_time = - event.get_profiling_info(); - auto end_time = - event.get_profiling_info(); - - w->start_time = start_time; - w->end_time = end_time; - w->submit_time = submit_time; - w->start = start; - w->stop = stop; - - // std::cout << "submit time: " << submission_time - // << std::endl; - // std::cout << "execut time: " << execution_time 
- // << std::endl; - - w->result = 0; - for (auto &e : w->VRI) { - if (e) { - ++w->result; - } - } - - std::chrono::duration diff{0}; - diff = (stop - start); - - return diff.count(); -} diff --git a/SYCL/SpecConstants/2020/handler-api.cpp b/SYCL/SpecConstants/2020/handler-api.cpp index 669a1b1c09..2a57d12b5b 100644 --- a/SYCL/SpecConstants/2020/handler-api.cpp +++ b/SYCL/SpecConstants/2020/handler-api.cpp @@ -24,6 +24,9 @@ constexpr sycl::specialization_id int_id; constexpr sycl::specialization_id int_id2(2); constexpr sycl::specialization_id custom_type_id; +#ifdef ENABLE_FP64 +constexpr sycl::specialization_id double_id(3.14); +#endif class TestDefaultValuesKernel; class EmptyKernel; @@ -71,16 +74,25 @@ bool test_default_values(sycl::queue q) { sycl::buffer int_buffer(1); sycl::buffer int_buffer2(1); sycl::buffer custom_type_buffer(1); +#ifdef ENABLE_FP64 + sycl::buffer double_buffer(1); +#endif q.submit([&](sycl::handler &cgh) { auto int_acc = int_buffer.get_access(cgh); auto int_acc2 = int_buffer2.get_access(cgh); auto custom_type_acc = custom_type_buffer.get_access(cgh); +#ifdef ENABLE_FP6 + auto double_acc = double_buffer.get_access(cgh); +#endif cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); int_acc2[0] = kh.get_specialization_constant(); custom_type_acc[0] = kh.get_specialization_constant(); +#ifdef ENABLE_FP64 + double_acc[0] = kh.get_specialization_constant(); +#endif }); }); @@ -100,6 +112,11 @@ bool test_default_values(sycl::queue q) { if (!check_value(custom_type_ref, custom_type_acc[0], "custom_type specialization constant")) return false; +#ifdef ENABLE_FP64 + auto double_acc = double_buffer.get_access(); + if (!check_value(3.14, double_acc[0], "double specialization constant")) + return false; +#endif return true; } @@ -117,11 +134,23 @@ bool test_set_and_get_on_host(sycl::queue q) { custom_type_ref, cgh.get_specialization_constant(), "custom_type specializaiton constant before setting any value")) 
++errors; +#ifdef ENABLE_FP64 + if (!check_value(3.14, cgh.get_specialization_constant(), + "double specializaiton constant before setting any value")) + ++errors; +#endif int new_int_value = 8; custom_type new_custom_type_value('b', 1.0, 12); +#ifdef ENABLE_FP64 + double new_double_value = 3.0; +#endif + cgh.set_specialization_constant(new_int_value); cgh.set_specialization_constant(new_custom_type_value); +#ifdef ENABLE_FP64 + cgh.set_specialization_constant(new_double_value); +#endif if (!check_value( new_int_value, cgh.get_specialization_constant(), @@ -133,6 +162,12 @@ bool test_set_and_get_on_host(sycl::queue q) { cgh.get_specialization_constant(), "custom_type specializaiton constant after setting a new value")) ++errors; +#ifdef ENABLE_FP64 + if (!check_value( + new_double_value, cgh.get_specialization_constant(), + "double specializaiton constant after setting a new value")) + ++errors; +#endif cgh.single_task([=]() {}); }); @@ -144,25 +179,40 @@ bool test_set_and_get_on_device(sycl::queue q) { sycl::buffer int_buffer(1); sycl::buffer int_buffer2(1); sycl::buffer custom_type_buffer(1); +#ifdef ENABLE_FP64 + sycl::buffer double_buffer(1); +#endif int new_int_value = 8; int new_int_value2 = 0; custom_type new_custom_type_value('b', 1.0, 12); +#ifdef ENABLE_FP64 + double new_double_value = 3.0; +#endif q.submit([&](sycl::handler &cgh) { auto int_acc = int_buffer.get_access(cgh); auto int_acc2 = int_buffer2.get_access(cgh); auto custom_type_acc = custom_type_buffer.get_access(cgh); +#ifdef ENABLE_FP64 + auto double_acc = double_buffer.get_access(cgh); +#endif cgh.set_specialization_constant(new_int_value); cgh.set_specialization_constant(new_int_value2); cgh.set_specialization_constant(new_custom_type_value); +#ifdef ENABLE_FP64 + cgh.set_specialization_constant(new_double_value); +#endif cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); int_acc2[0] = kh.get_specialization_constant(); custom_type_acc[0] = 
kh.get_specialization_constant(); +#ifdef ENABLE_FP64 + double_acc[0] = kh.get_specialization_constant(); +#endif }); }); @@ -181,6 +231,11 @@ bool test_set_and_get_on_device(sycl::queue q) { if (!check_value(new_custom_type_value, custom_type_acc[0], "custom_type specialization constant")) return false; - +#ifdef ENABLE_FP64 + auto double_acc = double_buffer.get_access(); + if (!check_value(new_double_value, double_acc[0], + "double specialization constant")) + return false; +#endif return true; } diff --git a/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp b/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp index 13d002f7da..a57e4419a6 100644 --- a/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp +++ b/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp @@ -8,7 +8,7 @@ // scope and correctly retrieved within a kernel // // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out \ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out \ // RUN: -fsycl-dead-args-optimization // FIXME: SYCL 2020 specialization constants are not supported on host device // RUN: %CPU_RUN_PLACEHOLDER %t.out @@ -16,118 +16,4 @@ // FIXME: ACC devices use emulation path, which is not yet supported // UNSUPPORTED: hip -#include -#include -#include - -#include "common.hpp" - -constexpr sycl::specialization_id double_id(3.14); - -class TestDefaultValuesKernel; -class EmptyKernel; -class TestSetAndGetOnDevice; - -bool test_default_values(sycl::queue q); -bool test_set_and_get_on_host(sycl::queue q); -bool test_set_and_get_on_device(sycl::queue q); - -int main() { - auto exception_handler = [&](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } catch (sycl::exception const &e) { - std::cout << "An async SYCL exception was caught: " << e.what() - << std::endl; - std::exit(1); - } - } - }; - - sycl::queue q(exception_handler); - - if 
(!q.get_device().has(sycl::aspect::fp64) { - std::cout << "Skipping test\n"; - return 0; - } - - if (!test_default_values(q)) { - std::cout << "Test for default values of specialization constants failed!" - << std::endl; - return 1; - } - - if (!test_set_and_get_on_host(q)) { - std::cout << "Test for set and get API on host failed!" << std::endl; - return 1; - } - - if (!test_set_and_get_on_device(q)) { - std::cout << "Test for set and get API on device failed!" << std::endl; - return 1; - } - - return 0; -}; - -bool test_default_values(sycl::queue q) { - sycl::buffer double_buffer(1); - - q.submit([&](sycl::handler &cgh) { - auto double_acc = double_buffer.get_access(cgh); - cgh.single_task([=](sycl::kernel_handler kh) { - double_acc[0] = kh.get_specialization_constant(); - }); - }); - - auto double_acc = double_buffer.get_access(); - if (!check_value(3.14, double_acc[0], "double specialization constant")) - return false; - - return true; -} - -bool test_set_and_get_on_host(sycl::queue q) { - unsigned errors = 0; - q.submit([&](sycl::handler &cgh) { - if (!check_value(3.14, cgh.get_specialization_constant(), - "double specializaiton constant before setting any value")) - ++errors; - - double new_double_value = 3.0; - cgh.set_specialization_constant(new_double_value); - - if (!check_value( - new_double_value, cgh.get_specialization_constant(), - "double specializaiton constant after setting a new value")) - ++errors; - - cgh.single_task([=]() {}); - }); - - return errors == 0; -} - -bool test_set_and_get_on_device(sycl::queue q) { - sycl::buffer double_buffer(1); - - double new_double_value = 3.0; - - q.submit([&](sycl::handler &cgh) { - auto double_acc = double_buffer.get_access(cgh); - - cgh.set_specialization_constant(new_double_value); - - cgh.single_task([=](sycl::kernel_handler kh) { - double_acc[0] = kh.get_specialization_constant(); - }); - }); - - auto double_acc = double_buffer.get_access(); - if (!check_value(new_double_value, double_acc[0], - "double 
specialization constant")) - return false; - - return true; -} +#include "handler-api.cpp" diff --git a/SYCL/SpecConstants/2020/kernel-bundle-api.cpp b/SYCL/SpecConstants/2020/kernel-bundle-api.cpp index 1320114af0..d793d39b3a 100644 --- a/SYCL/SpecConstants/2020/kernel-bundle-api.cpp +++ b/SYCL/SpecConstants/2020/kernel-bundle-api.cpp @@ -23,6 +23,9 @@ constexpr sycl::specialization_id int_id; constexpr sycl::specialization_id custom_type_id; +#ifdef ENABLE_FP64 +constexpr sycl::specialization_id double_id(3.14); +#endif class TestDefaultValuesKernel; class EmptyKernel; @@ -77,7 +80,9 @@ bool test_default_values(sycl::queue q) { sycl::buffer int_buffer(1); sycl::buffer custom_type_buffer(1); - +#ifdef ENABLE_FP64 + sycl::buffer double_buffer(1); +#endif auto input_bundle = sycl::get_kernel_bundle(q.get_context()); auto exec_bundle = sycl::build(input_bundle); @@ -87,12 +92,17 @@ bool test_default_values(sycl::queue q) { auto int_acc = int_buffer.get_access(cgh); auto custom_type_acc = custom_type_buffer.get_access(cgh); +#ifdef ENABLE_FP64 + auto double_acc = double_buffer.get_access(cgh); +#endif cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); custom_type_acc[0] = kh.get_specialization_constant(); +#ifdef ENABLE_FP64 + double_acc[0] = kh.get_specialization_constant(); +#endif }); }); - auto int_acc = int_buffer.get_access(); if (!check_value( 0, int_acc[0], @@ -105,7 +115,11 @@ bool test_default_values(sycl::queue q) { if (!check_value(custom_type_ref, custom_type_acc[0], "custom_type specialization constant")) return false; - +#ifdef ENABLE_FP64 + auto double_acc = double_buffer.get_access(); + if (!check_value(3.14, double_acc[0], "double specialization constant")) + return false; +#endif return true; } @@ -131,7 +145,6 @@ bool test_set_and_get_on_host(sycl::queue q) { << std::endl; return false; } - // Check default values if (!check_value( 0, input_bundle.get_specialization_constant(), @@ -144,14 +157,25 @@ bool 
test_set_and_get_on_host(sycl::queue q) { input_bundle.get_specialization_constant(), "custom_type specializaiton constant before setting any value")) ++errors; - +#ifdef ENABLE_FP64 + if (!check_value(3.14, + input_bundle.get_specialization_constant(), + "double specializaiton constant before setting any value")) + ++errors; +#endif // Update values int new_int_value = 42; custom_type new_custom_type_value('b', 1.0, 12); +#ifdef ENABLE_FP64 + double new_double_value = 3.0; +#endif input_bundle.set_specialization_constant(new_int_value); input_bundle.set_specialization_constant( new_custom_type_value); +#ifdef ENABLE_FP64 + input_bundle.set_specialization_constant(new_double_value); +#endif // And re-check them again if (!check_value( @@ -164,6 +188,12 @@ bool test_set_and_get_on_host(sycl::queue q) { input_bundle.get_specialization_constant(), "custom_type specializaiton constant after setting a new value")) ++errors; +#ifdef ENABLE_FP64 + if (!check_value(new_double_value, + input_bundle.get_specialization_constant(), + "double specializaiton constant after setting a value")) + ++errors; +#endif // Let's try to build the bundle auto exec_bundle = sycl::build(input_bundle); @@ -178,6 +208,12 @@ bool test_set_and_get_on_host(sycl::queue q) { exec_bundle.get_specialization_constant(), "custom_type specializaiton constant after build")) ++errors; +#ifdef ENABLE_FP64 + if (!check_value(new_double_value, + exec_bundle.get_specialization_constant(), + "double specializaiton constant after build")) + ++errors; +#endif } catch (sycl::exception &e) { } @@ -187,29 +223,41 @@ bool test_set_and_get_on_host(sycl::queue q) { bool test_set_and_get_on_device(sycl::queue q) { sycl::buffer int_buffer(1); sycl::buffer custom_type_buffer(1); +#ifdef ENABLE_FP64 + sycl::buffer double_buffer(1); +#endif int new_int_value = 42; custom_type new_custom_type_value('b', 1.0, 12); +#ifdef ENABLE_FP64 + double new_double_value = 3.0; +#endif auto input_bundle = 
sycl::get_kernel_bundle(q.get_context()); input_bundle.set_specialization_constant(new_int_value); input_bundle.set_specialization_constant( new_custom_type_value); +#ifdef ENABLE_FP64 + input_bundle.set_specialization_constant(new_double_value); +#endif auto exec_bundle = sycl::build(input_bundle); - q.submit([&](sycl::handler &cgh) { cgh.use_kernel_bundle(exec_bundle); auto int_acc = int_buffer.get_access(cgh); auto custom_type_acc = custom_type_buffer.get_access(cgh); - +#ifdef ENABLE_FP64 + auto double_acc = double_buffer.get_access(cgh); +#endif cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); custom_type_acc[0] = kh.get_specialization_constant(); +#ifdef ENABLE_FP64 + double_acc[0] = kh.get_specialization_constant(); +#endif }); }); - auto int_acc = int_buffer.get_access(); if (!check_value(new_int_value, int_acc[0], "integer specialization constant")) @@ -220,6 +268,11 @@ bool test_set_and_get_on_device(sycl::queue q) { if (!check_value(new_custom_type_value, custom_type_acc[0], "custom_type specialization constant")) return false; - +#ifdef ENABLE_FP64 + auto double_acc = double_buffer.get_access(); + if (!check_value(new_double_value, double_acc[0], + "double specialization constant")) + return false; +#endif return true; } diff --git a/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp b/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp index 3fe7ee60cc..9c15fe2fcb 100644 --- a/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp +++ b/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp @@ -8,7 +8,7 @@ // API and correctly retrieved within a kernel // // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out \ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out \ // RUN: -fsycl-dead-args-optimization // FIXME: SYCL 2020 specialization constants are not supported on host device // RUN: %CPU_RUN_PLACEHOLDER %t.out @@ -16,167 
+16,4 @@ // FIXME: ACC devices use emulation path, which is not yet supported // UNSUPPORTED: hip -#include -#include -#include - -#include "common.hpp" - -constexpr sycl::specialization_id double_id(3.14); - -class TestDefaultValuesKernel; -class EmptyKernel; -class TestSetAndGetOnDevice; - -bool test_default_values(sycl::queue q); -bool test_set_and_get_on_host(sycl::queue q); -bool test_set_and_get_on_device(sycl::queue q); - -int main() { - auto exception_handler = [&](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } catch (sycl::exception const &e) { - std::cout << "An async SYCL exception was caught: " << e.what() - << std::endl; - std::exit(1); - } - } - }; - - sycl::queue q(exception_handler); - - if (!q.get_device().has(sycl::aspect::fp64) { - std::cout << "Skipping test\n"; - return 0; - } - - if (!test_default_values(q)) { - std::cout << "Test for default values of specialization constants failed!" - << std::endl; - return 1; - } - - if (!test_set_and_get_on_host(q)) { - std::cout << "Test for set and get API on host failed!" << std::endl; - return 1; - } - - if (!test_set_and_get_on_device(q)) { - std::cout << "Test for set and get API on device failed!" 
<< std::endl; - return 1; - } - - return 0; -}; - -bool test_default_values(sycl::queue q) { - if (!sycl::has_kernel_bundle(q.get_context())) { - std::cout << "Cannot obtain kernel_bundle in input state, skipping default " - "values test" - << std::endl; - // TODO: check that online_compielr aspec is not available - return true; - } - - sycl::buffer double_buffer(1); - - auto input_bundle = - sycl::get_kernel_bundle(q.get_context()); - auto exec_bundle = sycl::build(input_bundle); - - q.submit([&](sycl::handler &cgh) { - cgh.use_kernel_bundle(exec_bundle); - auto double_acc = double_buffer.get_access(cgh); - cgh.single_task([=](sycl::kernel_handler kh) { - double_acc[0] = kh.get_specialization_constant(); - }); - }); - - auto double_acc = double_buffer.get_access(); - if (!check_value(3.14, double_acc[0], "double specialization constant")) - return false; - - return true; -} - -bool test_set_and_get_on_host(sycl::queue q) { - if (!sycl::has_kernel_bundle(q.get_context())) { - std::cout << "Cannot obtain kernel_bundle in input state, skipping default " - "values test" - << std::endl; - // TODO: check that online_compielr aspec is not available - return true; - } - - unsigned errors = 0; - - try { - auto input_bundle = - sycl::get_kernel_bundle(q.get_context()); - - if (!input_bundle.contains_specialization_constants()) { - std::cout - << "Obtained kernel_bundle is expected to contain specialization " - "constants, but it doesn't!" 
- << std::endl; - return false; - } - - if (!check_value(3.14, - input_bundle.get_specialization_constant(), - "double specializaiton constant before setting any value")) - ++errors; - - // Update values - double new_double_value = 3.0; - - input_bundle.set_specialization_constant(new_double_value); - - // And re-check them again - if (!check_value(new_double_value, - input_bundle.get_specialization_constant(), - "double specializaiton constant after setting a value")) - ++errors; - - // Let's try to build the bundle - auto exec_bundle = sycl::build(input_bundle); - - // And ensure that updated spec constant values are still there - if (!check_value(new_double_value, - exec_bundle.get_specialization_constant(), - "double specializaiton constant after build")) - ++errors; - } catch (sycl::exception &e) { - } - - return 0 == errors; -} - -bool test_set_and_get_on_device(sycl::queue q) { - sycl::buffer double_buffer(1); - - double new_double_value = 3.0; - - auto input_bundle = - sycl::get_kernel_bundle(q.get_context()); - input_bundle.set_specialization_constant(new_double_value); - auto exec_bundle = sycl::build(input_bundle); - - q.submit([&](sycl::handler &cgh) { - cgh.use_kernel_bundle(exec_bundle); - auto double_acc = double_buffer.get_access(cgh); - - cgh.single_task([=](sycl::kernel_handler kh) { - double_acc[0] = kh.get_specialization_constant(); - }); - }); - - auto double_acc = double_buffer.get_access(); - if (!check_value(new_double_value, double_acc[0], - "double specialization constant")) - return false; - - return true; -} +#include "kernel-bundle-api.cpp" diff --git a/SYCL/SubGroup/barrier.cpp b/SYCL/SubGroup/barrier.cpp index d74c175ef8..4a6e99b651 100644 --- a/SYCL/SubGroup/barrier.cpp +++ b/SYCL/SubGroup/barrier.cpp @@ -10,8 +10,70 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "barrier.hpp" +#include "helper.hpp" +#include +#include 
+#include + +template class sycl_subgr; +using namespace sycl; +template +void check(queue &Queue, size_t G = 240, size_t L = 60) { + try { + nd_range<1> NdRange(G, L); + std::vector data(G); + std::iota(data.begin(), data.end(), sizeof(T)); + buffer addbuf(data.data(), range<1>(G)); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + auto addacc = addbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + + cgh.parallel_for>( + NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + size_t lid = SG.get_local_id().get(0); + size_t gid = NdItem.get_global_id(0); + size_t SGoff = gid - lid; + + T res = 0; + for (size_t i = 0; i <= lid; i++) { + res += addacc[SGoff + i]; + } + if constexpr (UseNewSyntax) { + group_barrier(SG); + } else { + SG.barrier(access::fence_space::global_space); + } + addacc[gid] = res; + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + }); + }); + auto addacc = addbuf.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + + size_t sg_size = sgsizeacc[0]; + int WGid = -1, SGid = 0; + T add = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + add = 0; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + add += j + sizeof(T); + exit_if_not_equal(addacc[j], add, "barrier"); + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} int main() { queue Queue; if (Queue.get_device().is_host()) { @@ -28,6 +90,12 @@ int main() { check(Queue); check(Queue); check(Queue); +#ifdef ENABLE_FP64 + if (Queue.get_device().has(sycl::aspect::fp64)) { + check(Queue); + check(Queue); + } +#endif std::cout << "Test passed." 
<< std::endl; return 0; } diff --git a/SYCL/SubGroup/barrier.hpp b/SYCL/SubGroup/barrier.hpp deleted file mode 100644 index cccffc2d72..0000000000 --- a/SYCL/SubGroup/barrier.hpp +++ /dev/null @@ -1,63 +0,0 @@ -#include "helper.hpp" -#include -#include -#include - -template class sycl_subgr; -using namespace sycl; -template -void check(queue &Queue, size_t G = 240, size_t L = 60) { - try { - nd_range<1> NdRange(G, L); - std::vector data(G); - std::iota(data.begin(), data.end(), sizeof(T)); - buffer addbuf(data.data(), range<1>(G)); - buffer sgsizebuf(1); - Queue.submit([&](handler &cgh) { - auto addacc = addbuf.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - - cgh.parallel_for>( - NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - size_t lid = SG.get_local_id().get(0); - size_t gid = NdItem.get_global_id(0); - size_t SGoff = gid - lid; - - T res = 0; - for (size_t i = 0; i <= lid; i++) { - res += addacc[SGoff + i]; - } - if constexpr (UseNewSyntax) { - group_barrier(SG); - } else { - SG.barrier(access::fence_space::global_space); - } - addacc[gid] = res; - if (NdItem.get_global_id(0) == 0) - sgsizeacc[0] = SG.get_max_local_range()[0]; - }); - }); - auto addacc = addbuf.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - - size_t sg_size = sgsizeacc[0]; - int WGid = -1, SGid = 0; - T add = 0; - for (int j = 0; j < G; j++) { - if (j % L % sg_size == 0) { - SGid++; - add = 0; - } - if (j % L == 0) { - WGid++; - SGid = 0; - } - add += j + sizeof(T); - exit_if_not_equal(addacc[j], add, "barrier"); - } - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} diff --git a/SYCL/SubGroup/barrier_aspect-fp64.cpp b/SYCL/SubGroup/barrier_aspect-fp64.cpp index 9f45b0b67e..a7d201581c 100644 --- a/SYCL/SubGroup/barrier_aspect-fp64.cpp +++ b/SYCL/SubGroup/barrier_aspect-fp64.cpp @@ -1,5 +1,5 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl 
-fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out @@ -11,18 +11,4 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "barrier.hpp" - -int main() { - queue Queue; - if (Queue.get_device().is_host()) { - std::cout << "Skipping test\n"; - return 0; - } - if (Queue.get_device().has(sycl::aspect::fp64)) { - check(Queue); - check(Queue); - } - std::cout << "Test passed." << std::endl; - return 0; -} +#include "barrier.cpp" diff --git a/SYCL/SubGroup/generic-shuffle.cpp b/SYCL/SubGroup/generic-shuffle.cpp index 3ede067ce8..8ed39c0446 100644 --- a/SYCL/SubGroup/generic-shuffle.cpp +++ b/SYCL/SubGroup/generic-shuffle.cpp @@ -11,10 +11,207 @@ // //===----------------------------------------------------------------------===// -#include "generic-shuffle.hpp" +#include "helper.hpp" +#include +#include +#include +#include +template class pointer_kernel; using namespace sycl; +template +void check_pointer(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer buf(G); + buffer buf_up(G); + buffer buf_down(G); + buffer buf_xor(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + auto acc = buf.template get_access(cgh); + auto acc_up = buf_up.template get_access(cgh); + auto acc_down = + buf_down.template get_access(cgh); + auto acc_xor = buf_xor.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + + cgh.parallel_for( + NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + T *ptr = static_cast(0x0) + wggid; + + /*GID of middle element in every 
subgroup*/ + acc[NdItem.get_global_id()] = + SG.shuffle(ptr, SG.get_max_local_range()[0] / 2); + + /* Save GID-SGID */ + acc_up[NdItem.get_global_id()] = SG.shuffle_up(ptr, sgid); + + /* Save GID+SGID */ + acc_down[NdItem.get_global_id()] = SG.shuffle_down(ptr, sgid); + + /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + acc_xor[NdItem.get_global_id()] = + SG.shuffle_xor(ptr, sgid % SG.get_max_local_range()[0]); + }); + }); + auto acc = buf.template get_access(); + auto acc_up = buf_up.template get_access(); + auto acc_down = buf_down.template get_access(); + auto acc_xor = buf_xor.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /*GID of middle element in every subgroup*/ + exit_if_not_equal(acc[j], + static_cast(0x0) + + (j / L * L + SGid * sg_size + sg_size / 2), + "shuffle"); + + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { + exit_if_not_equal(acc_down[j], static_cast(0x0) + (j + SGid), + "shuffle_down"); + } + + /* Value GID-SGID for all element except first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal(acc_up[j], static_cast(0x0) + (j - SGid), + "shuffle_up"); + } + + /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + exit_if_not_equal(acc_xor[j], + static_cast(0x0) + + (SGBeginGid + (SGLid ^ (SGid % sg_size))), + "shuffle_xor"); + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +template +void check_struct(queue &Queue, Generator &Gen, size_t G = 256, size_t L = 64) { + + // Fill a vector with values that will be shuffled + std::vector values(G); + std::generate(values.begin(), values.end(), Gen); + 
+ try { + nd_range<1> NdRange(G, L); + buffer buf(G); + buffer buf_up(G); + buffer buf_down(G); + buffer buf_xor(G); + buffer sgsizebuf(1); + buffer buf_in(values.data(), values.size()); + Queue.submit([&](handler &cgh) { + auto acc = buf.template get_access(cgh); + auto acc_up = buf_up.template get_access(cgh); + auto acc_down = + buf_down.template get_access(cgh); + auto acc_xor = buf_xor.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + auto in = buf_in.template get_access(cgh); + + cgh.parallel_for( + NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + T val = in[wggid]; + + /*GID of middle element in every subgroup*/ + acc[NdItem.get_global_id()] = + SG.shuffle(val, SG.get_max_local_range()[0] / 2); + + /* Save GID-SGID */ + acc_up[NdItem.get_global_id()] = SG.shuffle_up(val, sgid); + + /* Save GID+SGID */ + acc_down[NdItem.get_global_id()] = SG.shuffle_down(val, sgid); + + /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + acc_xor[NdItem.get_global_id()] = + SG.shuffle_xor(val, sgid % SG.get_max_local_range()[0]); + }); + }); + auto acc = buf.template get_access(); + auto acc_up = buf_up.template get_access(); + auto acc_down = buf_down.template get_access(); + auto acc_xor = buf_xor.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /*GID of middle element in every subgroup*/ + exit_if_not_equal( + acc[j], values[j / L * L + SGid * sg_size + sg_size / 2], "shuffle"); + + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % sg_size + SGid < 
sg_size && j % L + SGid < L) { + exit_if_not_equal(acc_down[j], values[j + SGid], "shuffle_down"); + } + + /* Value GID-SGID for all element except first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal(acc_up[j], values[j - SGid], "shuffle_up"); + } + + /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + exit_if_not_equal(acc_xor[j], + values[SGBeginGid + (SGLid ^ (SGid % sg_size))], + "shuffle_xor"); + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + int main() { queue Queue; if (Queue.get_device().is_host()) { @@ -31,7 +228,13 @@ int main() { }; check_struct>( Queue, ComplexFloatGenerator); - +#ifdef ENABLE_FP64 + auto ComplexDoubleGenerator = [state = std::complex(0, 1)]() mutable { + return state += std::complex(2, 2); + }; + check_struct>( + Queue, ComplexDoubleGenerator); +#endif std::cout << "Test passed." << std::endl; return 0; } diff --git a/SYCL/SubGroup/generic-shuffle.hpp b/SYCL/SubGroup/generic-shuffle.hpp deleted file mode 100644 index 96c6c3783f..0000000000 --- a/SYCL/SubGroup/generic-shuffle.hpp +++ /dev/null @@ -1,200 +0,0 @@ -#include "helper.hpp" -#include -#include -#include -#include -template class pointer_kernel; - -using namespace sycl; - -template -void check_pointer(queue &Queue, size_t G = 256, size_t L = 64) { - try { - nd_range<1> NdRange(G, L); - buffer buf(G); - buffer buf_up(G); - buffer buf_down(G); - buffer buf_xor(G); - buffer sgsizebuf(1); - Queue.submit([&](handler &cgh) { - auto acc = buf.template get_access(cgh); - auto acc_up = buf_up.template get_access(cgh); - auto acc_down = - buf_down.template get_access(cgh); - auto acc_xor = buf_xor.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - - cgh.parallel_for( - NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - uint32_t wggid = NdItem.get_global_id(0); - uint32_t sgid = SG.get_group_id().get(0); - if (wggid == 0) - 
sgsizeacc[0] = SG.get_max_local_range()[0]; - - T *ptr = static_cast(0x0) + wggid; - - /*GID of middle element in every subgroup*/ - acc[NdItem.get_global_id()] = - SG.shuffle(ptr, SG.get_max_local_range()[0] / 2); - - /* Save GID-SGID */ - acc_up[NdItem.get_global_id()] = SG.shuffle_up(ptr, sgid); - - /* Save GID+SGID */ - acc_down[NdItem.get_global_id()] = SG.shuffle_down(ptr, sgid); - - /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ - acc_xor[NdItem.get_global_id()] = - SG.shuffle_xor(ptr, sgid % SG.get_max_local_range()[0]); - }); - }); - auto acc = buf.template get_access(); - auto acc_up = buf_up.template get_access(); - auto acc_down = buf_down.template get_access(); - auto acc_xor = buf_xor.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - - size_t sg_size = sgsizeacc[0]; - int SGid = 0; - int SGLid = 0; - int SGBeginGid = 0; - for (int j = 0; j < G; j++) { - if (j % L % sg_size == 0) { - SGid++; - SGLid = 0; - SGBeginGid = j; - } - if (j % L == 0) { - SGid = 0; - SGLid = 0; - SGBeginGid = j; - } - - /*GID of middle element in every subgroup*/ - exit_if_not_equal(acc[j], - static_cast(0x0) + - (j / L * L + SGid * sg_size + sg_size / 2), - "shuffle"); - - /* Value GID+SGID for all element except last SGID in SG*/ - if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { - exit_if_not_equal(acc_down[j], static_cast(0x0) + (j + SGid), - "shuffle_down"); - } - - /* Value GID-SGID for all element except first SGID in SG*/ - if (j % L % sg_size >= SGid) { - exit_if_not_equal(acc_up[j], static_cast(0x0) + (j - SGid), - "shuffle_up"); - } - - /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ - exit_if_not_equal(acc_xor[j], - static_cast(0x0) + - (SGBeginGid + (SGLid ^ (SGid % sg_size))), - "shuffle_xor"); - SGLid++; - } - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} - -template -void check_struct(queue &Queue, Generator &Gen, size_t G = 256, size_t L = 64) { - - // Fill a 
vector with values that will be shuffled - std::vector values(G); - std::generate(values.begin(), values.end(), Gen); - - try { - nd_range<1> NdRange(G, L); - buffer buf(G); - buffer buf_up(G); - buffer buf_down(G); - buffer buf_xor(G); - buffer sgsizebuf(1); - buffer buf_in(values.data(), values.size()); - Queue.submit([&](handler &cgh) { - auto acc = buf.template get_access(cgh); - auto acc_up = buf_up.template get_access(cgh); - auto acc_down = - buf_down.template get_access(cgh); - auto acc_xor = buf_xor.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - auto in = buf_in.template get_access(cgh); - - cgh.parallel_for( - NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - uint32_t wggid = NdItem.get_global_id(0); - uint32_t sgid = SG.get_group_id().get(0); - if (wggid == 0) - sgsizeacc[0] = SG.get_max_local_range()[0]; - - T val = in[wggid]; - - /*GID of middle element in every subgroup*/ - acc[NdItem.get_global_id()] = - SG.shuffle(val, SG.get_max_local_range()[0] / 2); - - /* Save GID-SGID */ - acc_up[NdItem.get_global_id()] = SG.shuffle_up(val, sgid); - - /* Save GID+SGID */ - acc_down[NdItem.get_global_id()] = SG.shuffle_down(val, sgid); - - /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ - acc_xor[NdItem.get_global_id()] = - SG.shuffle_xor(val, sgid % SG.get_max_local_range()[0]); - }); - }); - auto acc = buf.template get_access(); - auto acc_up = buf_up.template get_access(); - auto acc_down = buf_down.template get_access(); - auto acc_xor = buf_xor.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - - size_t sg_size = sgsizeacc[0]; - int SGid = 0; - int SGLid = 0; - int SGBeginGid = 0; - for (int j = 0; j < G; j++) { - if (j % L % sg_size == 0) { - SGid++; - SGLid = 0; - SGBeginGid = j; - } - if (j % L == 0) { - SGid = 0; - SGLid = 0; - SGBeginGid = j; - } - - /*GID of middle element in every subgroup*/ - exit_if_not_equal( - acc[j], values[j / L * L + SGid * sg_size + 
sg_size / 2], "shuffle"); - - /* Value GID+SGID for all element except last SGID in SG*/ - if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { - exit_if_not_equal(acc_down[j], values[j + SGid], "shuffle_down"); - } - - /* Value GID-SGID for all element except first SGID in SG*/ - if (j % L % sg_size >= SGid) { - exit_if_not_equal(acc_up[j], values[j - SGid], "shuffle_up"); - } - - /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ - exit_if_not_equal(acc_xor[j], - values[SGBeginGid + (SGLid ^ (SGid % sg_size))], - "shuffle_xor"); - SGLid++; - } - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} diff --git a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp index 6bfac67d4a..b8530ee1f9 100644 --- a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp +++ b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp @@ -1,11 +1,10 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out // -//==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- -// C++ -*--==// +//==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- C++ -*--==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -13,24 +12,4 @@ // //===----------------------------------------------------------------------===// -#include "generic-shuffle.hpp" - -using namespace sycl; - -int main() { - queue Queue; - if (Queue.get_device().is_host() or - !Queue.get_device().has(sycl::aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - - auto ComplexDoubleGenerator = [state = std::complex(0, 1)]() mutable { - return state += std::complex(2, 2); - }; - check_struct>( - Queue, ComplexDoubleGenerator); - - std::cout << "Test passed." << std::endl; - return 0; -} +#include "generic-shuffle.cpp" diff --git a/SYCL/SubGroup/info.cpp b/SYCL/SubGroup/info.cpp index 71ae70679b..1eb2017407 100644 --- a/SYCL/SubGroup/info.cpp +++ b/SYCL/SubGroup/info.cpp @@ -62,14 +62,30 @@ int main() { auto sg_sizes = Device.get_info(); for (auto r : {range<3>(3, 4, 5), range<3>(1, 1, 1), range<3>(4, 2, 1), range<3>(32, 3, 4), range<3>(7, 9, 11)}) { + Res = + Kernel + .get_sub_group_info( + Device, r); + bool Expected = + std::find(sg_sizes.begin(), sg_sizes.end(), Res) != sg_sizes.end(); + exit_if_not_equal(Expected, true, "max_sub_group_size"); + Res = Kernel.get_info( Device, r); - bool Expected = + Expected = std::find(sg_sizes.begin(), sg_sizes.end(), Res) != sg_sizes.end(); exit_if_not_equal(Expected, true, "max_sub_group_size"); } } + Res = + Kernel + .get_sub_group_info( + Device); + + /* Sub-group size is not specified in kernel or IL*/ + exit_if_not_equal(Res, 0, "compile_num_sub_groups"); + Res = Kernel.get_info( Device); @@ -84,6 +100,12 @@ int main() { std::end(Vec) && std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") != std::end(Vec)) { + Res = Kernel.get_sub_group_info< + info::kernel_sub_group::compile_sub_group_size>(Device); + + /* Required sub-group size is not specified in kernel or IL*/ + exit_if_not_equal(Res, 0, "compile_sub_group_size"); + Res = Kernel.get_info( Device); diff --git a/SYCL/SubGroup/load_store.cpp b/SYCL/SubGroup/load_store.cpp index 
44963631bf..8f4ada4444 100644 --- a/SYCL/SubGroup/load_store.cpp +++ b/SYCL/SubGroup/load_store.cpp @@ -15,10 +15,178 @@ // //===----------------------------------------------------------------------===// -#include "load_store.hpp" +#include "helper.hpp" +#include + +#include + +template class sycl_subgr; using namespace sycl; +template void check(queue &Queue) { + const int G = 512, L = 256; + + auto sg_sizes = Queue.get_device().get_info(); + size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end()); + + try { + nd_range<1> NdRange(G, L); + buffer syclbuf(G + max_sg_size * N); + buffer sgsizebuf(1); + { + auto acc = syclbuf.template get_access(); + for (int i = 0; i < G; i++) { + acc[i] = i; + acc[i] += 0.25; // Check that floating point types are not casted to int + } + } + Queue.submit([&](handler &cgh) { + auto acc = syclbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + accessor LocalMem( + {L + max_sg_size * N}, cgh); + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + auto SGid = SG.get_group_id().get(0); + auto SGsize = SG.get_max_local_range().get(0); + /* Avoid overlapping data ranges inside and between local groups */ + if (SGid % N == 0 && (SGid + N) * SGsize <= L) { + size_t SGOffset = SGid * SGsize; + size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; + multi_ptr mp( + &acc[WGSGoffset]); + multi_ptr MPL( + &LocalMem[SGOffset]); + + // half does not have full support for volatile type qualifier + using CVT = std::conditional_t, const T, + const volatile T>; + + multi_ptr mp_cv(mp); + multi_ptr MPL_CV(MPL); + // Add all values in read block + vec v(SG.load(mp)); + vec v_cv(SG.load(mp_cv)); + if (utils::cmp_vec( + v, v_cv)) // Store result only if same for non-cv and cv + SG.store(MPL, v); + vec t(utils::add_vec(SG.load(MPL))); + vec t_cv(utils::add_vec(SG.load(MPL_CV))); + if (utils::cmp_vec( + t, t_cv)) // Store result only if same for non-cv and 
cv + SG.store(mp, t); + } + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SGsize; + }); + }); + auto acc = syclbuf.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + size_t sg_size = sgsizeacc[0]; + int WGid = -1, SGid = 0; + for (int j = 0; j < (G - (sg_size * N)); j++) { + if (j % L % sg_size == 0) { + SGid++; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + T ref = 0; + if (SGid % N) { + ref = acc[j - (SGid % N) * sg_size]; + } else { + for (int i = 0; i < N; i++) { + ref += (T)(j + i * sg_size) + 0.25; + } + } + /* There is no defined out-of-range behavior for these functions. */ + if ((SGid + N) * sg_size <= L) { + std::string s("Vector<"); + s += std::string(typeid(ref).name()) + std::string(",") + + std::to_string(N) + std::string(">[") + std::to_string(j) + + std::string("]"); + exit_if_not_equal(acc[j], ref, s.c_str()); + } + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} +template void check(queue &Queue) { + const int G = 128, L = 64; + try { + nd_range<1> NdRange(G, L); + buffer syclbuf(G); + buffer sgsizebuf(1); + { + auto acc = syclbuf.template get_access(); + for (int i = 0; i < G; i++) { + acc[i] = i; + acc[i] += 0.1; // Check that floating point types are not casted to int + } + } + + Queue.submit([&](handler &cgh) { + auto acc = syclbuf.template get_access(cgh); + auto sgsizeacc = sgsizebuf.get_access(cgh); + accessor LocalMem( + {L}, cgh); + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + if (NdItem.get_global_id(0) == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + size_t SGOffset = + SG.get_group_id().get(0) * SG.get_max_local_range().get(0); + size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; + multi_ptr mp(&acc[WGSGoffset]); + multi_ptr MPL( + &LocalMem[SGOffset]); + + // half does not have full support for volatile type qualifier + using CVT = std::conditional_t, const T, + const volatile T>; + + 
multi_ptr mp_cv(mp); + multi_ptr MPL_CV(MPL); + T s = SG.load(mp) + (T)SG.get_local_id().get(0); + T s_cv = SG.load(mp_cv) + (T)SG.get_local_id().get(0); + if (s == s_cv) // Store result only if same for non-cv and cv + SG.store(MPL, s); + T t = SG.load(MPL) + (T)SG.get_local_id().get(0); + T t_cv = SG.load(MPL_CV) + (T)SG.get_local_id().get(0); + if (t == t_cv) // Store result only if same for non-cv and cv + SG.store(mp, t); + }); + }); + auto acc = syclbuf.template get_access(); + auto sgsizeacc = sgsizebuf.get_access(); + size_t sg_size = sgsizeacc[0]; + int WGid = -1, SGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + } + if (j % L == 0) { + WGid++; + SGid = 0; + } + std::string s("Scalar<"); + s += std::string(typeid(acc[j]).name()) + std::string(">[") + + std::to_string(j) + std::string("]"); + + exit_if_not_equal(acc[j], (T)(j + 2 * (j % L % sg_size)) + 0.1, + s.c_str()); + } + + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + int main() { queue Queue; if (Queue.get_device().is_host()) { @@ -100,6 +268,16 @@ int main() { check(Queue); check(Queue); check(Queue); +#ifdef ENABLE_FP64 + typedef double aligned_double __attribute__((aligned(16))); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); +#endif } std::cout << "Test passed." 
<< std::endl; return 0; diff --git a/SYCL/SubGroup/load_store.hpp b/SYCL/SubGroup/load_store.hpp deleted file mode 100644 index b52b1d0c05..0000000000 --- a/SYCL/SubGroup/load_store.hpp +++ /dev/null @@ -1,171 +0,0 @@ -#include "helper.hpp" -#include - -#include - -template class sycl_subgr; - -using namespace sycl; - -template void check(queue &Queue) { - const int G = 512, L = 256; - - auto sg_sizes = Queue.get_device().get_info(); - size_t max_sg_size = *std::max_element(sg_sizes.begin(), sg_sizes.end()); - - try { - nd_range<1> NdRange(G, L); - buffer syclbuf(G + max_sg_size * N); - buffer sgsizebuf(1); - { - auto acc = syclbuf.template get_access(); - for (int i = 0; i < G; i++) { - acc[i] = i; - acc[i] += 0.25; // Check that floating point types are not casted to int - } - } - Queue.submit([&](handler &cgh) { - auto acc = syclbuf.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - accessor LocalMem( - {L + max_sg_size * N}, cgh); - cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - auto SGid = SG.get_group_id().get(0); - auto SGsize = SG.get_max_local_range().get(0); - /* Avoid overlapping data ranges inside and between local groups */ - if (SGid % N == 0 && (SGid + N) * SGsize <= L) { - size_t SGOffset = SGid * SGsize; - size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; - multi_ptr mp( - &acc[WGSGoffset]); - multi_ptr MPL( - &LocalMem[SGOffset]); - - // half does not have full support for volatile type qualifier - using CVT = std::conditional_t, const T, - const volatile T>; - - multi_ptr mp_cv(mp); - multi_ptr MPL_CV(MPL); - // Add all values in read block - vec v(SG.load(mp)); - vec v_cv(SG.load(mp_cv)); - if (utils::cmp_vec( - v, v_cv)) // Store result only if same for non-cv and cv - SG.store(MPL, v); - vec t(utils::add_vec(SG.load(MPL))); - vec t_cv(utils::add_vec(SG.load(MPL_CV))); - if (utils::cmp_vec( - t, t_cv)) // Store result only if same for non-cv and cv - 
SG.store(mp, t); - } - if (NdItem.get_global_id(0) == 0) - sgsizeacc[0] = SGsize; - }); - }); - auto acc = syclbuf.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - size_t sg_size = sgsizeacc[0]; - int WGid = -1, SGid = 0; - for (int j = 0; j < (G - (sg_size * N)); j++) { - if (j % L % sg_size == 0) { - SGid++; - } - if (j % L == 0) { - WGid++; - SGid = 0; - } - T ref = 0; - if (SGid % N) { - ref = acc[j - (SGid % N) * sg_size]; - } else { - for (int i = 0; i < N; i++) { - ref += (T)(j + i * sg_size) + 0.25; - } - } - /* There is no defined out-of-range behavior for these functions. */ - if ((SGid + N) * sg_size <= L) { - std::string s("Vector<"); - s += std::string(typeid(ref).name()) + std::string(",") + - std::to_string(N) + std::string(">[") + std::to_string(j) + - std::string("]"); - exit_if_not_equal(acc[j], ref, s.c_str()); - } - } - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} -template void check(queue &Queue) { - const int G = 128, L = 64; - try { - nd_range<1> NdRange(G, L); - buffer syclbuf(G); - buffer sgsizebuf(1); - { - auto acc = syclbuf.template get_access(); - for (int i = 0; i < G; i++) { - acc[i] = i; - acc[i] += 0.1; // Check that floating point types are not casted to int - } - } - - Queue.submit([&](handler &cgh) { - auto acc = syclbuf.template get_access(cgh); - auto sgsizeacc = sgsizebuf.get_access(cgh); - accessor LocalMem( - {L}, cgh); - cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { - ext::oneapi::sub_group SG = NdItem.get_sub_group(); - if (NdItem.get_global_id(0) == 0) - sgsizeacc[0] = SG.get_max_local_range()[0]; - size_t SGOffset = - SG.get_group_id().get(0) * SG.get_max_local_range().get(0); - size_t WGSGoffset = NdItem.get_group(0) * L + SGOffset; - multi_ptr mp(&acc[WGSGoffset]); - multi_ptr MPL( - &LocalMem[SGOffset]); - - // half does not have full support for volatile type qualifier - using CVT = std::conditional_t, const T, - const volatile T>; - - 
multi_ptr mp_cv(mp); - multi_ptr MPL_CV(MPL); - T s = SG.load(mp) + (T)SG.get_local_id().get(0); - T s_cv = SG.load(mp_cv) + (T)SG.get_local_id().get(0); - if (s == s_cv) // Store result only if same for non-cv and cv - SG.store(MPL, s); - T t = SG.load(MPL) + (T)SG.get_local_id().get(0); - T t_cv = SG.load(MPL_CV) + (T)SG.get_local_id().get(0); - if (t == t_cv) // Store result only if same for non-cv and cv - SG.store(mp, t); - }); - }); - auto acc = syclbuf.template get_access(); - auto sgsizeacc = sgsizebuf.get_access(); - size_t sg_size = sgsizeacc[0]; - int WGid = -1, SGid = 0; - for (int j = 0; j < G; j++) { - if (j % L % sg_size == 0) { - SGid++; - } - if (j % L == 0) { - WGid++; - SGid = 0; - } - std::string s("Scalar<"); - s += std::string(typeid(acc[j]).name()) + std::string(">[") + - std::to_string(j) + std::string("]"); - - exit_if_not_equal(acc[j], (T)(j + 2 * (j % L % sg_size)) + 0.1, - s.c_str()); - } - - } catch (exception e) { - std::cout << "SYCL exception caught: " << e.what(); - exit(1); - } -} diff --git a/SYCL/SubGroup/load_store_aspect-fp64.cpp b/SYCL/SubGroup/load_store_aspect-fp64.cpp index 329515a4d1..c4011a2d28 100644 --- a/SYCL/SubGroup/load_store_aspect-fp64.cpp +++ b/SYCL/SubGroup/load_store_aspect-fp64.cpp @@ -1,5 +1,5 @@ // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out @@ -16,32 +16,4 @@ // //===----------------------------------------------------------------------===// -#include "load_store.hpp" - -using namespace sycl; - -int main() { - queue Queue; - if (Queue.get_device().is_host() or - !Queue.get_device().has(sycl::aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - std::string PlatformName = - Queue.get_device().get_platform().get_info(); - auto Vec = Queue.get_device().get_info(); - 
if (std::find(Vec.begin(), Vec.end(), "cl_intel_subgroups_long") != - std::end(Vec) || - PlatformName.find("CUDA") != std::string::npos) { - typedef double aligned_double __attribute__((aligned(16))); - check(Queue); - check(Queue); - check(Queue); - check(Queue); - check(Queue); - check(Queue); - check(Queue); - } - std::cout << "Test passed." << std::endl; - return 0; -} +#include "load_store.cpp" diff --git a/SYCL/USM/copy.cpp b/SYCL/USM/copy.cpp index c359962c45..3d9786548f 100644 --- a/SYCL/USM/copy.cpp +++ b/SYCL/USM/copy.cpp @@ -12,11 +12,15 @@ // RUN: %GPU_RUN_PLACEHOLDER %t1.out // RUN: %ACC_RUN_PLACEHOLDER %t1.out -#include "copy.hpp"; +#include using namespace sycl; using namespace sycl::usm; +template class transfer; + +static constexpr int N = 100; // should be even + struct test_struct { short a; int b; @@ -24,20 +28,83 @@ struct test_struct { long long d; half e; float f; +#ifdef ENABLE_FP64 + double g; +#endif }; bool operator==(const test_struct &lhs, const test_struct &rhs) { +#ifdef ENABLE_FP64 + return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d && + lhs.e == rhs.e && lhs.f == rhs.f && lhs.g == rhs.g; +#else return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d && lhs.e == rhs.e && lhs.f == rhs.f; +#endif +} + +template T *regular(queue q, alloc kind) { + return malloc(N, q, kind); +} + +template T *aligned(queue q, alloc kind) { + return aligned_alloc(alignof(long long), N, q, kind); +} + +template void test(queue q, T val, T *src, T *dst, bool dev_dst) { + q.fill(src, val, N).wait(); + + // Use queue::copy for the first half and handler::copy for the second + q.copy(src, dst, N / 2).wait(); + q.submit([&](handler &h) { h.copy(src + N / 2, dst + N / 2, N / 2); }).wait(); + + T *out = dst; + + std::array arr; + if (dev_dst) { // if copied to device, transfer data back to host + buffer buf{arr}; + q.submit([&](handler &h) { + accessor acc{buf, h}; + h.parallel_for>(N, [=](id<1> i) { acc[i] = dst[i]; 
}); + }); + out = arr.data(); + } + + for (int i = 0; i < N; ++i) { + assert(out[i] == val); + } + + free(src, q); + free(dst, q); +} + +template void runTests(queue q, T val, alloc kind1, alloc kind2) { + bool dev_dst1 = (kind1 == alloc::device); + bool dev_dst2 = (kind2 == alloc::device); + test(q, val, regular(q, kind1), regular(q, kind2), dev_dst2); + test(q, val, regular(q, kind2), regular(q, kind1), dev_dst1); + test(q, val, aligned(q, kind1), aligned(q, kind2), dev_dst2); + test(q, val, aligned(q, kind2), aligned(q, kind1), dev_dst1); + test(q, val, regular(q, kind1), aligned(q, kind2), dev_dst2); + test(q, val, regular(q, kind2), aligned(q, kind1), dev_dst1); + test(q, val, aligned(q, kind1), regular(q, kind2), dev_dst2); + test(q, val, aligned(q, kind2), regular(q, kind1), dev_dst1); } int main() { queue q; auto dev = q.get_device(); +#ifdef ENABLE_FP64 + test_struct test_obj{4, 42, 424, 4242, 4.2f, 4.242f, 4.24242}; +#else test_struct test_obj{4, 42, 424, 4242, 4.2f, 4.242f}; +#endif if (dev.has(aspect::usm_host_allocations)) { +#ifdef ENABLE_FP64 + runTests(q, 4.24242, alloc::host, alloc::host); +#endif runTests(q, 4, alloc::host, alloc::host); runTests(q, 42, alloc::host, alloc::host); runTests(q, 424, alloc::host, alloc::host); @@ -48,6 +115,9 @@ int main() { } if (dev.has(aspect::usm_shared_allocations)) { +#ifdef ENABLE_FP64 + runTests(q, 4.24242, alloc::shared, alloc::shared); +#endif runTests(q, 4, alloc::shared, alloc::shared); runTests(q, 42, alloc::shared, alloc::shared); runTests(q, 424, alloc::shared, alloc::shared); @@ -58,6 +128,9 @@ int main() { } if (dev.has(aspect::usm_device_allocations)) { +#ifdef ENABLE_FP64 + runTests(q, 4.24242, alloc::device, alloc::device); +#endif runTests(q, 4, alloc::device, alloc::device); runTests(q, 42, alloc::device, alloc::device); runTests(q, 424, alloc::device, alloc::device); @@ -69,6 +142,9 @@ int main() { if (dev.has(aspect::usm_host_allocations) && dev.has(aspect::usm_shared_allocations)) { +#ifdef 
ENABLE_FP64 + runTests(q, 4.24242, alloc::host, alloc::shared); +#endif runTests(q, 4, alloc::host, alloc::shared); runTests(q, 42, alloc::host, alloc::shared); runTests(q, 424, alloc::host, alloc::shared); @@ -80,6 +156,9 @@ int main() { if (dev.has(aspect::usm_host_allocations) && dev.has(aspect::usm_device_allocations)) { +#ifdef ENABLE_FP64 + runTests(q, 4.24242, alloc::host, alloc::device); +#endif runTests(q, 4, alloc::host, alloc::device); runTests(q, 42, alloc::host, alloc::device); runTests(q, 424, alloc::host, alloc::device); @@ -91,6 +170,9 @@ int main() { if (dev.has(aspect::usm_shared_allocations) && dev.has(aspect::usm_device_allocations)) { +#ifdef ENABLE_FP64 + runTests(q, 4.24242, alloc::shared, alloc::device); +#endif runTests(q, 4, alloc::shared, alloc::device); runTests(q, 42, alloc::shared, alloc::device); runTests(q, 424, alloc::shared, alloc::device); diff --git a/SYCL/USM/copy.hpp b/SYCL/USM/copy.hpp deleted file mode 100644 index 1b0a7b0f15..0000000000 --- a/SYCL/USM/copy.hpp +++ /dev/null @@ -1,56 +0,0 @@ -#include - -using namespace sycl; -using namespace sycl::usm; - -template class transfer; - -static constexpr int N = 100; // should be even - -template T *regular(queue q, alloc kind) { - return malloc(N, q, kind); -} - -template T *aligned(queue q, alloc kind) { - return aligned_alloc(alignof(long long), N, q, kind); -} - -template void test(queue q, T val, T *src, T *dst, bool dev_dst) { - q.fill(src, val, N).wait(); - - // Use queue::copy for the first half and handler::copy for the second - q.copy(src, dst, N / 2).wait(); - q.submit([&](handler &h) { h.copy(src + N / 2, dst + N / 2, N / 2); }).wait(); - - T *out = dst; - - std::array arr; - if (dev_dst) { // if copied to device, transfer data back to host - buffer buf{arr}; - q.submit([&](handler &h) { - accessor acc{buf, h}; - h.parallel_for>(N, [=](id<1> i) { acc[i] = dst[i]; }); - }); - out = arr.data(); - } - - for (int i = 0; i < N; ++i) { - assert(out[i] == val); - } - - 
free(src, q); - free(dst, q); -} - -template void runTests(queue q, T val, alloc kind1, alloc kind2) { - bool dev_dst1 = (kind1 == alloc::device); - bool dev_dst2 = (kind2 == alloc::device); - test(q, val, regular(q, kind1), regular(q, kind2), dev_dst2); - test(q, val, regular(q, kind2), regular(q, kind1), dev_dst1); - test(q, val, aligned(q, kind1), aligned(q, kind2), dev_dst2); - test(q, val, aligned(q, kind2), aligned(q, kind1), dev_dst1); - test(q, val, regular(q, kind1), aligned(q, kind2), dev_dst2); - test(q, val, regular(q, kind2), aligned(q, kind1), dev_dst1); - test(q, val, aligned(q, kind1), regular(q, kind2), dev_dst2); - test(q, val, aligned(q, kind2), regular(q, kind1), dev_dst1); -} \ No newline at end of file diff --git a/SYCL/USM/copy_aspect-fp64.cpp b/SYCL/USM/copy_aspect-fp64.cpp index e1aea79f2f..41ab15afcb 100644 --- a/SYCL/USM/copy_aspect-fp64.cpp +++ b/SYCL/USM/copy_aspect-fp64.cpp @@ -1,5 +1,4 @@ -//==---- copy_aspect-fp64.cp - USM copy test -//------------------------------------------==// +//==---- copy_aspect-fp64.cp - USM copy test ------------------------------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -8,53 +7,10 @@ //===----------------------------------------------------------------------===// // // REQUIRES: aspect-fp64 -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t1.out // RUN: %HOST_RUN_PLACEHOLDER %t1.out // RUN: %CPU_RUN_PLACEHOLDER %t1.out // RUN: %GPU_RUN_PLACEHOLDER %t1.out // RUN: %ACC_RUN_PLACEHOLDER %t1.out -#include "copy.hpp"; - -using namespace sycl; -using namespace sycl::usm; - -int main() { - queue q; - - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - - auto dev = q.get_device(); - - if (dev.has(aspect::usm_host_allocations)) { - runTests(q, 4.24242, alloc::host, alloc::host); - } - - if (dev.has(aspect::usm_shared_allocations)) { - runTests(q, 4.24242, alloc::shared, alloc::shared); - } - - if (dev.has(aspect::usm_device_allocations)) { - runTests(q, 4.24242, alloc::device, alloc::device); - } - - if (dev.has(aspect::usm_host_allocations) && - dev.has(aspect::usm_shared_allocations)) { - runTests(q, 4.24242, alloc::host, alloc::shared); - } - - if (dev.has(aspect::usm_host_allocations) && - dev.has(aspect::usm_device_allocations)) { - runTests(q, 4.24242, alloc::host, alloc::device); - } - - if (dev.has(aspect::usm_shared_allocations) && - dev.has(aspect::usm_device_allocations)) { - runTests(q, 4.24242, alloc::shared, alloc::device); - } - - return 0; -} +#include "copy.cpp"; diff --git a/SYCL/USM/fill.cpp b/SYCL/USM/fill.cpp index 3a4d5a8fc7..2f37ad58d2 100644 --- a/SYCL/USM/fill.cpp +++ b/SYCL/USM/fill.cpp @@ -12,10 +12,15 @@ // RUN: %GPU_RUN_PLACEHOLDER %t1.out // RUN: %ACC_RUN_PLACEHOLDER %t1.out -#include "fill.hpp" +#include using namespace sycl; +template class usm_device_transfer; +template class usm_aligned_device_transfer; + +static constexpr int N = 100; + struct test_struct { short a; int b; @@ -23,11 +28,103 @@ struct test_struct { long long d; sycl::half e; float f; +#ifdef ENABLE_FP64 
+ double g; +#endif }; bool operator==(const test_struct &lhs, const test_struct &rhs) { +#ifdef ENABLE_FP64 + return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d && + lhs.e == rhs.e && lhs.f == rhs.f && lhs.g == rhs.g; +#else return lhs.a == rhs.a && lhs.b == rhs.b && lhs.c == rhs.c && lhs.d == rhs.d && lhs.e == rhs.e && lhs.f == rhs.f; +#endif +} + +template +void runHostTests(device dev, context ctxt, queue q, T val) { + T *array; + + array = (T *)malloc_host(N * sizeof(T), q); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + for (int i = 0; i < N; ++i) { + assert(array[i] == val); + } + free(array, ctxt); + + array = (T *)aligned_alloc_host(alignof(long long), N * sizeof(T), ctxt); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + for (int i = 0; i < N; ++i) { + assert(array[i] == val); + } + free(array, ctxt); +} + +template +void runSharedTests(device dev, context ctxt, queue q, T val) { + T *array; + + array = (T *)malloc_shared(N * sizeof(T), q); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + for (int i = 0; i < N; ++i) { + assert(array[i] == val); + } + free(array, ctxt); + + array = + (T *)aligned_alloc_shared(alignof(long long), N * sizeof(T), dev, ctxt); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + for (int i = 0; i < N; ++i) { + assert(array[i] == val); + } + free(array, ctxt); +} + +template +void runDeviceTests(device dev, context ctxt, queue q, T val) { + T *array; + std::vector out; + out.resize(N); + + array = (T *)malloc_device(N * sizeof(T), q); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + + { + buffer buf{&out[0], range<1>{N}}; + q.submit([&](handler &h) { + auto acc = buf.template get_access(h); + h.parallel_for>( + range<1>(N), [=](id<1> item) { acc[item] = array[item]; }); + }).wait(); + } + + for (int i = 0; i < N; ++i) { + assert(out[i] == val); + } + free(array, ctxt); + + out.clear(); + out.resize(N); + + array = + (T 
*)aligned_alloc_device(alignof(long long), N * sizeof(T), dev, ctxt); + q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); + + { + buffer buf{&out[0], range<1>{N}}; + q.submit([&](handler &h) { + auto acc = buf.template get_access(h); + h.parallel_for>( + range<1>(N), [=](id<1> item) { acc[item] = array[item]; }); + }).wait(); + } + + for (int i = 0; i < N; ++i) { + assert(out[i] == val); + } + free(array, ctxt); } int main() { @@ -35,9 +132,16 @@ int main() { auto dev = q.get_device(); auto ctxt = q.get_context(); +#ifdef ENABLE_FP64 + test_struct test_obj{4, 42, 424, 4242, 4.2f, 4.242, 4.24242}; +#else test_struct test_obj{4, 42, 424, 4242, 4.2f, 4.242}; +#endif if (dev.get_info()) { +#ifdef ENABLE_FP64 + runHostTests(dev, ctxt, q, 4.24242); +#endif runHostTests(dev, ctxt, q, 4); runHostTests(dev, ctxt, q, 42); runHostTests(dev, ctxt, q, 424); @@ -48,6 +152,9 @@ int main() { } if (dev.get_info()) { +#ifdef ENABLE_FP64 + runSharedTests(dev, ctxt, q, 4.24242); +#endif runSharedTests(dev, ctxt, q, 4); runSharedTests(dev, ctxt, q, 42); runSharedTests(dev, ctxt, q, 424); @@ -58,6 +165,9 @@ int main() { } if (dev.get_info()) { +#ifdef ENABLE_FP64 + runDeviceTests(dev, ctxt, q, 4.24242); +#endif runDeviceTests(dev, ctxt, q, 4); runDeviceTests(dev, ctxt, q, 42); runDeviceTests(dev, ctxt, q, 420); diff --git a/SYCL/USM/fill.hpp b/SYCL/USM/fill.hpp deleted file mode 100644 index a28ecbfff2..0000000000 --- a/SYCL/USM/fill.hpp +++ /dev/null @@ -1,92 +0,0 @@ -#include - -using namespace sycl; - -template class usm_device_transfer; -template class usm_aligned_device_transfer; - -static constexpr int N = 100; - -template -void runHostTests(device dev, context ctxt, queue q, T val) { - T *array; - - array = (T *)malloc_host(N * sizeof(T), q); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - for (int i = 0; i < N; ++i) { - assert(array[i] == val); - } - free(array, ctxt); - - array = (T *)aligned_alloc_host(alignof(long long), N * sizeof(T), ctxt); - 
q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - for (int i = 0; i < N; ++i) { - assert(array[i] == val); - } - free(array, ctxt); -} - -template -void runSharedTests(device dev, context ctxt, queue q, T val) { - T *array; - - array = (T *)malloc_shared(N * sizeof(T), q); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - for (int i = 0; i < N; ++i) { - assert(array[i] == val); - } - free(array, ctxt); - - array = - (T *)aligned_alloc_shared(alignof(long long), N * sizeof(T), dev, ctxt); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - for (int i = 0; i < N; ++i) { - assert(array[i] == val); - } - free(array, ctxt); -} - -template -void runDeviceTests(device dev, context ctxt, queue q, T val) { - T *array; - std::vector out; - out.resize(N); - - array = (T *)malloc_device(N * sizeof(T), q); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - - { - buffer buf{&out[0], range<1>{N}}; - q.submit([&](handler &h) { - auto acc = buf.template get_access(h); - h.parallel_for>( - range<1>(N), [=](id<1> item) { acc[item] = array[item]; }); - }).wait(); - } - - for (int i = 0; i < N; ++i) { - assert(out[i] == val); - } - free(array, ctxt); - - out.clear(); - out.resize(N); - - array = - (T *)aligned_alloc_device(alignof(long long), N * sizeof(T), dev, ctxt); - q.submit([&](handler &h) { h.fill(array, val, N); }).wait(); - - { - buffer buf{&out[0], range<1>{N}}; - q.submit([&](handler &h) { - auto acc = buf.template get_access(h); - h.parallel_for>( - range<1>(N), [=](id<1> item) { acc[item] = array[item]; }); - }).wait(); - } - - for (int i = 0; i < N; ++i) { - assert(out[i] == val); - } - free(array, ctxt); -} diff --git a/SYCL/USM/fill_aspect-fp64.cpp b/SYCL/USM/fill_aspect-fp64.cpp index acc00c06fa..d67bbeba65 100644 --- a/SYCL/USM/fill_aspect-fp64.cpp +++ b/SYCL/USM/fill_aspect-fp64.cpp @@ -8,38 +8,10 @@ //===----------------------------------------------------------------------===// // // REQUIRES: aspect-fp64 -// 
RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t1.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t1.out // RUN: %HOST_RUN_PLACEHOLDER %t1.out // RUN: %CPU_RUN_PLACEHOLDER %t1.out // RUN: %GPU_RUN_PLACEHOLDER %t1.out // RUN: %ACC_RUN_PLACEHOLDER %t1.out -#include "fill.hpp"; - -using namespace sycl; - -int main() { - queue q; - - if (!q.get_device().has(aspect::fp64)) { - std::cout << "Skipping test\n"; - return 0; - } - - auto dev = q.get_device(); - auto ctxt = q.get_context(); - - if (dev.get_info()) { - runHostTests(dev, ctxt, q, 4.24242); - } - - if (dev.get_info()) { - runHostTests(dev, ctxt, q, 4.24242); - } - - if (dev.get_info()) { - runHostTests(dev, ctxt, q, 4.24242); - } - - return 0; -} +#include "fill.cpp"; From 77cd416319f34205ff1245a73c87583d0314f829 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 23 Aug 2022 15:12:48 +0800 Subject: [PATCH 18/35] fix 3-way merge issue --- SYCL/InlineAsm/asm_float_add.cpp | 67 +++++++++++ SYCL/InlineAsm/asm_float_imm_arg.cpp | 60 ++++++++++ .../commandlist/Inputs/FindPrimesSYCL.cpp | 112 ++++++++++++++++++ 3 files changed, 239 insertions(+) create mode 100644 SYCL/InlineAsm/asm_float_add.cpp create mode 100644 SYCL/InlineAsm/asm_float_imm_arg.cpp create mode 100644 SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp diff --git a/SYCL/InlineAsm/asm_float_add.cpp b/SYCL/InlineAsm/asm_float_add.cpp new file mode 100644 index 0000000000..f633743449 --- /dev/null +++ b/SYCL/InlineAsm/asm_float_add.cpp @@ -0,0 +1,67 @@ +// UNSUPPORTED: cuda || hip_nvidia +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "include/asmhelper.h" +#include +#include +#include +#include + +using dataType = sycl::cl_float; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) + : WithInputBuffers(input1, input2), WithOutputBuffer( + input1.size()) {} + 
+ void operator()(sycl::handler &cgh) { + auto A = + this->getInputBuffer(0).template get_access( + cgh); + auto B = + this->getInputBuffer(1).template get_access( + cgh); + auto C = + this->getOutputBuffer().template get_access( + cgh); + + cgh.parallel_for>( + sycl::range<1>{this->getOutputBufferSize()}, + [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] { +#if defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID] + B[wiID]; +#endif + }); + } +}; + +int main() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), + inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = (float)1 / std::pow(2, i); + inputB[i] = (float)2 / std::pow(2, i); + } + + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; + + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + if (C[i] != inputA[i] + inputB[i]) { + std::cerr << "At index: " << i << ". 
"; + std::cerr << C[i] << " != " << inputA[i] + inputB[i] << "\n"; + return 1; + } + } + + return 0; +} diff --git a/SYCL/InlineAsm/asm_float_imm_arg.cpp b/SYCL/InlineAsm/asm_float_imm_arg.cpp new file mode 100644 index 0000000000..d2fb47000f --- /dev/null +++ b/SYCL/InlineAsm/asm_float_imm_arg.cpp @@ -0,0 +1,60 @@ +// UNSUPPORTED: cuda || hip_nvidia +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "include/asmhelper.h" +#include +#include +#include +#include + +constexpr float IMM_ARGUMENT = 0.5; +using dataType = sycl::cl_float; + +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) + : WithInputBuffers(input), WithOutputBuffer(input.size()) {} + + void operator()(sycl::handler &cgh) { + auto A = + this->getInputBuffer(0).template get_access( + cgh); + auto B = + this->getOutputBuffer().template get_access( + cgh); + + cgh.parallel_for>( + sycl::range<1>{this->getOutputBufferSize()}, + [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] { +#if defined(__SYCL_DEVICE_ONLY__) + asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" + : "=rw"(B[wiID]) + : "rw"(A[wiID]), "i"(IMM_ARGUMENT)); +#else + B[wiID] = A[wiID] * IMM_ARGUMENT; +#endif + }); + } +}; + +int main() { + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = (float)1 / std::pow(2, i); + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != input[i] * IMM_ARGUMENT) { + std::cerr << "At index: " << i << ". 
"; + std::cerr << B[i] << " != " << input[i] * IMM_ARGUMENT << "\n"; + return 1; + } + } + return 0; +} diff --git a/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp b/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp new file mode 100644 index 0000000000..c7ff46673d --- /dev/null +++ b/SYCL/Regression/commandlist/Inputs/FindPrimesSYCL.cpp @@ -0,0 +1,112 @@ +#include "FindPrimesSYCL.h" + +#include +#include +#include +#include +#include + +#include +constexpr sycl::access::mode sycl_read = sycl::access::mode::read; +constexpr sycl::access::mode sycl_write = sycl::access::mode::write; + +using namespace std; + +/* This is the class used to name the kernel for the runtime. + * This must be done when the kernel is expressed as a lambda. */ +class FindPrimeI; + +float find_prime_s(work *w) { + + // need this for the lambda capture and sycl queue submit + auto &VRI = w->VRI; + auto N = w->VRI.size(); + auto niter = w->niter; + auto nitems = w->nitems; + + sycl::range<1> numOfItems{nitems}; + sycl::buffer bufferR(VRI.data(), N); + + auto start = std::chrono::high_resolution_clock::now(); + + sycl::event event; + +#ifdef __SYCL_DEVICE_ONLY__ +#define CONSTANT __attribute__((opencl_constant)) +#else +#define CONSTANT +#endif + + if (w->queueLock) { + w->queueLock->lock(); + } + event = w->deviceQueue->submit([&](sycl::handler &cgh) { + auto accessorR = bufferR.template get_access(cgh); + sycl::stream cout(1024, 256, cgh); + + auto k2 = [=](sycl::item<1> item) { + size_t maxstride = 1 + N / nitems; + for (size_t istride = 0; istride < maxstride; ++istride) { + unsigned int number = istride * nitems + item.get_linear_id(); + if (number < N) { + for (size_t i = 0; i < niter; ++i) { + bool is_prime = !(number % 2 == 0); + const int upper_bound = sycl::sqrt(1.0f * number) + 1; + int k = 3; + while (k < upper_bound && is_prime) { + is_prime = !(number % k == 0); + k += 2; // don't have to test even numbers + } + accessorR[number] = is_prime; + } + } else { + break; + } 
+ } + }; + cgh.parallel_for(numOfItems, k2); + }); + if (w->queueLock) { + w->queueLock->unlock(); + } + + // deviceQueue.wait(); + try { + event.wait_and_throw(); + } catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL exception:\n" + << e.what() << std::endl; + } + + auto stop = std::chrono::high_resolution_clock::now(); + + auto submit_time = + event.get_profiling_info(); + auto start_time = + event.get_profiling_info(); + auto end_time = + event.get_profiling_info(); + + w->start_time = start_time; + w->end_time = end_time; + w->submit_time = submit_time; + w->start = start; + w->stop = stop; + + // std::cout << "submit time: " << submission_time + // << std::endl; + // std::cout << "execut time: " << execution_time + // << std::endl; + + w->result = 0; + for (auto &e : w->VRI) { + if (e) { + ++w->result; + } + } + + std::chrono::duration diff{0}; + diff = (stop - start); + + return diff.count(); +} From b2c02ecdec28cd314cdf7cced3f9bfb3751b9c22 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 23 Aug 2022 15:39:18 +0800 Subject: [PATCH 19/35] fix clang-format issue --- SYCL/Basic/buffer/buffer.cpp | 245 +++++++++--------- .../api/bin_and_cmp_ops_heavy_aspect-fp64.cpp | 2 +- SYCL/ESIMD/regression/Inputs/dgetrf.hpp | 1 - SYCL/ESIMD/regression/dgetrf_8x8.cpp | 1 - SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp | 3 +- SYCL/USM/copy_aspect-fp64.cpp | 3 +- 6 files changed, 128 insertions(+), 127 deletions(-) diff --git a/SYCL/Basic/buffer/buffer.cpp b/SYCL/Basic/buffer/buffer.cpp index 6814b10a06..630a5ce814 100644 --- a/SYCL/Basic/buffer/buffer.cpp +++ b/SYCL/Basic/buffer/buffer.cpp @@ -585,143 +585,144 @@ int main() { for (size_t i = 0; i < size; i++) { if (bool_vector[i] != true || int_vector[i] != 3) { #ifdef ENABLE_FP64 - if (bool_vector[i] != true || int_vector[i] != 3 || - double_vector[i] != 7.5) { + if (bool_vector[i] != true || int_vector[i] != 3 || + double_vector[i] != 7.5) { #endif - assert(false && "Data was not copied back"); 
- return 1; + assert(false && "Data was not copied back"); + return 1; + } } } - } - // Check that data is not copied back after canceling write-back using - // set_write_back - { - std::vector data1(10, -1); + // Check that data is not copied back after canceling write-back using + // set_write_back { - buffer b(range<1>(10)); - b.set_final_data(data1.data()); - b.set_write_back(false); - queue myQueue; - myQueue.submit([&](handler &cgh) { - auto B = b.get_access(cgh); - cgh.parallel_for(range<1>{10}, - [=](id<1> index) { B[index] = 0; }); - }); - } - // Data is not copied back because write-back is canceled - for (int i = 0; i < 10; i++) - if (data1[i] != -1) { - assert(false); - failed = true; + std::vector data1(10, -1); + { + buffer b(range<1>(10)); + b.set_final_data(data1.data()); + b.set_write_back(false); + queue myQueue; + myQueue.submit([&](handler &cgh) { + auto B = b.get_access(cgh); + cgh.parallel_for(range<1>{10}, + [=](id<1> index) { B[index] = 0; }); + }); } - } + // Data is not copied back because write-back is canceled + for (int i = 0; i < 10; i++) + if (data1[i] != -1) { + assert(false); + failed = true; + } + } - { - std::vector data1(10, -1); - std::vector data2(10, -2); { - buffer a(data1.data(), range<1>(10)); - buffer b(data2.data(), range<1>(10)); - queue myQueue; - myQueue.submit([&](handler &cgh) { - auto A = a.get_access(cgh); - auto B = b.get_access(cgh); - cgh.parallel_for( - range<1>{10}, [=](id<1> index) { A[index] = 0; }); - }); - } // Data is copied back - for (int i = 0; i < 10; i++) - assert(data2[i] == -2); - for (int i = 0; i < 10; i++) - assert(data1[i] == 0); - } + std::vector data1(10, -1); + std::vector data2(10, -2); + { + buffer a(data1.data(), range<1>(10)); + buffer b(data2.data(), range<1>(10)); + queue myQueue; + myQueue.submit([&](handler &cgh) { + auto A = a.get_access(cgh); + auto B = b.get_access(cgh); + cgh.parallel_for( + range<1>{10}, [=](id<1> index) { A[index] = 0; }); + }); + } // Data is copied back + for 
(int i = 0; i < 10; i++) + assert(data2[i] == -2); + for (int i = 0; i < 10; i++) + assert(data1[i] == 0); + } - { - std::vector data1(10, -1); - std::vector data2(10, -2); { - buffer a(data1.data(), range<1>(10)); - buffer b(data2); - accessor - A(a); - accessor - B(b); - queue myQueue; - myQueue.submit([&](handler &cgh) { - cgh.require(A); - cgh.require(B); - cgh.parallel_for( - range<1>{10}, [=](id<1> index) { A[index] = 0; }); - }); - } // Data is copied back - for (int i = 0; i < 10; i++) - assert(data2[i] == -2); - for (int i = 0; i < 10; i++) - assert(data1[i] == 0); - } - - { - int data[10]; - void *voidPtr = (void *)data; - buffer b(range<1>(10)); - b.set_final_data(voidPtr); - } + std::vector data1(10, -1); + std::vector data2(10, -2); + { + buffer a(data1.data(), range<1>(10)); + buffer b(data2); + accessor + A(a); + accessor + B(b); + queue myQueue; + myQueue.submit([&](handler &cgh) { + cgh.require(A); + cgh.require(B); + cgh.parallel_for( + range<1>{10}, [=](id<1> index) { A[index] = 0; }); + }); + } // Data is copied back + for (int i = 0; i < 10; i++) + assert(data2[i] == -2); + for (int i = 0; i < 10; i++) + assert(data1[i] == 0); + } - { - std::allocator buf_alloc; - std::shared_ptr data(new float8[8], [](float8 *p) { delete[] p; }); - sycl::buffer> b(data, sycl::range<1>(8), - buf_alloc); - } + { + int data[10]; + void *voidPtr = (void *)data; + buffer b(range<1>(10)); + b.set_final_data(voidPtr); + } - { - constexpr int Size = 6; - sycl::buffer Buf_1(Size); - sycl::buffer Buf_2(Size / 2); + { + std::allocator buf_alloc; + std::shared_ptr data(new float8[8], + [](float8 *p) { delete[] p; }); + sycl::buffer> b(data, sycl::range<1>(8), + buf_alloc); + } { - auto AccA = Buf_1.get_access(Size / 2); - auto AccB = Buf_2.get_access(Size / 2); - assert(AccA.get_size() == AccB.get_size()); - assert(AccA.get_range() == AccB.get_range()); - assert(AccA.get_count() == AccB.get_count()); + constexpr int Size = 6; + sycl::buffer Buf_1(Size); + sycl::buffer 
Buf_2(Size / 2); + + { + auto AccA = Buf_1.get_access(Size / 2); + auto AccB = Buf_2.get_access(Size / 2); + assert(AccA.get_size() == AccB.get_size()); + assert(AccA.get_range() == AccB.get_range()); + assert(AccA.get_count() == AccB.get_count()); + } + + auto AH0 = accessor(Buf_1); + auto BH0 = accessor(Buf_2); + assert(AH0.get_size() == sizeof(char)); + assert(BH0.get_size() == sizeof(char)); + assert(AH0.get_count() == 1); + assert(BH0.get_count() == 1); + + queue Queue; + Queue.submit([&](handler &CGH) { + auto AK0 = + accessor( + Buf_1, CGH); + auto BK0 = + accessor( + Buf_2, CGH); + assert(AK0.get_size() == sizeof(char)); + assert(BK0.get_size() == sizeof(char)); + assert(AK0.get_count() == 1); + assert(BK0.get_count() == 1); + CGH.single_task([]() {}); + }); } - auto AH0 = accessor(Buf_1); - auto BH0 = accessor(Buf_2); - assert(AH0.get_size() == sizeof(char)); - assert(BH0.get_size() == sizeof(char)); - assert(AH0.get_count() == 1); - assert(BH0.get_count() == 1); - - queue Queue; - Queue.submit([&](handler &CGH) { - auto AK0 = - accessor( - Buf_1, CGH); - auto BK0 = - accessor( - Buf_2, CGH); - assert(AK0.get_size() == sizeof(char)); - assert(BK0.get_size() == sizeof(char)); - assert(AK0.get_count() == 1); - assert(BK0.get_count() == 1); - CGH.single_task([]() {}); - }); - } + { + int data = 5; + buffer Buffer(&data, range<1>(1)); + assert(Buffer.size() == 1); + assert(Buffer.byte_size() == 1 * sizeof(int)); + } - { - int data = 5; - buffer Buffer(&data, range<1>(1)); - assert(Buffer.size() == 1); - assert(Buffer.byte_size() == 1 * sizeof(int)); + // TODO tests with mutex property + return failed; } - - // TODO tests with mutex property - return failed; -} diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp index 4aee26383f..af9a7ccd53 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp @@ -1,5 +1,5 @@ 
//==--------------- bin_un_cmp_ops_heavy_aspect-fp64.cpp - DPC++ ESIMD -//on-device test -==// +// on-device test -==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp index e8e09ad26d..d3bd85950f 100644 --- a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp +++ b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp @@ -496,4 +496,3 @@ int main(int argc, char *argv[]) { } return exit_status; } - diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index a088516d28..c680df7df9 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -323,4 +323,3 @@ int main(int argc, char *argv[]) { } return exit_status; } - diff --git a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp index b8530ee1f9..4643b64a56 100644 --- a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp +++ b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp @@ -4,7 +4,8 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out // -//==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- C++ -*--==// +//==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- +//C++ -*--==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/SYCL/USM/copy_aspect-fp64.cpp b/SYCL/USM/copy_aspect-fp64.cpp index 41ab15afcb..6c682134ad 100644 --- a/SYCL/USM/copy_aspect-fp64.cpp +++ b/SYCL/USM/copy_aspect-fp64.cpp @@ -1,4 +1,5 @@ -//==---- copy_aspect-fp64.cp - USM copy test ------------------------------------------==// +//==---- copy_aspect-fp64.cp - USM copy test +//------------------------------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. From 9c24b687e5c1af71eae57cebe90fb895e9561810 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 23 Aug 2022 15:48:36 +0800 Subject: [PATCH 20/35] fix clang-format issue --- SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp index 4643b64a56..607c9e0261 100644 --- a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp +++ b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp @@ -5,7 +5,7 @@ // RUN: %ACC_RUN_PLACEHOLDER %t.out // //==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- -//C++ -*--==// +// C++ -*--==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
From a5e61539ba5aeab9c5d90e65b8c17b679e89efa0 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 23 Aug 2022 16:34:32 +0800 Subject: [PATCH 21/35] fix 3-way merge issue in subgroup_info test, and split 'double' code into info_aspect-fp64.cpp --- SYCL/SubGroup/info.cpp | 28 +++++----------------------- SYCL/SubGroup/info_aspect-fp64.cpp | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 23 deletions(-) create mode 100644 SYCL/SubGroup/info_aspect-fp64.cpp diff --git a/SYCL/SubGroup/info.cpp b/SYCL/SubGroup/info.cpp index 1eb2017407..e44afdd5ad 100644 --- a/SYCL/SubGroup/info.cpp +++ b/SYCL/SubGroup/info.cpp @@ -40,7 +40,11 @@ int main() { auto Kernel = KB.get_kernel(KernelID); range<2> GlobalRange{50, 40}; +#ifdef ENABLE_FP64 + buffer ABuf{GlobalRange}, BBuf{GlobalRange}, CBuf{GlobalRange}; +#else buffer ABuf{GlobalRange}, BBuf{GlobalRange}, CBuf{GlobalRange}; +#endif Queue.submit([&](sycl::handler &cgh) { auto A = ABuf.get_access(cgh); @@ -62,30 +66,14 @@ int main() { auto sg_sizes = Device.get_info(); for (auto r : {range<3>(3, 4, 5), range<3>(1, 1, 1), range<3>(4, 2, 1), range<3>(32, 3, 4), range<3>(7, 9, 11)}) { - Res = - Kernel - .get_sub_group_info( - Device, r); - bool Expected = - std::find(sg_sizes.begin(), sg_sizes.end(), Res) != sg_sizes.end(); - exit_if_not_equal(Expected, true, "max_sub_group_size"); - Res = Kernel.get_info( Device, r); - Expected = + bool Expected = std::find(sg_sizes.begin(), sg_sizes.end(), Res) != sg_sizes.end(); exit_if_not_equal(Expected, true, "max_sub_group_size"); } } - Res = - Kernel - .get_sub_group_info( - Device); - - /* Sub-group size is not specified in kernel or IL*/ - exit_if_not_equal(Res, 0, "compile_num_sub_groups"); - Res = Kernel.get_info( Device); @@ -100,12 +88,6 @@ int main() { std::end(Vec) && std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") != std::end(Vec)) { - Res = Kernel.get_sub_group_info< - info::kernel_sub_group::compile_sub_group_size>(Device); - - /* Required sub-group 
size is not specified in kernel or IL*/ - exit_if_not_equal(Res, 0, "compile_sub_group_size"); - Res = Kernel.get_info( Device); diff --git a/SYCL/SubGroup/info_aspect-fp64.cpp b/SYCL/SubGroup/info_aspect-fp64.cpp new file mode 100644 index 0000000000..b7a1589798 --- /dev/null +++ b/SYCL/SubGroup/info_aspect-fp64.cpp @@ -0,0 +1,16 @@ +// See https://github.com/intel/llvm/issues/2922 for more info +// UNSUPPORTED: cuda || hip +// REQUIRES: aspect-fp64 +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +//==------------- info.cpp - SYCL sub_group parameters test ----*- C++ -*---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "info.cpp" From 49bb4fea8efdd0a6b2c7e9dc044576264686701f Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Wed, 24 Aug 2022 10:13:47 +0800 Subject: [PATCH 22/35] Added common comment in _aspect-fp64.cpp, fixed typo issue and changed to use C++ style type aliases. 
--- .../assignment_atomic64_aspect-fp64.cpp | 5 ++ ...ssignment_atomic64_generic_aspect-fp64.cpp | 5 ++ SYCL/Basic/buffer/buffer.cpp | 12 ++- SYCL/Basic/buffer/buffer_aspect-fp64.cpp | 5 ++ .../specialization_constants_aspect-fp64.cpp | 5 ++ ...ization_constants_override_aspect-fp64.cpp | 5 ++ SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp | 5 ++ .../api/bin_and_cmp_ops_heavy_aspect-fp64.cpp | 5 ++ .../api/saturation_smoke_aspect-fp64.cpp | 5 ++ .../simd_view_select_2d_fp_aspect-fp64.cpp | 5 ++ .../ESIMD/api/unary_ops_heavy_aspect-fp64.cpp | 16 ++-- SYCL/ESIMD/ext_math_aspect-fp64.cpp | 10 +-- SYCL/ESIMD/regression/Inputs/dgetrf.hpp | 6 +- SYCL/ESIMD/regression/dgetrf_8x8.cpp | 4 +- .../regression/dgetrf_8x8_aspect-fp64.cpp | 7 +- SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp | 11 ++- .../regression/dgetrf_ref_aspect-fp64.cpp | 10 +-- .../SYCL2020/sort_aspect-fp64.cpp | 5 ++ SYCL/InlineAsm/asm_float_add.cpp | 10 ++- SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp | 11 +++ SYCL/InlineAsm/asm_float_imm_arg.cpp | 13 +++- .../asm_float_imm_arg_aspect-fp64.cpp | 11 +++ SYCL/KernelParams/union_kernel_param.cpp | 4 +- .../union_kernel_param_aspect-fp64.cpp | 5 +- SYCL/SpecConstants/2020/handler-api.cpp | 75 ++++++++++--------- .../2020/handler-api_aspect-fp64.cpp | 12 +-- .../2020/kernel-bundle-api_aspect-fp64.cpp | 12 +-- SYCL/SubGroup/barrier_aspect-fp64.cpp | 5 ++ SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp | 5 ++ SYCL/SubGroup/load_store_aspect-fp64.cpp | 5 ++ SYCL/USM/copy_aspect-fp64.cpp | 5 ++ SYCL/USM/fill_aspect-fp64.cpp | 5 ++ 32 files changed, 204 insertions(+), 100 deletions(-) create mode 100644 SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp create mode 100644 SYCL/InlineAsm/asm_float_imm_arg_aspect-fp64.cpp diff --git a/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp b/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp index a46953ecd2..0dce0ba117 100644 --- a/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp +++ 
b/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out diff --git a/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp b/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp index 9e61572d0e..51df4c6745 100644 --- a/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp +++ b/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out diff --git a/SYCL/Basic/buffer/buffer.cpp b/SYCL/Basic/buffer/buffer.cpp index 630a5ce814..0ca947d150 100644 --- a/SYCL/Basic/buffer/buffer.cpp +++ b/SYCL/Basic/buffer/buffer.cpp @@ -583,15 +583,13 @@ int main() { } // Data is copied back for (size_t i = 0; i < size; i++) { - if (bool_vector[i] != true || int_vector[i] != 3) { + bool Passed = true; + Passed &= (bool_vector[i] == true); + Passed &= (int_vector[i] == 3); #ifdef ENABLE_FP64 - if (bool_vector[i] != true || int_vector[i] != 3 || - double_vector[i] != 7.5) { + Passed &= (double_vector[i] == 7.5); #endif - assert(false && "Data was not copied back"); - return 1; - } - } + assert(Passed && "Data was not copied back"); } // Check that data is not copied back after canceling write-back using diff --git a/SYCL/Basic/buffer/buffer_aspect-fp64.cpp b/SYCL/Basic/buffer/buffer_aspect-fp64.cpp index d26a0d4de6..c4232caf5a 100644 --- 
a/SYCL/Basic/buffer/buffer_aspect-fp64.cpp +++ b/SYCL/Basic/buffer/buffer_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64 // RUN: %clangxx %cxx_std_optionc++17 -DENABLE_FP64 %s -o %t1.out %sycl_options // RUN: %HOST_RUN_PLACEHOLDER %t1.out diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp index 2cd47178f9..b7b0397edd 100644 --- a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp +++ b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -D__SYCL_INTERNAL_API -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp index 179b27abec..a08e975ba0 100644 --- a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp +++ b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . +// To be removed once DPC++ supports optional device features +// and the code could be enabled unconditionally without causing failures +// in speculative compilation of the kernels. 
+// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -D__SYCL_INTERNAL_API -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out diff --git a/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp b/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp index 357e52f864..684e568924 100644 --- a/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp +++ b/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %HOST_RUN_PLACEHOLDER %t.out diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp index af9a7ccd53..1a84b2ec1e 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp @@ -6,6 +6,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64, gpu // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'half' type diff --git a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp index 6bf6fb4734..1495c161fc 100644 --- a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp @@ -6,6 +6,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// Enable FP64 part of . 
To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'half' type diff --git a/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp b/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp index 7730fa6787..17a7fe90d1 100644 --- a/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'single_task()' method diff --git a/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp index e277201fc1..99701b1298 100644 --- a/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. 
+// // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'half' type @@ -12,15 +17,4 @@ // RUN: %clangxx -fsycl -DENABLE_FP64 %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// Tests various unary operations applied to simd objects. - -// TODO -// Arithmetic operations behaviour depends on Gen's control regiter's rounding -// mode, which is RTNE by default: -// cr0.5:4 is 00b = Round to Nearest or Even (RTNE) -// For half this leads to divergence between Gen and host (emulated) results -// larger than certain threshold. Might need to tune the cr0 once this feature -// is available in ESIMD. -// - #include "unary_ops_heavy.cpp" diff --git a/SYCL/ESIMD/ext_math_aspect-fp64.cpp b/SYCL/ESIMD/ext_math_aspect-fp64.cpp index f489cd9ff9..1b44dda5b1 100644 --- a/SYCL/ESIMD/ext_math_aspect-fp64.cpp +++ b/SYCL/ESIMD/ext_math_aspect-fp64.cpp @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip // TODO: esimd_emulator fails due to unimplemented 'half' type @@ -12,9 +17,4 @@ // RUN: %clangxx -fsycl -DENABLE_FP64 %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// This test checks extended math operations. 
Combinations of -// - argument type - half, float -// - math function - sin, cos, ..., div_ieee, pow -// - SYCL vs ESIMD APIs - #include "ext_math.cpp" diff --git a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp index d3bd85950f..6cc7fd675b 100644 --- a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp +++ b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp @@ -19,9 +19,9 @@ #include #ifdef ENABLE_FP64 -typedef double fptype; +using fptype = double; #else -typedef float fptype; +using fptype = float; #endif #define ABS(x) ((x) >= 0 ? (x) : -(x)) @@ -53,7 +53,7 @@ using namespace sycl; using namespace std; using namespace sycl::ext::intel::esimd; -ESIMD_PRIVATE ESIMD_REGISTER(256) simd GRF; +ESIMD_PRIVATE ESIMD_REGISTER(192) simd GRF; #define V(x, w, i) (x).template select(i) #define V1(x, i) V(x, 1, i) diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index c680df7df9..eebb690fb6 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -21,9 +21,9 @@ #include #ifdef ENABLE_FP64 -typedef double fptype; +using fptype = double; #else -typedef float fptype; +using fptype = float; #endif #define ABS(x) ((x) >= 0 ? (x) : -(x)) diff --git a/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp index 9b2a382019..43403c5d7b 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp @@ -5,11 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip // RUN: %clangxx -fsycl -DENABLE_FP64 %s -I%S/.. 
-o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out 1 // -// Reduced version of dgetrf.cpp - M = 8, N = 8, single batch. -// #include "dgetrf_8x8.cpp" diff --git a/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp index 2350f9dbff..0d3b0631cc 100644 --- a/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp +++ b/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp @@ -5,15 +5,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// - +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip // RUN: %clangxx -fsycl -DENABLE_FP64 %s -I%S/.. -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out 3 2 1 -// -// This test checks the correctness of ESIMD program for batched LU -// decomposition without pivoting. The program contains multiple branches -// corresponding to LU input sizes; all internal functions are inlined. -// #include "Inputs/dgetrf.hpp" diff --git a/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp index 83d152d6dc..c7040a716c 100644 --- a/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp +++ b/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp @@ -5,15 +5,15 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// - +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: gpu, aspect-fp64 // UNSUPPORTED: cuda || hip // RUN: %clangxx -fsycl -DUSE_REF -DENABLE_FP64 %s -I%S/.. 
-o %t.ref.out // RUN: %GPU_RUN_PLACEHOLDER %t.ref.out 3 2 1 // -// This test checks the correctness of ESIMD program for batched LU -// decomposition without pivoting. The program contains multiple branches -// corresponding to LU input sizes; all internal functions are inlined. -// #include "Inputs/dgetrf.hpp" diff --git a/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp b/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp index a5374b5339..95137d2200 100644 --- a/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp +++ b/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -I . -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/InlineAsm/asm_float_add.cpp b/SYCL/InlineAsm/asm_float_add.cpp index f633743449..5b9bdfade7 100644 --- a/SYCL/InlineAsm/asm_float_add.cpp +++ b/SYCL/InlineAsm/asm_float_add.cpp @@ -9,7 +9,13 @@ #include #include +#ifdef ENABLE_FP64 +using fptype = double; +using dataType = sycl::cl_double; +#else +using fptype = float; using dataType = sycl::cl_float; +#endif template struct KernelFunctor : WithInputBuffers, WithOutputBuffer { @@ -46,8 +52,8 @@ int main() { std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { - inputA[i] = (float)1 / std::pow(2, i); - inputB[i] = (float)2 / std::pow(2, i); + inputA[i] = (fptype)1 / std::pow(2, i); + inputB[i] = (fptype)2 / std::pow(2, i); } KernelFunctor<> f(inputA, inputB); diff --git a/SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp b/SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp new file mode 100644 index 0000000000..04f37e9abc --- /dev/null +++ b/SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp @@ -0,0 +1,11 @@ +// 
Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// +// UNSUPPORTED: cuda || hip_nvidia +// REQUIRES: gpu,linux,aspect-fp64 +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "asm_float_add.cpp" diff --git a/SYCL/InlineAsm/asm_float_imm_arg.cpp b/SYCL/InlineAsm/asm_float_imm_arg.cpp index d2fb47000f..112c7d9907 100644 --- a/SYCL/InlineAsm/asm_float_imm_arg.cpp +++ b/SYCL/InlineAsm/asm_float_imm_arg.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip_nvidia -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,aspect-fp64 // RUN: %clangxx -fsycl %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -9,8 +9,15 @@ #include #include -constexpr float IMM_ARGUMENT = 0.5; +#ifdef ENABLE_FP64 +using fptype = double; +using dataType = sycl::cl_double; +#else +using fptype = float; using dataType = sycl::cl_float; +#endif + +constexpr fptype IMM_ARGUMENT = 0.5; template struct KernelFunctor : WithInputBuffers, WithOutputBuffer { @@ -42,7 +49,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { int main() { std::vector input(DEFAULT_PROBLEM_SIZE); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) - input[i] = (float)1 / std::pow(2, i); + input[i] = (fptype)1 / std::pow(2, i); KernelFunctor<> f(input); if (!launchInlineASMTest(f)) diff --git a/SYCL/InlineAsm/asm_float_imm_arg_aspect-fp64.cpp b/SYCL/InlineAsm/asm_float_imm_arg_aspect-fp64.cpp new file mode 100644 index 0000000000..a837defb76 --- /dev/null +++ b/SYCL/InlineAsm/asm_float_imm_arg_aspect-fp64.cpp @@ -0,0 +1,11 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. 
+// +// UNSUPPORTED: cuda || hip_nvidia +// REQUIRES: gpu,linux,aspect-fp64 +// RUN: %clangxx -fsycl -DENABLE_FP64 %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "asm_float_imm_arg.cpp" diff --git a/SYCL/KernelParams/union_kernel_param.cpp b/SYCL/KernelParams/union_kernel_param.cpp index fee2fbd4c6..aa005d0fdb 100644 --- a/SYCL/KernelParams/union_kernel_param.cpp +++ b/SYCL/KernelParams/union_kernel_param.cpp @@ -10,9 +10,9 @@ #include #ifdef ENABLE_FP64 -typedef double fptype; +using fptype = double; #else -typedef float fptype; +using fptype = float; #endif union TestUnion { diff --git a/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp b/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp index ba6e71b12e..de5afb7dfa 100644 --- a/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp +++ b/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp @@ -1,4 +1,7 @@ -// This test checks kernel execution with union type as kernel parameters. +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. 
// REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out diff --git a/SYCL/SpecConstants/2020/handler-api.cpp b/SYCL/SpecConstants/2020/handler-api.cpp index 2a57d12b5b..2c3f8d8065 100644 --- a/SYCL/SpecConstants/2020/handler-api.cpp +++ b/SYCL/SpecConstants/2020/handler-api.cpp @@ -23,10 +23,10 @@ constexpr sycl::specialization_id int_id; constexpr sycl::specialization_id int_id2(2); -constexpr sycl::specialization_id custom_type_id; #ifdef ENABLE_FP64 constexpr sycl::specialization_id double_id(3.14); #endif +constexpr sycl::specialization_id custom_type_id; class TestDefaultValuesKernel; class EmptyKernel; @@ -73,26 +73,26 @@ int main() { bool test_default_values(sycl::queue q) { sycl::buffer int_buffer(1); sycl::buffer int_buffer2(1); - sycl::buffer custom_type_buffer(1); #ifdef ENABLE_FP64 sycl::buffer double_buffer(1); #endif + sycl::buffer custom_type_buffer(1); q.submit([&](sycl::handler &cgh) { auto int_acc = int_buffer.get_access(cgh); auto int_acc2 = int_buffer2.get_access(cgh); - auto custom_type_acc = - custom_type_buffer.get_access(cgh); -#ifdef ENABLE_FP6 +#ifdef ENABLE_FP64 auto double_acc = double_buffer.get_access(cgh); #endif + auto custom_type_acc = + custom_type_buffer.get_access(cgh); cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); int_acc2[0] = kh.get_specialization_constant(); - custom_type_acc[0] = kh.get_specialization_constant(); #ifdef ENABLE_FP64 double_acc[0] = kh.get_specialization_constant(); #endif + custom_type_acc[0] = kh.get_specialization_constant(); }); }); @@ -106,17 +106,18 @@ bool test_default_values(sycl::queue q) { if (!check_value(2, int_acc2[0], "integer specialization constant")) return false; +#ifdef ENABLE_FP64 + auto double_acc = double_buffer.get_access(); + if (!check_value(3.14, double_acc[0], "double specialization constant")) + return false; +#endif + auto custom_type_acc = custom_type_buffer.get_access(); const 
custom_type custom_type_ref; if (!check_value(custom_type_ref, custom_type_acc[0], "custom_type specialization constant")) return false; -#ifdef ENABLE_FP64 - auto double_acc = double_buffer.get_access(); - if (!check_value(3.14, double_acc[0], "double specialization constant")) - return false; -#endif return true; } @@ -129,39 +130,35 @@ bool test_set_and_get_on_host(sycl::queue q) { "integer specializaiton constant before setting any value")) ++errors; - custom_type custom_type_ref; - if (!check_value( - custom_type_ref, cgh.get_specialization_constant(), - "custom_type specializaiton constant before setting any value")) - ++errors; #ifdef ENABLE_FP64 if (!check_value(3.14, cgh.get_specialization_constant(), "double specializaiton constant before setting any value")) ++errors; #endif + custom_type custom_type_ref; + if (!check_value( + custom_type_ref, cgh.get_specialization_constant(), + "custom_type specializaiton constant before setting any value")) + ++errors; + int new_int_value = 8; - custom_type new_custom_type_value('b', 1.0, 12); #ifdef ENABLE_FP64 double new_double_value = 3.0; #endif + custom_type new_custom_type_value('b', 1.0, 12); cgh.set_specialization_constant(new_int_value); - cgh.set_specialization_constant(new_custom_type_value); #ifdef ENABLE_FP64 cgh.set_specialization_constant(new_double_value); #endif + cgh.set_specialization_constant(new_custom_type_value); if (!check_value( new_int_value, cgh.get_specialization_constant(), "integer specializaiton constant after setting a new value")) ++errors; - if (!check_value( - new_custom_type_value, - cgh.get_specialization_constant(), - "custom_type specializaiton constant after setting a new value")) - ++errors; #ifdef ENABLE_FP64 if (!check_value( new_double_value, cgh.get_specialization_constant(), @@ -169,6 +166,12 @@ bool test_set_and_get_on_host(sycl::queue q) { ++errors; #endif + if (!check_value( + new_custom_type_value, + cgh.get_specialization_constant(), + "custom_type specializaiton 
constant after setting a new value")) + ++errors; + cgh.single_task([=]() {}); }); @@ -178,41 +181,41 @@ bool test_set_and_get_on_host(sycl::queue q) { bool test_set_and_get_on_device(sycl::queue q) { sycl::buffer int_buffer(1); sycl::buffer int_buffer2(1); - sycl::buffer custom_type_buffer(1); #ifdef ENABLE_FP64 sycl::buffer double_buffer(1); #endif + sycl::buffer custom_type_buffer(1); int new_int_value = 8; int new_int_value2 = 0; - custom_type new_custom_type_value('b', 1.0, 12); #ifdef ENABLE_FP64 double new_double_value = 3.0; #endif + custom_type new_custom_type_value('b', 1.0, 12); q.submit([&](sycl::handler &cgh) { auto int_acc = int_buffer.get_access(cgh); auto int_acc2 = int_buffer2.get_access(cgh); - auto custom_type_acc = - custom_type_buffer.get_access(cgh); #ifdef ENABLE_FP64 auto double_acc = double_buffer.get_access(cgh); #endif + auto custom_type_acc = + custom_type_buffer.get_access(cgh); cgh.set_specialization_constant(new_int_value); - cgh.set_specialization_constant(new_int_value2); - cgh.set_specialization_constant(new_custom_type_value); #ifdef ENABLE_FP64 cgh.set_specialization_constant(new_double_value); #endif + cgh.set_specialization_constant(new_int_value2); + cgh.set_specialization_constant(new_custom_type_value); cgh.single_task([=](sycl::kernel_handler kh) { int_acc[0] = kh.get_specialization_constant(); int_acc2[0] = kh.get_specialization_constant(); - custom_type_acc[0] = kh.get_specialization_constant(); #ifdef ENABLE_FP64 double_acc[0] = kh.get_specialization_constant(); #endif + custom_type_acc[0] = kh.get_specialization_constant(); }); }); @@ -226,16 +229,18 @@ bool test_set_and_get_on_device(sycl::queue q) { "integer specialization constant")) return false; - auto custom_type_acc = - custom_type_buffer.get_access(); - if (!check_value(new_custom_type_value, custom_type_acc[0], - "custom_type specialization constant")) - return false; #ifdef ENABLE_FP64 auto double_acc = double_buffer.get_access(); if 
(!check_value(new_double_value, double_acc[0], "double specialization constant")) return false; #endif + + auto custom_type_acc = + custom_type_buffer.get_access(); + if (!check_value(new_custom_type_value, custom_type_acc[0], + "custom_type specialization constant")) + return false; + return true; } diff --git a/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp b/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp index a57e4419a6..3c4b3edc8a 100644 --- a/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp +++ b/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp @@ -1,11 +1,7 @@ -// This test is intended to check basic operations with SYCL 2020 specialization -// constants using sycl::handler and sycl::kernel_handler APIs: -// - test that specialization constants can be accessed in kernel and they -// have their default values if `set_specialization_constants` wasn't called -// - test that specialization constant values can be set and retrieved within -// command group scope -// - test that specialization constant values can be set within command group -// scope and correctly retrieved within a kernel +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. 
// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out \ diff --git a/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp b/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp index 9c15fe2fcb..e6bc742141 100644 --- a/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp +++ b/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp @@ -1,11 +1,7 @@ -// This test is intended to check basic operations with SYCL 2020 specialization -// constants using sycl::kernel_bundle and sycl::kernel_handler APIs: -// - test that specialization constants can be accessed in kernel and they -// have their default values if `set_specialization_constants` wasn't called -// - test that specialization constant values can be set and retrieved through -// kernel_bundle APIs on host -// - test that specialization constant values can be set through kernel_bundle -// API and correctly retrieved within a kernel +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. // // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out \ diff --git a/SYCL/SubGroup/barrier_aspect-fp64.cpp b/SYCL/SubGroup/barrier_aspect-fp64.cpp index a7d201581c..1aed97757b 100644 --- a/SYCL/SubGroup/barrier_aspect-fp64.cpp +++ b/SYCL/SubGroup/barrier_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. 
+// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp index 607c9e0261..3a4894d498 100644 --- a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp +++ b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/SubGroup/load_store_aspect-fp64.cpp b/SYCL/SubGroup/load_store_aspect-fp64.cpp index c4011a2d28..b68d87dbbb 100644 --- a/SYCL/SubGroup/load_store_aspect-fp64.cpp +++ b/SYCL/SubGroup/load_store_aspect-fp64.cpp @@ -1,3 +1,8 @@ +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/USM/copy_aspect-fp64.cpp b/SYCL/USM/copy_aspect-fp64.cpp index 6c682134ad..1de25cf5b8 100644 --- a/SYCL/USM/copy_aspect-fp64.cpp +++ b/SYCL/USM/copy_aspect-fp64.cpp @@ -7,6 +7,11 @@ // //===----------------------------------------------------------------------===// // +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. 
+// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t1.out // RUN: %HOST_RUN_PLACEHOLDER %t1.out diff --git a/SYCL/USM/fill_aspect-fp64.cpp b/SYCL/USM/fill_aspect-fp64.cpp index d67bbeba65..7b73a62d91 100644 --- a/SYCL/USM/fill_aspect-fp64.cpp +++ b/SYCL/USM/fill_aspect-fp64.cpp @@ -7,6 +7,11 @@ // //===----------------------------------------------------------------------===// // +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. +// // REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -DENABLE_FP64 %s -o %t1.out // RUN: %HOST_RUN_PLACEHOLDER %t1.out From 3c70cd9c4d6b45511c56351edca7949be28626ab Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Wed, 24 Aug 2022 10:42:05 +0800 Subject: [PATCH 23/35] fix clang-format issue --- SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp b/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp index 51df4c6745..fb5feaace7 100644 --- a/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp +++ b/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp @@ -1,5 +1,5 @@ -// Enable FP64 part of . To be removed once DPC++ -// supports optional device features and the code could be enabled +// Enable FP64 part of . To be removed once +// DPC++ supports optional device features and the code could be enabled // unconditionally without causing failures in speculative compilation // of the kernels. 
// From 802e9d1a9e800b7a3baa6c3e234ad2e7a83cec9b Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Wed, 24 Aug 2022 11:28:55 +0800 Subject: [PATCH 24/35] fix unmatched brackets --- SYCL/Basic/buffer/buffer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/SYCL/Basic/buffer/buffer.cpp b/SYCL/Basic/buffer/buffer.cpp index 0ca947d150..ab8e0dd694 100644 --- a/SYCL/Basic/buffer/buffer.cpp +++ b/SYCL/Basic/buffer/buffer.cpp @@ -724,3 +724,4 @@ int main() { // TODO tests with mutex property return failed; } +} From e21dfb9d774abcf42bd1320ede647f068a04c59b Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Thu, 25 Aug 2022 16:16:10 +0800 Subject: [PATCH 25/35] use function template to run the test for multiple data types in asm_float_add and re-enable IEEE-conformant single precision check in ESIMD/ext_math.cpp --- SYCL/ESIMD/ext_math.cpp | 4 +- SYCL/InlineAsm/asm_float_add.cpp | 39 +++++++++++--------- SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp | 2 +- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/SYCL/ESIMD/ext_math.cpp b/SYCL/ESIMD/ext_math.cpp index b6a0ac2e10..e53a534f88 100644 --- a/SYCL/ESIMD/ext_math.cpp +++ b/SYCL/ESIMD/ext_math.cpp @@ -476,10 +476,12 @@ int main(void) { Pass &= testSYCL(Q); Pass &= testSYCL(Q); } -#ifdef ENABLE_FP64 Pass &= testESIMDSqrtIEEE(Q); +#ifdef ENABLE_FP64 Pass &= testESIMDSqrtIEEE(Q); +#endif Pass &= testESIMDDivIEEE(Q); +#ifdef ENABLE_FP64 Pass &= testESIMDDivIEEE(Q); #endif std::cout << (Pass ? 
"Test Passed\n" : "Test FAILED\n"); diff --git a/SYCL/InlineAsm/asm_float_add.cpp b/SYCL/InlineAsm/asm_float_add.cpp index 5b9bdfade7..6614cca4a1 100644 --- a/SYCL/InlineAsm/asm_float_add.cpp +++ b/SYCL/InlineAsm/asm_float_add.cpp @@ -9,15 +9,7 @@ #include #include -#ifdef ENABLE_FP64 -using fptype = double; -using dataType = sycl::cl_double; -#else -using fptype = float; -using dataType = sycl::cl_float; -#endif - -template +template struct KernelFunctor : WithInputBuffers, WithOutputBuffer { KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer( @@ -48,26 +40,39 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { } }; -int main() { - std::vector inputA(DEFAULT_PROBLEM_SIZE), +template +bool check() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { - inputA[i] = (fptype)1 / std::pow(2, i); - inputB[i] = (fptype)2 / std::pow(2, i); + inputA[i] = (T1)1 / std::pow(2, i); + inputB[i] = (T1)2 / std::pow(2, i); } - KernelFunctor<> f(inputA, inputB); + KernelFunctor f(inputA, inputB); if (!launchInlineASMTest(f)) - return 0; + return true; auto &C = f.getOutputBufferData(); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { if (C[i] != inputA[i] + inputB[i]) { std::cerr << "At index: " << i << ". "; std::cerr << C[i] << " != " << inputA[i] + inputB[i] << "\n"; - return 1; + return false; } } - return 0; + return true; } + +int main() { + bool Passed = true; + + Passed &= check(); +#ifdef ENABLE_FP64 + Passed &= check(); +#endif + + return Passed ? 
0 : 1; +} + diff --git a/SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp b/SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp index 04f37e9abc..b923f9a5cd 100644 --- a/SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp +++ b/SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp @@ -5,7 +5,7 @@ // // UNSUPPORTED: cuda || hip_nvidia // REQUIRES: gpu,linux,aspect-fp64 -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -DENABLE_FP64 %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out #include "asm_float_add.cpp" From 7d94677d173f02a13eea579c73f0ee902afe6d32 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Fri, 26 Aug 2022 09:46:38 +0800 Subject: [PATCH 26/35] shorten the comment line < 80 characters. --- SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp index 1495c161fc..1edf0ff409 100644 --- a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp @@ -1,5 +1,4 @@ -//==---- saturation_smoke_aspect-fp64.cpp - DPC++ ESIMD on-device test -//-----==// +//==---- saturation_smoke_aspect-fp64.cpp - DPC++ ESIMD on-device test ----==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
From 4f50f1256451994dea40df01b126ecc0d8ebf7cc Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Fri, 26 Aug 2022 10:59:21 +0800 Subject: [PATCH 27/35] fix clang-format issue --- SYCL/InlineAsm/asm_float_add.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/SYCL/InlineAsm/asm_float_add.cpp b/SYCL/InlineAsm/asm_float_add.cpp index 6614cca4a1..f8aa0d6679 100644 --- a/SYCL/InlineAsm/asm_float_add.cpp +++ b/SYCL/InlineAsm/asm_float_add.cpp @@ -40,10 +40,8 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { } }; -template -bool check() { - std::vector inputA(DEFAULT_PROBLEM_SIZE), - inputB(DEFAULT_PROBLEM_SIZE); +template bool check() { + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { inputA[i] = (T1)1 / std::pow(2, i); inputB[i] = (T1)2 / std::pow(2, i); @@ -75,4 +73,3 @@ int main() { return Passed ? 0 : 1; } - From f7db8ac9b9a5a9f1d2732f44fee5c439bc55a38f Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Mon, 29 Aug 2022 16:30:27 +0800 Subject: [PATCH 28/35] fix all dangling formatting, use function template to check float/double. 
--- SYCL/Basic/buffer/buffer.cpp | 242 +++++++++--------- .../api/bin_and_cmp_ops_heavy_aspect-fp64.cpp | 3 +- .../functions_select_2d_core_aspect-fp64.cpp | 3 +- SYCL/ESIMD/regression/Inputs/dgetrf.hpp | 200 ++++++++------- SYCL/ESIMD/regression/dgetrf_8x8.cpp | 114 +++++---- SYCL/InlineAsm/asm_float_imm_arg.cpp | 53 ++-- SYCL/KernelParams/union_kernel_param.cpp | 36 +-- SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp | 3 +- SYCL/USM/copy_aspect-fp64.cpp | 3 +- SYCL/USM/fill_aspect-fp64.cpp | 3 +- 10 files changed, 343 insertions(+), 317 deletions(-) diff --git a/SYCL/Basic/buffer/buffer.cpp b/SYCL/Basic/buffer/buffer.cpp index ab8e0dd694..913fe015c1 100644 --- a/SYCL/Basic/buffer/buffer.cpp +++ b/SYCL/Basic/buffer/buffer.cpp @@ -14,10 +14,10 @@ // //===----------------------------------------------------------------------===// +#include + #include -#include #include -#include using namespace sycl; @@ -25,7 +25,6 @@ int main() { int data = 5; bool failed = false; buffer buf(&data, range<1>(1)); - { int data1[10] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; { @@ -509,6 +508,7 @@ int main() { size_t size = 32; const size_t dims = 1; sycl::range r(size); + std::shared_ptr bool_shrd(new bool[size], [](bool *data) { delete[] data; }); std::shared_ptr int_shrd(new int[size], @@ -551,6 +551,7 @@ int main() { std::fill(double_shrd.get(), (double_shrd.get() + size), double()); #endif m.unlock(); + buf_bool_shrd.set_final_data(bool_vector.begin()); buf_int_shrd.set_final_data(int_vector.begin()); #ifdef ENABLE_FP64 @@ -591,137 +592,136 @@ int main() { #endif assert(Passed && "Data was not copied back"); } + } - // Check that data is not copied back after canceling write-back using - // set_write_back - { - std::vector data1(10, -1); - { - buffer b(range<1>(10)); - b.set_final_data(data1.data()); - b.set_write_back(false); - queue myQueue; - myQueue.submit([&](handler &cgh) { - auto B = b.get_access(cgh); - cgh.parallel_for(range<1>{10}, - [=](id<1> index) { B[index] = 0; }); - 
}); - } - // Data is not copied back because write-back is canceled - for (int i = 0; i < 10; i++) - if (data1[i] != -1) { - assert(false); - failed = true; - } - } - + // Check that data is not copied back after canceling write-back using + // set_write_back + { + std::vector data1(10, -1); { - std::vector data1(10, -1); - std::vector data2(10, -2); - { - buffer a(data1.data(), range<1>(10)); - buffer b(data2.data(), range<1>(10)); - queue myQueue; - myQueue.submit([&](handler &cgh) { - auto A = a.get_access(cgh); - auto B = b.get_access(cgh); - cgh.parallel_for( - range<1>{10}, [=](id<1> index) { A[index] = 0; }); - }); - } // Data is copied back - for (int i = 0; i < 10; i++) - assert(data2[i] == -2); - for (int i = 0; i < 10; i++) - assert(data1[i] == 0); + buffer b(range<1>(10)); + b.set_final_data(data1.data()); + b.set_write_back(false); + queue myQueue; + myQueue.submit([&](handler &cgh) { + auto B = b.get_access(cgh); + cgh.parallel_for(range<1>{10}, + [=](id<1> index) { B[index] = 0; }); + }); } + // Data is not copied back because write-back is canceled + for (int i = 0; i < 10; i++) + if (data1[i] != -1) { + assert(false); + failed = true; + } + } + { + std::vector data1(10, -1); + std::vector data2(10, -2); { - std::vector data1(10, -1); - std::vector data2(10, -2); - { - buffer a(data1.data(), range<1>(10)); - buffer b(data2); - accessor - A(a); - accessor - B(b); - queue myQueue; - myQueue.submit([&](handler &cgh) { - cgh.require(A); - cgh.require(B); - cgh.parallel_for( - range<1>{10}, [=](id<1> index) { A[index] = 0; }); - }); - } // Data is copied back - for (int i = 0; i < 10; i++) - assert(data2[i] == -2); - for (int i = 0; i < 10; i++) - assert(data1[i] == 0); - } + buffer a(data1.data(), range<1>(10)); + buffer b(data2.data(), range<1>(10)); + queue myQueue; + myQueue.submit([&](handler &cgh) { + auto A = a.get_access(cgh); + auto B = b.get_access(cgh); + cgh.parallel_for( + range<1>{10}, [=](id<1> index) { A[index] = 0; }); + }); + } // Data 
is copied back + for (int i = 0; i < 10; i++) + assert(data2[i] == -2); + for (int i = 0; i < 10; i++) + assert(data1[i] == 0); + } + { + std::vector data1(10, -1); + std::vector data2(10, -2); { - int data[10]; - void *voidPtr = (void *)data; - buffer b(range<1>(10)); - b.set_final_data(voidPtr); - } + buffer a(data1.data(), range<1>(10)); + buffer b(data2); + accessor + A(a); + accessor + B(b); + queue myQueue; + myQueue.submit([&](handler &cgh) { + cgh.require(A); + cgh.require(B); + cgh.parallel_for( + range<1>{10}, [=](id<1> index) { A[index] = 0; }); + }); + } // Data is copied back + for (int i = 0; i < 10; i++) + assert(data2[i] == -2); + for (int i = 0; i < 10; i++) + assert(data1[i] == 0); + } - { - std::allocator buf_alloc; - std::shared_ptr data(new float8[8], - [](float8 *p) { delete[] p; }); - sycl::buffer> b(data, sycl::range<1>(8), - buf_alloc); - } + { + int data[10]; + void *voidPtr = (void *)data; + buffer b(range<1>(10)); + b.set_final_data(voidPtr); + } - { - constexpr int Size = 6; - sycl::buffer Buf_1(Size); - sycl::buffer Buf_2(Size / 2); - - { - auto AccA = Buf_1.get_access(Size / 2); - auto AccB = Buf_2.get_access(Size / 2); - assert(AccA.get_size() == AccB.get_size()); - assert(AccA.get_range() == AccB.get_range()); - assert(AccA.get_count() == AccB.get_count()); - } + { + std::allocator buf_alloc; + std::shared_ptr data(new float8[8], [](float8 *p) { delete[] p; }); + sycl::buffer> b(data, sycl::range<1>(8), + buf_alloc); + } - auto AH0 = accessor(Buf_1); - auto BH0 = accessor(Buf_2); - assert(AH0.get_size() == sizeof(char)); - assert(BH0.get_size() == sizeof(char)); - assert(AH0.get_count() == 1); - assert(BH0.get_count() == 1); - - queue Queue; - Queue.submit([&](handler &CGH) { - auto AK0 = - accessor( - Buf_1, CGH); - auto BK0 = - accessor( - Buf_2, CGH); - assert(AK0.get_size() == sizeof(char)); - assert(BK0.get_size() == sizeof(char)); - assert(AK0.get_count() == 1); - assert(BK0.get_count() == 1); - CGH.single_task([]() {}); - }); 
- } + { + constexpr int Size = 6; + sycl::buffer Buf_1(Size); + sycl::buffer Buf_2(Size / 2); { - int data = 5; - buffer Buffer(&data, range<1>(1)); - assert(Buffer.size() == 1); - assert(Buffer.byte_size() == 1 * sizeof(int)); + auto AccA = Buf_1.get_access(Size / 2); + auto AccB = Buf_2.get_access(Size / 2); + assert(AccA.get_size() == AccB.get_size()); + assert(AccA.get_range() == AccB.get_range()); + assert(AccA.get_count() == AccB.get_count()); } - // TODO tests with mutex property - return failed; + auto AH0 = accessor(Buf_1); + auto BH0 = accessor(Buf_2); + assert(AH0.get_size() == sizeof(char)); + assert(BH0.get_size() == sizeof(char)); + assert(AH0.get_count() == 1); + assert(BH0.get_count() == 1); + + queue Queue; + Queue.submit([&](handler &CGH) { + auto AK0 = + accessor( + Buf_1, CGH); + auto BK0 = + accessor( + Buf_2, CGH); + assert(AK0.get_size() == sizeof(char)); + assert(BK0.get_size() == sizeof(char)); + assert(AK0.get_count() == 1); + assert(BK0.get_count() == 1); + CGH.single_task([]() {}); + }); + } + + { + int data = 5; + buffer Buffer(&data, range<1>(1)); + assert(Buffer.size() == 1); + assert(Buffer.byte_size() == 1 * sizeof(int)); } + + // TODO tests with mutex property + return failed; } diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp index 1a84b2ec1e..d9d2b256bb 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp @@ -1,5 +1,4 @@ -//==--------------- bin_un_cmp_ops_heavy_aspect-fp64.cpp - DPC++ ESIMD -// on-device test -==// +//==-- bin_un_cmp_ops_heavy_aspect-fp64.cpp - DPC++ ESIMD on-device test --==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp b/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp index 2ac0db0279..f1a57bbe72 100644 --- a/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp +++ b/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp @@ -1,5 +1,4 @@ -//==-- functions_select_2d_core_aspect-fp64.cpp - DPC++ ESIMD on-device test -// ----------------------------------------------------------------==// +//==- functions_select_2d_core_aspect-fp64.cpp - DPC++ ESIMD on-device test ==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp index 6cc7fd675b..beefa32580 100644 --- a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp +++ b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp @@ -18,16 +18,9 @@ #include #include -#ifdef ENABLE_FP64 -using fptype = double; -#else -using fptype = float; -#endif - #define ABS(x) ((x) >= 0 ? (x) : -(x)) #define MIN(x, y) ((x) <= (y) ? (x) : (y)) #define MAX(x, y) ((x) >= (y) ? (x) : (y)) -#define FP_RAND ((fptype)rand() / (fptype)RAND_MAX) #define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) #define OUT(text, ...) 
OUTN(text "\n", ##__VA_ARGS__) @@ -53,15 +46,16 @@ using namespace sycl; using namespace std; using namespace sycl::ext::intel::esimd; -ESIMD_PRIVATE ESIMD_REGISTER(192) simd GRF; +template ESIMD_PRIVATE ESIMD_REGISTER(192) simd GRF; #define V(x, w, i) (x).template select(i) #define V1(x, i) V(x, 1, i) #define V8(x, i) V(x, 8, i) #define BCAST8(x, i) (x).template replicate_w<8, 1>(i) -template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { - auto a = V(GRF, M * N, 0); +template +ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { + auto a = V(GRF, M * N, 0); if (K % 8) { simd_mask<8> mask = 1; @@ -75,7 +69,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { V1(mask, k) = 0; if (ak0[k] != 0.0) { // scal - fptype temp = 1.0 / ak0[k]; + T temp = 1.0 / ak0[k]; ak0.merge(ak0 * temp, mask); for (int i = 8 + K & (-8); i < M; i += 8) { V8(ak, i) *= temp; @@ -104,7 +98,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { V1(mask, k) = 0; if (ak0[k] != 0.0) { // scal - fptype temp = 1.0 / ak0[k]; + T temp = 1.0 / ak0[k]; ak0.merge(ak0 * temp, mask); for (int i = 16 + (K & (-8)) + kk; i < M; i += 8) { V8(ak, i) *= temp; @@ -135,7 +129,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { V1(mask, k) = 0; if (ak0[k] != 0.0) { // scal - fptype temp = 1.0 / ak0[k]; + T temp = 1.0 / ak0[k]; ak0.merge(ak0 * temp, mask); for (int i = 8 + K + kk; i < M; i += 8) { V8(ak, i) *= temp; @@ -165,16 +159,16 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { // into GRF K - an update rank P0=A[0:M,0:K] = column(F=A[0:K,0:K], // L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], // T=A[K:M,K:K+N]) - panel to be updated -template -ESIMD_INLINE void dgetrfnp_left_step(fptype *a, int64_t lda, int64_t *info) { - auto p1 = V(GRF, M * N, 0); - fptype *a1; +template +ESIMD_INLINE void dgetrfnp_left_step(T *a, int64_t lda, int64_t *info) { + auto p1 = V(GRF, M * N, 0); + T *a1; int i, j, k; // load P1 for (j = 0, a1 = a + K * 
lda; j < N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd data; + simd data; data.copy_from(a1 + i); V8(p1, j * M + i) = data; } @@ -184,10 +178,10 @@ ESIMD_INLINE void dgetrfnp_left_step(fptype *a, int64_t lda, int64_t *info) { // (gemm) update T=T-L*U for (int kk = 0; kk < K; kk += 8) { simd_mask<8> mask = 1; - simd a0k, aik; + simd a0k, aik; for (k = 0; k < 8 && kk + k < K; k++) { V1(mask, k) = 0; - simd data; + simd data; data.copy_from(a + kk + (kk + k) * lda); V8(a0k, 0) = data; for (j = 0; j < N; j++) { @@ -199,7 +193,7 @@ ESIMD_INLINE void dgetrfnp_left_step(fptype *a, int64_t lda, int64_t *info) { } for (k = 0; k < 8 && kk + k < K; k++) { for (i = kk + 8; i < M; i += 8) { - simd data; + simd data; data.copy_from(a + i + (kk + k) * lda); V8(aik, 0) = data; for (j = 0; j < N; j++) { @@ -213,24 +207,25 @@ ESIMD_INLINE void dgetrfnp_left_step(fptype *a, int64_t lda, int64_t *info) { } } // (getrf) factorize T=P*L*U - dgetrfnp_panel(info); + dgetrfnp_panel(info); // store P1 for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd vals = V8(p1, j * M + i); + simd vals = V8(p1, j * M + i); vals.copy_to(a1 + i); } } #endif // !USE_REF -ESIMD_INLINE void dgetrfnp_esimd(int64_t m, int64_t n, fptype *a, int64_t lda, +template +ESIMD_INLINE void dgetrfnp_esimd(int64_t m, int64_t n, T *a, int64_t lda, int64_t *ipiv, int64_t *info) { *info = 0; #if defined(USE_REF) int i, j, k; for (k = 0; k < MIN(m, n); k++) { - fptype temp = a[k + k * lda]; + T temp = a[k + k * lda]; if (!(*info) && temp == 0.0) *info = k + 1; // scal @@ -249,65 +244,65 @@ ESIMD_INLINE void dgetrfnp_esimd(int64_t m, int64_t n, fptype *a, int64_t lda, #else // defined(USE_REF) if (m == 8) { if (n == 8) - dgetrfnp_left_step<8, 8, 0>(a, lda, info); + dgetrfnp_left_step(a, lda, info); } else if (m == 16) { if (n == 8) - dgetrfnp_left_step<16, 8, 0>(a, lda, info); + dgetrfnp_left_step(a, lda, info); else if (n == 16) - dgetrfnp_left_step<16, 16, 0>(a, lda, info); + 
dgetrfnp_left_step(a, lda, info); } else if (m == 32) { if (n == 8) - dgetrfnp_left_step<32, 8, 0>(a, lda, info); + dgetrfnp_left_step(a, lda, info); else if (n == 12) - dgetrfnp_left_step<32, 12, 0>(a, lda, info); + dgetrfnp_left_step(a, lda, info); else if (n == 16) { - dgetrfnp_left_step<32, 8, 0>(a, lda, info); - dgetrfnp_left_step<32, 8, 8>(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); } else if (n == 24) { - dgetrfnp_left_step<32, 8, 0>(a, lda, info); - dgetrfnp_left_step<32, 8, 8>(a, lda, info); - dgetrfnp_left_step<32, 8, 16>(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); } else if (n == 32) { - dgetrfnp_left_step<32, 8, 0>(a, lda, info); - dgetrfnp_left_step<32, 8, 8>(a, lda, info); - dgetrfnp_left_step<32, 8, 16>(a, lda, info); - dgetrfnp_left_step<32, 8, 24>(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); } } else if (m == 64) { if (n == 6) - dgetrfnp_left_step<64, 6, 0>(a, lda, info); + dgetrfnp_left_step(a, lda, info); else if (n == 16) { - dgetrfnp_left_step<64, 6, 0>(a, lda, info); - dgetrfnp_left_step<64, 6, 6>(a, lda, info); - dgetrfnp_left_step<64, 4, 12>(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); } else if (n == 32) { - dgetrfnp_left_step<64, 6, 0>(a, lda, info); - dgetrfnp_left_step<64, 6, 6>(a, lda, info); - dgetrfnp_left_step<64, 6, 12>(a, lda, info); - dgetrfnp_left_step<64, 6, 18>(a, lda, info); - dgetrfnp_left_step<64, 6, 24>(a, lda, info); - dgetrfnp_left_step<64, 2, 30>(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); } else if (n == 64) { - 
dgetrfnp_left_step<64, 6, 0>(a, lda, info); - dgetrfnp_left_step<64, 6, 6>(a, lda, info); - dgetrfnp_left_step<64, 6, 12>(a, lda, info); - dgetrfnp_left_step<64, 6, 18>(a, lda, info); - dgetrfnp_left_step<64, 6, 24>(a, lda, info); - dgetrfnp_left_step<64, 6, 30>(a, lda, info); - dgetrfnp_left_step<64, 6, 36>(a, lda, info); - dgetrfnp_left_step<64, 6, 42>(a, lda, info); - dgetrfnp_left_step<64, 6, 48>(a, lda, info); - dgetrfnp_left_step<64, 6, 54>(a, lda, info); - dgetrfnp_left_step<64, 4, 60>(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); + dgetrfnp_left_step(a, lda, info); } } #endif // defined(USE_REF) } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info) { +template void dgetrfnp_batch_strided_c(int64_t m, int64_t n, + T *a, int64_t lda, int64_t stride_a, + int64_t *ipiv, int64_t stride_ipiv, + int64_t batch, int64_t *info) { queue queue((gpu_selector())); auto device = queue.get_device(); auto context = queue.get_context(); @@ -315,11 +310,11 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, CHECK(status = device.is_gpu(), !status); - fptype *a_gpu; + T *a_gpu; int64_t *ipiv_gpu; int64_t *info_gpu; - CHECK(a_gpu = static_cast( - malloc_shared(stride_a * batch * sizeof(fptype), device, context)), + CHECK(a_gpu = static_cast( + malloc_shared(stride_a * batch * sizeof(T), device, context)), !a_gpu); CHECK(ipiv_gpu = static_cast(malloc_shared( stride_ipiv * batch * sizeof(int64_t), device, context)), @@ -328,7 +323,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, 
malloc_shared(batch * sizeof(int64_t), device, context)), !info_gpu); - memcpy(a_gpu, a, stride_a * batch * sizeof(fptype)); + memcpy(a_gpu, a, stride_a * batch * sizeof(T)); sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, sycl::range<1>{1}); @@ -350,7 +345,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, return; } - memcpy(a, a_gpu, stride_a * batch * sizeof(fptype)); + memcpy(a, a_gpu, stride_a * batch * sizeof(T)); memcpy(ipiv, ipiv_gpu, stride_ipiv * batch * sizeof(int64_t)); memcpy(info, info_gpu, batch * sizeof(int64_t)); @@ -359,14 +354,16 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, free(info_gpu, context); } -static void fp_init(int64_t m, int64_t n, fptype *a, int64_t lda) { +template +static void fp_init(int64_t m, int64_t n, T *a, int64_t lda) { int64_t i, j; for (j = 0; j < n; j++) for (i = 0; i < m; i++) - a[i + j * lda] = 2.0 * FP_RAND - 1.0; + a[i + j * lda] = 2.0 * ((T)rand() / (T)RAND_MAX) - 1.0; } -static void fp_copy(int64_t m, int64_t n, fptype *a, int64_t lda, fptype *b, +template +static void fp_copy(int64_t m, int64_t n, T *a, int64_t lda, T *b, int64_t ldb) { int64_t i, j; for (j = 0; j < n; j++) @@ -374,8 +371,9 @@ static void fp_copy(int64_t m, int64_t n, fptype *a, int64_t lda, fptype *b, b[i + j * ldb] = a[i + j * lda]; } -static fptype fp_norm1(int64_t m, int64_t n, fptype *a, int64_t lda) { - fptype sum, value = 0.0; +template +static T fp_norm1(int64_t m, int64_t n, T *a, int64_t lda) { + T sum, value = 0.0; int64_t i, j; for (j = 0; j < n; j++) { sum = 0.0; @@ -387,32 +385,36 @@ static fptype fp_norm1(int64_t m, int64_t n, fptype *a, int64_t lda) { return value; } -static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, fptype *a_in, - fptype *a, int64_t lda, +template +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, T *a_in, + T *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info) { - 
fptype thresh = 30.0; + T thresh = 30.0; int fail = 0; int64_t i, j, k, l; char label[1024]; -#ifdef ENABLE_FP64 - unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; -#else - unsigned char prec_b[] = {0, 0, 0xb0, 0x3c}; -#endif - fptype res = 0.0, nrm = 0.0, ulp = *(fptype *)prec_b; - fptype *w = (fptype *)malloc(sizeof(fptype) * MAX(m * n, 1)); + unsigned char prec_b1[] = {0, 0, 0xb0, 0x3c}; + unsigned char prec_b2[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; + T res = 0.0, nrm = 0.0, ulp; + if (std::is_same::value) { + ulp = *(T *)prec_b2; + } else { + ulp = *(T *)prec_b1; + } + + T *w = (T *)malloc(sizeof(T) * MAX(m * n, 1)); sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); for (k = 0; k < batch; k++) { /* info == 0 */ - CHECK_AND_REPORT("info == 0", label, info[k] != 0, (fptype)info[k], fail); + CHECK_AND_REPORT("info == 0", label, info[k] != 0, (T)info[k], fail); if (m > 0 && n > 0) { /* | L U - A | / ( |A| n ulp ) */ - memset(w, 0, sizeof(fptype) * m * n); + memset(w, 0, sizeof(T) * m * n); if (m < n) { for (j = 0; j < n; j++) for (i = 0; i <= j; i++) @@ -439,7 +441,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, fptype *a_in, w[i + j * m] -= a_in[k * stride_a + i + j * lda]; res = fp_norm1(m, n, w, m); nrm = fp_norm1(m, n, &a_in[k * stride_a], lda); - nrm *= (fptype)n * ulp; + nrm *= (T)n * ulp; res /= nrm > 0.0 ? 
nrm : ulp; CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, FAILED(res, thresh), res, fail); @@ -450,12 +452,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, fptype *a_in, return fail; } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info); - -int main(int argc, char *argv[]) { +template int check(int argc, char *argv[]) { int exit_status = 0; int64_t m = 64, n = 64, lda = 64; int64_t stride_a = lda * n, stride_ipiv = n; @@ -468,10 +465,10 @@ int main(int argc, char *argv[]) { int64_t a_count = MAX(stride_a * batch, 1); int64_t ipiv_count = MAX(stride_ipiv * batch, 1); int64_t info_count = MAX(batch, 1); - fptype *a = NULL, *a_copy = NULL; + T *a = NULL, *a_copy = NULL; int64_t *ipiv = NULL, *info = NULL; - CHECK(a = (fptype *)malloc(sizeof(fptype) * a_count), !a); - CHECK(a_copy = (fptype *)malloc(sizeof(fptype) * a_count), !a_copy); + CHECK(a = (T *)malloc(sizeof(T) * a_count), !a); + CHECK(a_copy = (T *)malloc(sizeof(T) * a_count), !a_copy); CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) * ipiv_count), !ipiv); CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); @@ -496,3 +493,14 @@ int main(int argc, char *argv[]) { } return exit_status; } + +int main(int argc, char *argv[]) { + int Passed = 0; + + Passed += check(argc, argv); +#ifdef ENABLE_FP64 + Passed += check(argc, argv); +#endif + + return Passed; +} diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index eebb690fb6..c7ac320868 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -20,16 +20,9 @@ #include #include -#ifdef ENABLE_FP64 -using fptype = double; -#else -using fptype = float; -#endif - #define ABS(x) ((x) >= 0 ? (x) : -(x)) #define MIN(x, y) ((x) <= (y) ? (x) : (y)) #define MAX(x, y) ((x) >= (y) ? 
(x) : (y)) -#define FP_RAND ((fptype)rand() / (fptype)RAND_MAX) #define OUTN(text, ...) fprintf(stderr, text, ##__VA_ARGS__) #define OUT(text, ...) OUTN(text "\n", ##__VA_ARGS__) @@ -55,15 +48,15 @@ using namespace sycl; using namespace std; using namespace sycl::ext::intel::esimd; -ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; +template ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; #define V(x, w, i) (x).template select(i) #define V1(x, i) V(x, 1, i) #define V8(x, i) V(x, 8, i) #define BCAST8(x, i) (x).template replicate_w<8, 1>(i) -template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { - auto a = V(GRF, M * N, 0); +template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { + auto a = V(GRF, M * N, 0); for (int kk = 0; kk < N; kk += 8) { simd_mask<8> mask = 1; for (int k = 0; k < 8 && kk + k < N; k++) { @@ -73,7 +66,7 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { V1(mask, k) = 0; if (ak0[k] != 0.0) { // scal - fptype temp = 1.0 / ak0[k]; + T temp = 1.0 / ak0[k]; ak0.merge(ak0 * temp, mask); for (int i = 8 + K + kk; i < M; i += 8) { V8(ak, i) *= temp; @@ -101,16 +94,16 @@ template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { // into GRF K - an update rank P0=A[0:M,0:K] = column(F=A[0:K,0:K], // L=A[K:M,0:K]) - panel to update with P1=A[0:M,K:K+N] = column(U=A[0:K,K:K+N], // T=A[K:M,K:K+N]) - panel to be updated -template -ESIMD_INLINE void dgetrfnp_left_step(fptype *a, int64_t lda, int64_t *info) { - auto p1 = V(GRF, M * N, 0); - fptype *a1; +template +ESIMD_INLINE void dgetrfnp_left_step(T *a, int64_t lda, int64_t *info) { + auto p1 = V(GRF, M * N, 0); + T *a1; int i, j, k; // load P1 for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd data; + simd data; data.copy_from(a1 + i); V8(p1, j * M + i) = data; } @@ -120,21 +113,22 @@ ESIMD_INLINE void dgetrfnp_left_step(fptype *a, int64_t lda, int64_t *info) { // store P1 for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) for (i = 0; i < M; i += 8) { - simd vals 
= V8(p1, j * M + i); + simd vals = V8(p1, j * M + i); vals.copy_to(a1 + i); } } -ESIMD_INLINE void dgetrfnp_esimd_8x8(fptype *a, int64_t lda, int64_t *ipiv, +template +ESIMD_INLINE void dgetrfnp_esimd_8x8(T *a, int64_t lda, int64_t *ipiv, int64_t *info) { *info = 0; - dgetrfnp_left_step<8, 8, 0>(a, lda, info); + dgetrfnp_left_step(a, lda, info); } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info) { +template void dgetrfnp_batch_strided_c(int64_t m, int64_t n, + T *a, int64_t lda, int64_t stride_a, + int64_t *ipiv, int64_t stride_ipiv, + int64_t batch, int64_t *info) { queue queue((gpu_selector())); auto device = queue.get_device(); auto context = queue.get_context(); @@ -142,11 +136,11 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, CHECK(status = device.is_gpu(), !status); - fptype *a_gpu; + T *a_gpu; int64_t *ipiv_gpu; int64_t *info_gpu; - CHECK(a_gpu = static_cast( - malloc_shared(stride_a * batch * sizeof(fptype), device, context)), + CHECK(a_gpu = static_cast( + malloc_shared(stride_a * batch * sizeof(T), device, context)), !a_gpu); CHECK(ipiv_gpu = static_cast(malloc_shared( stride_ipiv * batch * sizeof(int64_t), device, context)), @@ -155,7 +149,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, malloc_shared(batch * sizeof(int64_t), device, context)), !info_gpu); - memcpy(a_gpu, a, stride_a * batch * sizeof(fptype)); + memcpy(a_gpu, a, stride_a * batch * sizeof(T)); sycl::nd_range<1> range(sycl::range<1>{static_cast(batch)}, sycl::range<1>{1}); @@ -177,7 +171,7 @@ void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, return; } - memcpy(a, a_gpu, stride_a * batch * sizeof(fptype)); + memcpy(a, a_gpu, stride_a * batch * sizeof(T)); memcpy(ipiv, ipiv_gpu, stride_ipiv * batch * sizeof(int64_t)); memcpy(info, info_gpu, batch * sizeof(int64_t)); @@ -186,14 +180,16 @@ 
void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, free(info_gpu, context); } -static void fp_init(int64_t m, int64_t n, fptype *a, int64_t lda) { +template +static void fp_init(int64_t m, int64_t n, T *a, int64_t lda) { int64_t i, j; for (j = 0; j < n; j++) for (i = 0; i < m; i++) - a[i + j * lda] = 2.0 * FP_RAND - 1.0; + a[i + j * lda] = 2.0 * ((T)rand() / (T)RAND_MAX) - 1.0; } -static void fp_copy(int64_t m, int64_t n, fptype *a, int64_t lda, fptype *b, +template +static void fp_copy(int64_t m, int64_t n, T *a, int64_t lda, T *b, int64_t ldb) { int64_t i, j; for (j = 0; j < n; j++) @@ -201,8 +197,9 @@ static void fp_copy(int64_t m, int64_t n, fptype *a, int64_t lda, fptype *b, b[i + j * ldb] = a[i + j * lda]; } -static fptype fp_norm1(int64_t m, int64_t n, fptype *a, int64_t lda) { - fptype sum, value = 0.0; +template +static T fp_norm1(int64_t m, int64_t n, T *a, int64_t lda) { + T sum, value = 0.0; int64_t i, j; for (j = 0; j < n; j++) { sum = 0.0; @@ -214,32 +211,36 @@ static fptype fp_norm1(int64_t m, int64_t n, fptype *a, int64_t lda) { return value; } -static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, fptype *a_in, - fptype *a, int64_t lda, +template +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, T *a_in, + T *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info) { - fptype thresh = 30.0; + T thresh = 30.0; int fail = 0; int64_t i, j, k, l; char label[1024]; -#ifdef ENABLE_FP64 - unsigned char prec_b[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; -#else - unsigned char prec_b[] = {0, 0, 0xb0, 0x3c}; -#endif - fptype res = 0.0, nrm = 0.0, ulp = *(fptype *)prec_b; - fptype *w = (fptype *)malloc(sizeof(fptype) * MAX(m * n, 1)); + unsigned char prec_b1[] = {0, 0, 0xb0, 0x3c}; + unsigned char prec_b2[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; + T res = 0.0, nrm = 0.0, ulp + if (std::is_same::value) { + ulp = *(T *)prec_b2; + } else { + ulp = *(T *)prec_b1; + }; + + T *w = (T 
*)malloc(sizeof(T) * MAX(m * n, 1)); sprintf(label, "m=%ld, n=%ld, lda=%ld, batch=%ld", m, n, lda, batch); for (k = 0; k < batch; k++) { /* info == 0 */ - CHECK_AND_REPORT("info == 0", label, info[k] != 0, (fptype)info[k], fail); + CHECK_AND_REPORT("info == 0", label, info[k] != 0, (T)info[k], fail); if (m > 0 && n > 0) { /* | L U - A | / ( |A| n ulp ) */ - memset(w, 0, sizeof(fptype) * m * n); + memset(w, 0, sizeof(T) * m * n); if (m < n) { for (j = 0; j < n; j++) for (i = 0; i <= j; i++) @@ -266,7 +267,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, fptype *a_in, w[i + j * m] -= a_in[k * stride_a + i + j * lda]; res = fp_norm1(m, n, w, m); nrm = fp_norm1(m, n, &a_in[k * stride_a], lda); - nrm *= (fptype)n * ulp; + nrm *= (T)n * ulp; res /= nrm > 0.0 ? nrm : ulp; CHECK_AND_REPORT("| L U - A | / ( |A| n ulp )", label, FAILED(res, thresh), res, fail); @@ -277,12 +278,12 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, fptype *a_in, return fail; } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, fptype *a, int64_t lda, +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, T *a, int64_t lda, int64_t stride_a, int64_t *ipiv, int64_t stride_ipiv, int64_t batch, int64_t *info); -int main(int argc, char *argv[]) { +template int check(int argc, char *argv[]) { int exit_status = 0; constexpr int64_t m = 8, n = 8, lda = 8; int64_t stride_a = lda * n, stride_ipiv = n; @@ -295,10 +296,10 @@ int main(int argc, char *argv[]) { int64_t a_count = MAX(stride_a * batch, 1); int64_t ipiv_count = MAX(stride_ipiv * batch, 1); int64_t info_count = MAX(batch, 1); - fptype *a = NULL, *a_copy = NULL; + T *a = NULL, *a_copy = NULL; int64_t *ipiv = NULL, *info = NULL; - CHECK(a = (fptype *)malloc(sizeof(fptype) * a_count), !a); - CHECK(a_copy = (fptype *)malloc(sizeof(fptype) * a_count), !a_copy); + CHECK(a = (T *)malloc(sizeof(T) * a_count), !a); + CHECK(a_copy = (T *)malloc(sizeof(T) * a_count), !a_copy); CHECK(ipiv = (int64_t *)malloc(sizeof(int64_t) 
* ipiv_count), !ipiv); CHECK(info = (int64_t *)malloc(sizeof(int64_t) * info_count), !info); @@ -323,3 +324,14 @@ int main(int argc, char *argv[]) { } return exit_status; } + +int main(int argc, char *argv[]) { + int Passed = 0; + + Passed += check(argc, argv); +#ifdef ENABLE_FP64 + Passed += check(argc, argv); +#endif + + return Passed; +} diff --git a/SYCL/InlineAsm/asm_float_imm_arg.cpp b/SYCL/InlineAsm/asm_float_imm_arg.cpp index 112c7d9907..8c4b05fc32 100644 --- a/SYCL/InlineAsm/asm_float_imm_arg.cpp +++ b/SYCL/InlineAsm/asm_float_imm_arg.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip_nvidia -// REQUIRES: gpu,linux,aspect-fp64 +// REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out @@ -9,20 +9,12 @@ #include #include -#ifdef ENABLE_FP64 -using fptype = double; -using dataType = sycl::cl_double; -#else -using fptype = float; -using dataType = sycl::cl_float; -#endif - -constexpr fptype IMM_ARGUMENT = 0.5; +template constexpr T IMM_ARGUMENT = T(0.5); -template -struct KernelFunctor : WithInputBuffers, WithOutputBuffer { - KernelFunctor(const std::vector &input) - : WithInputBuffers(input), WithOutputBuffer(input.size()) {} +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) + : WithInputBuffers(input), WithOutputBuffer(input.size()) {} void operator()(sycl::handler &cgh) { auto A = @@ -32,36 +24,49 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { this->getOutputBuffer().template get_access( cgh); - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" : "=rw"(B[wiID]) - : "rw"(A[wiID]), "i"(IMM_ARGUMENT)); + : "rw"(A[wiID]), "i"(IMM_ARGUMENT)); #else - B[wiID] = A[wiID] * IMM_ARGUMENT; + B[wiID] = A[wiID] * IMM_ARGUMENT; #endif }); } }; -int main() { - std::vector 
input(DEFAULT_PROBLEM_SIZE); +template bool check() { + constexpr T1 IMM_ARGUMENT = T1(0.5); + + std::vector input(DEFAULT_PROBLEM_SIZE); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) - input[i] = (fptype)1 / std::pow(2, i); + input[i] = (T1)1 / std::pow(2, i); - KernelFunctor<> f(input); + KernelFunctor f(input); if (!launchInlineASMTest(f)) - return 0; + return true; auto &B = f.getOutputBufferData(); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { if (B[i] != input[i] * IMM_ARGUMENT) { std::cerr << "At index: " << i << ". "; std::cerr << B[i] << " != " << input[i] * IMM_ARGUMENT << "\n"; - return 1; + return false; } } - return 0; + return true; +} + +int main() { + bool Passed = true; + + Passed &= check(); +#ifdef ENABLE_FP64 + Passed &= check(); +#endif + + return Passed ? 0 : 1; } diff --git a/SYCL/KernelParams/union_kernel_param.cpp b/SYCL/KernelParams/union_kernel_param.cpp index aa005d0fdb..0ab2c1d321 100644 --- a/SYCL/KernelParams/union_kernel_param.cpp +++ b/SYCL/KernelParams/union_kernel_param.cpp @@ -9,38 +9,44 @@ #include #include -#ifdef ENABLE_FP64 -using fptype = double; -#else -using fptype = float; -#endif - -union TestUnion { +template union TestUnion { public: int myint; char mychar; - fptype mytype; + T mytype; TestUnion() { mytype = 0.0; }; }; -int main(int argc, char **argv) { - TestUnion x; +template bool check() { + TestUnion x; x.mytype = 5.0; - fptype mytype = 0.0; + T mytype = 0.0; sycl::queue queue; { - sycl::buffer buf(&mytype, 1); + sycl::buffer buf(&mytype, 1); queue.submit([&](sycl::handler &cgh) { - auto acc = buf.get_access(cgh); + auto acc = buf.template get_access(cgh); cgh.single_task([=]() { acc[0] = x.mytype; }); }); } if (mytype != 5.0) { printf("FAILED\nmytype = %d\n", mytype); - return 1; + return false; } - return 0; + return true; +} + +int main(int argc, char **argv) { + bool Passed = true; + + Passed &= check(); +#ifdef ENABLE_FP64 + Passed &= check(); +#endif + + return Passed ? 
0 : 1; } + diff --git a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp index 3a4894d498..fef972a51c 100644 --- a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp +++ b/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp @@ -9,8 +9,7 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out // -//==-- generic_shuffle_aspect-fp64.cpp - SYCL sub_group generic shuffle test *- -// C++ -*--==// +//==-------------- generic_shuffle_aspect-fp64.cpp -*- C++ -*---------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/USM/copy_aspect-fp64.cpp b/SYCL/USM/copy_aspect-fp64.cpp index 1de25cf5b8..b02bbbf52a 100644 --- a/SYCL/USM/copy_aspect-fp64.cpp +++ b/SYCL/USM/copy_aspect-fp64.cpp @@ -1,5 +1,4 @@ -//==---- copy_aspect-fp64.cp - USM copy test -//------------------------------------------==// +//==--------------- copy_aspect-fp64.cp - USM copy test --------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/USM/fill_aspect-fp64.cpp b/SYCL/USM/fill_aspect-fp64.cpp index 7b73a62d91..ac602b709e 100644 --- a/SYCL/USM/fill_aspect-fp64.cpp +++ b/SYCL/USM/fill_aspect-fp64.cpp @@ -1,5 +1,4 @@ -//==---- fill_aspect-fp64.cpp - USM fill test for double type -//---------------==// +//==------- fill_aspect-fp64.cpp - USM fill test for double type -----------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
From dc24d24386d63c4c8f36be7e9c570f0cfac1c823 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Mon, 29 Aug 2022 16:37:25 +0800 Subject: [PATCH 29/35] fix clang-format issue --- SYCL/ESIMD/regression/Inputs/dgetrf.hpp | 20 ++++++++-------- SYCL/ESIMD/regression/dgetrf_8x8.cpp | 29 ++++++++++++------------ SYCL/InlineAsm/asm_float_imm_arg.cpp | 2 +- SYCL/KernelParams/union_kernel_param.cpp | 1 - 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp index beefa32580..2ccf5f5356 100644 --- a/SYCL/ESIMD/regression/Inputs/dgetrf.hpp +++ b/SYCL/ESIMD/regression/Inputs/dgetrf.hpp @@ -46,7 +46,7 @@ using namespace sycl; using namespace std; using namespace sycl::ext::intel::esimd; -template ESIMD_PRIVATE ESIMD_REGISTER(192) simd GRF; +template ESIMD_PRIVATE ESIMD_REGISTER(192) simd GRF; #define V(x, w, i) (x).template select(i) #define V1(x, i) V(x, 1, i) @@ -299,10 +299,11 @@ ESIMD_INLINE void dgetrfnp_esimd(int64_t m, int64_t n, T *a, int64_t lda, #endif // defined(USE_REF) } -template void dgetrfnp_batch_strided_c(int64_t m, int64_t n, - T *a, int64_t lda, int64_t stride_a, - int64_t *ipiv, int64_t stride_ipiv, - int64_t batch, int64_t *info) { +template +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, T *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info) { queue queue((gpu_selector())); auto device = queue.get_device(); auto context = queue.get_context(); @@ -386,11 +387,10 @@ static T fp_norm1(int64_t m, int64_t n, T *a, int64_t lda) { } template -static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, T *a_in, - T *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info) { +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, T *a_in, T *a, + int64_t lda, int64_t stride_a, + int64_t *ipiv, int64_t stride_ipiv, + int64_t batch, int64_t *info) { T thresh = 
30.0; int fail = 0; int64_t i, j, k, l; diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index c7ac320868..ce35cc6b03 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -48,14 +48,15 @@ using namespace sycl; using namespace std; using namespace sycl::ext::intel::esimd; -template ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; +template ESIMD_PRIVATE ESIMD_REGISTER(384) simd GRF; #define V(x, w, i) (x).template select(i) #define V1(x, i) V(x, 1, i) #define V8(x, i) V(x, 8, i) #define BCAST8(x, i) (x).template replicate_w<8, 1>(i) -template ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { +template +ESIMD_INLINE void dgetrfnp_panel(int64_t *info) { auto a = V(GRF, M * N, 0); for (int kk = 0; kk < N; kk += 8) { simd_mask<8> mask = 1; @@ -125,10 +126,11 @@ ESIMD_INLINE void dgetrfnp_esimd_8x8(T *a, int64_t lda, int64_t *ipiv, dgetrfnp_left_step(a, lda, info); } -template void dgetrfnp_batch_strided_c(int64_t m, int64_t n, - T *a, int64_t lda, int64_t stride_a, - int64_t *ipiv, int64_t stride_ipiv, - int64_t batch, int64_t *info) { +template +void dgetrfnp_batch_strided_c(int64_t m, int64_t n, T *a, int64_t lda, + int64_t stride_a, int64_t *ipiv, + int64_t stride_ipiv, int64_t batch, + int64_t *info) { queue queue((gpu_selector())); auto device = queue.get_device(); auto context = queue.get_context(); @@ -212,21 +214,20 @@ static T fp_norm1(int64_t m, int64_t n, T *a, int64_t lda) { } template -static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, T *a_in, - T *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info) { +static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, T *a_in, T *a, + int64_t lda, int64_t stride_a, + int64_t *ipiv, int64_t stride_ipiv, + int64_t batch, int64_t *info) { T thresh = 30.0; int fail = 0; int64_t i, j, k, l; char label[1024]; unsigned char prec_b1[] = {0, 0, 0xb0, 0x3c}; unsigned char 
prec_b2[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; - T res = 0.0, nrm = 0.0, ulp - if (std::is_same::value) { + T res = 0.0, nrm = 0.0, ulp if (std::is_same::value) { ulp = *(T *)prec_b2; - } else { + } + else { ulp = *(T *)prec_b1; }; diff --git a/SYCL/InlineAsm/asm_float_imm_arg.cpp b/SYCL/InlineAsm/asm_float_imm_arg.cpp index 8c4b05fc32..3946beaefa 100644 --- a/SYCL/InlineAsm/asm_float_imm_arg.cpp +++ b/SYCL/InlineAsm/asm_float_imm_arg.cpp @@ -9,7 +9,7 @@ #include #include -template constexpr T IMM_ARGUMENT = T(0.5); +template constexpr T IMM_ARGUMENT = T(0.5); template struct KernelFunctor : WithInputBuffers, WithOutputBuffer { diff --git a/SYCL/KernelParams/union_kernel_param.cpp b/SYCL/KernelParams/union_kernel_param.cpp index 0ab2c1d321..c1f6ea1ca5 100644 --- a/SYCL/KernelParams/union_kernel_param.cpp +++ b/SYCL/KernelParams/union_kernel_param.cpp @@ -49,4 +49,3 @@ int main(int argc, char **argv) { return Passed ? 0 : 1; } - From f22ff9a5a152921ab19b2c93e196ca8a4109c62f Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Mon, 29 Aug 2022 21:27:13 +0800 Subject: [PATCH 30/35] typo fix in dgetrf_8x8.cpp --- SYCL/ESIMD/regression/dgetrf_8x8.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index ce35cc6b03..536899d3da 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -224,7 +224,8 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, T *a_in, T *a, char label[1024]; unsigned char prec_b1[] = {0, 0, 0xb0, 0x3c}; unsigned char prec_b2[] = {0, 0, 0, 0, 0, 0, 0xb0, 0x3c}; - T res = 0.0, nrm = 0.0, ulp if (std::is_same::value) { + T res = 0.0, nrm = 0.0, ulp; + if (std::is_same::value) { ulp = *(T *)prec_b2; } else { From 9904e763e90f7440ac29180888017b5cc8fce82f Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Mon, 29 Aug 2022 21:35:40 +0800 Subject: [PATCH 31/35] fix clang-format issue --- 
SYCL/ESIMD/regression/dgetrf_8x8.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index 536899d3da..4551ac8762 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -227,8 +227,7 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, T *a_in, T *a, T res = 0.0, nrm = 0.0, ulp; if (std::is_same::value) { ulp = *(T *)prec_b2; - } - else { + } else { ulp = *(T *)prec_b1; }; From 3f55d85d9db852ecd1396a3beb3fe66be601d0cc Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 30 Aug 2022 10:41:52 +0800 Subject: [PATCH 32/35] 1) add REQUIRES:aspect-fp64 to restrict some 'double' type tests running on Gen12 or later 2) fix typo issue in dgetrf_8x8.cpp 3) split test replicate_smoke.cpp 4) resolve conflict in SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp --- SYCL/ESIMD/aot_mixed.cpp | 4 ++-- SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp | 3 +++ .../ctors/ctor_converting_fp_extra.cpp | 2 +- .../ctors/ctor_load_acc_fp_extra.cpp | 2 +- .../ctors/ctor_load_usm_fp_extra.cpp | 2 +- ...rement_and_increment_accuracy_fp_extra.cpp | 2 +- ...rator_decrement_and_increment_fp_extra.cpp | 2 +- SYCL/ESIMD/api/replicate_smoke.cpp | 2 ++ .../ESIMD/api/replicate_smoke_aspect-fp64.cpp | 19 +++++++++++++++++++ SYCL/ESIMD/regression/dgetrf_8x8.cpp | 7 +------ SYCL/ESIMD/spec_const/spec_const_double.cpp | 2 +- SYCL/SubGroup/broadcast_fp64.cpp | 1 + SYCL/SubGroup/shuffle_fp64.cpp | 1 + 13 files changed, 35 insertions(+), 14 deletions(-) create mode 100644 SYCL/ESIMD/api/replicate_smoke_aspect-fp64.cpp diff --git a/SYCL/ESIMD/aot_mixed.cpp b/SYCL/ESIMD/aot_mixed.cpp index 0f237a558a..4d9a1857ca 100644 --- a/SYCL/ESIMD/aot_mixed.cpp +++ b/SYCL/ESIMD/aot_mixed.cpp @@ -8,9 +8,9 @@ // REQUIRES: gpu // UNSUPPORTED: cuda || hip // UNSUPPORTED: esimd_emulator -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" -o
%t.sycl.out -DENABLE_SYCL=0 %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device *" -o %t.sycl.out -DENABLE_SYCL=0 %s // RUN: %GPU_RUN_PLACEHOLDER %t.sycl.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" -o %t.out %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device *" -o %t.out %s // RUN: %GPU_RUN_PLACEHOLDER %t.out // This test checks the following ESIMD ahead-of-time compilation scenarios: diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp index 43fcee2416..656c244647 100644 --- a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp +++ b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy.cpp @@ -290,6 +290,9 @@ int main(void) { auto cmp_ops = esimd_test::CmpOps; passed &= test(cmp_ops, q); passed &= test(cmp_ops, q); +#ifdef ENABLE_FP64 + passed &= test(arith_ops, q, 1e-15); +#endif passed &= test(cmp_ops, q); passed &= test(cmp_ops, q, 1); passed &= test(cmp_ops, q, 1); diff --git a/SYCL/ESIMD/api/functional/ctors/ctor_converting_fp_extra.cpp b/SYCL/ESIMD/api/functional/ctors/ctor_converting_fp_extra.cpp index 39f9be9fcd..3cd6e5c556 100644 --- a/SYCL/ESIMD/api/functional/ctors/ctor_converting_fp_extra.cpp +++ b/SYCL/ESIMD/api/functional/ctors/ctor_converting_fp_extra.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, level_zero +// REQUIRES: gpu, level_zero, aspect-fp64 // XREQUIRES: gpu // TODO gpu and level_zero in REQUIRES due to only this platforms supported yet. 
// The current "REQUIRES" should be replaced with "gpu" only as mentioned in diff --git a/SYCL/ESIMD/api/functional/ctors/ctor_load_acc_fp_extra.cpp b/SYCL/ESIMD/api/functional/ctors/ctor_load_acc_fp_extra.cpp index 797ff009c8..4cf2aa7add 100644 --- a/SYCL/ESIMD/api/functional/ctors/ctor_load_acc_fp_extra.cpp +++ b/SYCL/ESIMD/api/functional/ctors/ctor_load_acc_fp_extra.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, level_zero +// REQUIRES: gpu, level_zero, aspect-fp64 // XREQUIRES: gpu // TODO gpu and level_zero in REQUIRES due to only this platforms supported yet. // The current "REQUIRES" should be replaced with "gpu" only as mentioned in diff --git a/SYCL/ESIMD/api/functional/ctors/ctor_load_usm_fp_extra.cpp b/SYCL/ESIMD/api/functional/ctors/ctor_load_usm_fp_extra.cpp index 4b960879c1..4e8bb1414f 100644 --- a/SYCL/ESIMD/api/functional/ctors/ctor_load_usm_fp_extra.cpp +++ b/SYCL/ESIMD/api/functional/ctors/ctor_load_usm_fp_extra.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, level_zero +// REQUIRES: gpu, level_zero, aspect-fp64 // XREQUIRES: gpu // TODO gpu and level_zero in REQUIRES due to only this platforms supported yet. 
// The current "REQUIRES" should be replaced with "gpu" only as mentioned in diff --git a/SYCL/ESIMD/api/functional/operators/operator_decrement_and_increment_accuracy_fp_extra.cpp b/SYCL/ESIMD/api/functional/operators/operator_decrement_and_increment_accuracy_fp_extra.cpp index 5e38a3b7cf..bba6deaa45 100644 --- a/SYCL/ESIMD/api/functional/operators/operator_decrement_and_increment_accuracy_fp_extra.cpp +++ b/SYCL/ESIMD/api/functional/operators/operator_decrement_and_increment_accuracy_fp_extra.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, level_zero +// REQUIRES: gpu, level_zero, aspect-fp64 // XREQUIRES: gpu // TODO gpu and level_zero in REQUIRES due to only this platforms supported yet. // The current "REQUIRES" should be replaced with "gpu" only as mentioned in diff --git a/SYCL/ESIMD/api/functional/operators/operator_decrement_and_increment_fp_extra.cpp b/SYCL/ESIMD/api/functional/operators/operator_decrement_and_increment_fp_extra.cpp index 7f88d0ad96..87355f845d 100644 --- a/SYCL/ESIMD/api/functional/operators/operator_decrement_and_increment_fp_extra.cpp +++ b/SYCL/ESIMD/api/functional/operators/operator_decrement_and_increment_fp_extra.cpp @@ -6,7 +6,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu, level_zero +// REQUIRES: gpu, level_zero, aspect-fp64 // XREQUIRES: gpu // TODO gpu and level_zero in REQUIRES due to only this platforms supported yet. 
// The current "REQUIRES" should be replaced with "gpu" only as mentioned in diff --git a/SYCL/ESIMD/api/replicate_smoke.cpp b/SYCL/ESIMD/api/replicate_smoke.cpp index c11ca8b05b..cd9ddc0575 100644 --- a/SYCL/ESIMD/api/replicate_smoke.cpp +++ b/SYCL/ESIMD/api/replicate_smoke.cpp @@ -184,7 +184,9 @@ int main(int argc, char **argv) { passed &= test(q); passed &= test(q); passed &= test(q); +#ifdef ENABLE_FP64 passed &= test(q); +#endif std::cout << (passed ? "Test passed\n" : "Test FAILED\n"); return passed ? 0 : 1; diff --git a/SYCL/ESIMD/api/replicate_smoke_aspect-fp64.cpp b/SYCL/ESIMD/api/replicate_smoke_aspect-fp64.cpp new file mode 100644 index 0000000000..ad587c4df8 --- /dev/null +++ b/SYCL/ESIMD/api/replicate_smoke_aspect-fp64.cpp @@ -0,0 +1,19 @@ +//==------- replicate_smoke.cpp - DPC++ ESIMD on-device test --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Enable FP64 part of . To be removed once DPC++ +// supports optional device features and the code could be enabled +// unconditionally without causing failures in speculative compilation +// of the kernels. 
+// +// REQUIRES: gpu, aspect-fp64 +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl -DENABLE_FP64 %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// +#include "replicate_smoke.cpp" diff --git a/SYCL/ESIMD/regression/dgetrf_8x8.cpp b/SYCL/ESIMD/regression/dgetrf_8x8.cpp index 4551ac8762..4604e78c2e 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8.cpp @@ -109,7 +109,7 @@ ESIMD_INLINE void dgetrfnp_left_step(T *a, int64_t lda, int64_t *info) { V8(p1, j * M + i) = data; } // (getrf) factorize T=P*L*U - dgetrfnp_panel(info); + dgetrfnp_panel(info); // store P1 for (j = 0, a1 = a + K * lda; j < N; j++, a1 += lda) @@ -279,11 +279,6 @@ static int dgetrfnp_batch_strided_check(int64_t m, int64_t n, T *a_in, T *a, return fail; } -void dgetrfnp_batch_strided_c(int64_t m, int64_t n, T *a, int64_t lda, - int64_t stride_a, int64_t *ipiv, - int64_t stride_ipiv, int64_t batch, - int64_t *info); - template int check(int argc, char *argv[]) { int exit_status = 0; constexpr int64_t m = 8, n = 8, lda = 8; diff --git a/SYCL/ESIMD/spec_const/spec_const_double.cpp b/SYCL/ESIMD/spec_const/spec_const_double.cpp index 658164f702..8358a4fa82 100644 --- a/SYCL/ESIMD/spec_const/spec_const_double.cpp +++ b/SYCL/ESIMD/spec_const/spec_const_double.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu +// REQUIRES: gpu, aspect-fp64 // RUN: %clangxx -fsycl -I%S/.. 
%s -o %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // UNSUPPORTED: cuda || hip diff --git a/SYCL/SubGroup/broadcast_fp64.cpp b/SYCL/SubGroup/broadcast_fp64.cpp index 43fea7165b..7966557385 100644 --- a/SYCL/SubGroup/broadcast_fp64.cpp +++ b/SYCL/SubGroup/broadcast_fp64.cpp @@ -1,3 +1,4 @@ +// REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out diff --git a/SYCL/SubGroup/shuffle_fp64.cpp b/SYCL/SubGroup/shuffle_fp64.cpp index 2ad6a49e7d..13a074ad0d 100644 --- a/SYCL/SubGroup/shuffle_fp64.cpp +++ b/SYCL/SubGroup/shuffle_fp64.cpp @@ -1,3 +1,4 @@ +// REQUIRES: aspect-fp64 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out From bfcff9e09beed4d62dc260c29e83f2a3ae0cbe5d Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Tue, 30 Aug 2022 14:56:59 +0800 Subject: [PATCH 33/35] use '-device gen12lp' for aot_mixed.cpp --- SYCL/ESIMD/aot_mixed.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SYCL/ESIMD/aot_mixed.cpp b/SYCL/ESIMD/aot_mixed.cpp index 4d9a1857ca..366abf19d8 100644 --- a/SYCL/ESIMD/aot_mixed.cpp +++ b/SYCL/ESIMD/aot_mixed.cpp @@ -8,9 +8,9 @@ // REQUIRES: gpu // UNSUPPORTED: cuda || hip // UNSUPPORTED: esimd_emulator -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device *" -o %t.sycl.out -DENABLE_SYCL=0 %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" -o %t.sycl.out -DENABLE_SYCL=0 %s // RUN: %GPU_RUN_PLACEHOLDER %t.sycl.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device *" -o %t.out %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" -o %t.out %s // RUN: %GPU_RUN_PLACEHOLDER %t.out // This test checks the following ESIMD ahead-of-time compilation scenarios: From 
86e20be56d213abdd2c9ab1b3c14ec72ae8e41ec Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Wed, 31 Aug 2022 09:13:33 +0800 Subject: [PATCH 34/35] rename aspect-fp64 to aspect_fp64 to stick to underscores and not mix them with '-' --- ...omic64_aspect-fp64.cpp => assignment_atomic64_aspect_fp64.cpp} | 0 ...spect-fp64.cpp => assignment_atomic64_generic_aspect_fp64.cpp} | 0 .../buffer/{buffer_aspect-fp64.cpp => buffer_aspect_fp64.cpp} | 0 ...s_aspect-fp64.cpp => specialization_constants_aspect_fp64.cpp} | 0 ...fp64.cpp => specialization_constants_override_aspect_fp64.cpp} | 0 .../built-ins/{nan_aspect-fp64.cpp => nan_aspect_fp64.cpp} | 0 ...eavy_aspect-fp64.cpp => bin_and_cmp_ops_heavy_aspect_fp64.cpp} | 0 ...e_aspect-fp64.cpp => functions_select_2d_core_aspect_fp64.cpp} | 0 ...cate_smoke_aspect-fp64.cpp => replicate_smoke_aspect_fp64.cpp} | 0 ...ion_smoke_aspect-fp64.cpp => saturation_smoke_aspect_fp64.cpp} | 0 ..._fp_aspect-fp64.cpp => simd_view_select_2d_fp_aspect_fp64.cpp} | 0 ..._ops_heavy_aspect-fp64.cpp => unary_ops_heavy_aspect_fp64.cpp} | 0 SYCL/ESIMD/{ext_math_aspect-fp64.cpp => ext_math_aspect_fp64.cpp} | 0 .../{dgetrf_8x8_aspect-fp64.cpp => dgetrf_8x8_aspect_fp64.cpp} | 0 .../regression/{dgetrf_aspect-fp64.cpp => dgetrf_aspect_fp64.cpp} | 0 .../{dgetrf_ref_aspect-fp64.cpp => dgetrf_ref_aspect_fp64.cpp} | 0 .../SYCL2020/{sort_aspect-fp64.cpp => sort_aspect_fp64.cpp} | 0 ...sm_float_add_aspect-fp64.cpp => asm_float_add_aspect_fp64.cpp} | 0 ..._imm_arg_aspect-fp64.cpp => asm_float_imm_arg_aspect_fp64.cpp} | 0 ...l_param_aspect-fp64.cpp => union_kernel_param_aspect_fp64.cpp} | 0 .../{handler-api_aspect-fp64.cpp => handler-api_aspect_fp64.cpp} | 0 ...ndle-api_aspect-fp64.cpp => kernel-bundle-api_aspect_fp64.cpp} | 0 .../SubGroup/{barrier_aspect-fp64.cpp => barrier_aspect_fp64.cpp} | 0 ...ic-shuffle_aspect-fp64.cpp => generic-shuffle_aspect_fp64.cpp} | 0 SYCL/SubGroup/{info_aspect-fp64.cpp => info_aspect_fp64.cpp} | 0 .../{load_store_aspect-fp64.cpp =>
load_store_aspect_fp64.cpp} | 0 SYCL/USM/{copy_aspect-fp64.cpp => copy_aspect_fp64.cpp} | 0 SYCL/USM/{fill_aspect-fp64.cpp => fill_aspect_fp64.cpp} | 0 28 files changed, 0 insertions(+), 0 deletions(-) rename SYCL/AtomicRef/{assignment_atomic64_aspect-fp64.cpp => assignment_atomic64_aspect_fp64.cpp} (100%) rename SYCL/AtomicRef/{assignment_atomic64_generic_aspect-fp64.cpp => assignment_atomic64_generic_aspect_fp64.cpp} (100%) rename SYCL/Basic/buffer/{buffer_aspect-fp64.cpp => buffer_aspect_fp64.cpp} (100%) rename SYCL/DeprecatedFeatures/SpecConsts1.2.1/{specialization_constants_aspect-fp64.cpp => specialization_constants_aspect_fp64.cpp} (100%) rename SYCL/DeprecatedFeatures/SpecConsts1.2.1/{specialization_constants_override_aspect-fp64.cpp => specialization_constants_override_aspect_fp64.cpp} (100%) rename SYCL/DeviceLib/built-ins/{nan_aspect-fp64.cpp => nan_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/api/{bin_and_cmp_ops_heavy_aspect-fp64.cpp => bin_and_cmp_ops_heavy_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/api/functional/functions/{functions_select_2d_core_aspect-fp64.cpp => functions_select_2d_core_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/api/{replicate_smoke_aspect-fp64.cpp => replicate_smoke_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/api/{saturation_smoke_aspect-fp64.cpp => saturation_smoke_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/api/{simd_view_select_2d_fp_aspect-fp64.cpp => simd_view_select_2d_fp_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/api/{unary_ops_heavy_aspect-fp64.cpp => unary_ops_heavy_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/{ext_math_aspect-fp64.cpp => ext_math_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/regression/{dgetrf_8x8_aspect-fp64.cpp => dgetrf_8x8_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/regression/{dgetrf_aspect-fp64.cpp => dgetrf_aspect_fp64.cpp} (100%) rename SYCL/ESIMD/regression/{dgetrf_ref_aspect-fp64.cpp => dgetrf_ref_aspect_fp64.cpp} (100%) rename SYCL/GroupAlgorithm/SYCL2020/{sort_aspect-fp64.cpp => sort_aspect_fp64.cpp} (100%) 
rename SYCL/InlineAsm/{asm_float_add_aspect-fp64.cpp => asm_float_add_aspect_fp64.cpp} (100%) rename SYCL/InlineAsm/{asm_float_imm_arg_aspect-fp64.cpp => asm_float_imm_arg_aspect_fp64.cpp} (100%) rename SYCL/KernelParams/{union_kernel_param_aspect-fp64.cpp => union_kernel_param_aspect_fp64.cpp} (100%) rename SYCL/SpecConstants/2020/{handler-api_aspect-fp64.cpp => handler-api_aspect_fp64.cpp} (100%) rename SYCL/SpecConstants/2020/{kernel-bundle-api_aspect-fp64.cpp => kernel-bundle-api_aspect_fp64.cpp} (100%) rename SYCL/SubGroup/{barrier_aspect-fp64.cpp => barrier_aspect_fp64.cpp} (100%) rename SYCL/SubGroup/{generic-shuffle_aspect-fp64.cpp => generic-shuffle_aspect_fp64.cpp} (100%) rename SYCL/SubGroup/{info_aspect-fp64.cpp => info_aspect_fp64.cpp} (100%) rename SYCL/SubGroup/{load_store_aspect-fp64.cpp => load_store_aspect_fp64.cpp} (100%) rename SYCL/USM/{copy_aspect-fp64.cpp => copy_aspect_fp64.cpp} (100%) rename SYCL/USM/{fill_aspect-fp64.cpp => fill_aspect_fp64.cpp} (100%) diff --git a/SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp b/SYCL/AtomicRef/assignment_atomic64_aspect_fp64.cpp similarity index 100% rename from SYCL/AtomicRef/assignment_atomic64_aspect-fp64.cpp rename to SYCL/AtomicRef/assignment_atomic64_aspect_fp64.cpp diff --git a/SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp b/SYCL/AtomicRef/assignment_atomic64_generic_aspect_fp64.cpp similarity index 100% rename from SYCL/AtomicRef/assignment_atomic64_generic_aspect-fp64.cpp rename to SYCL/AtomicRef/assignment_atomic64_generic_aspect_fp64.cpp diff --git a/SYCL/Basic/buffer/buffer_aspect-fp64.cpp b/SYCL/Basic/buffer/buffer_aspect_fp64.cpp similarity index 100% rename from SYCL/Basic/buffer/buffer_aspect-fp64.cpp rename to SYCL/Basic/buffer/buffer_aspect_fp64.cpp diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect_fp64.cpp similarity index 100% rename from 
SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect-fp64.cpp rename to SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_aspect_fp64.cpp diff --git a/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp b/SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect_fp64.cpp similarity index 100% rename from SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect-fp64.cpp rename to SYCL/DeprecatedFeatures/SpecConsts1.2.1/specialization_constants_override_aspect_fp64.cpp diff --git a/SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp b/SYCL/DeviceLib/built-ins/nan_aspect_fp64.cpp similarity index 100% rename from SYCL/DeviceLib/built-ins/nan_aspect-fp64.cpp rename to SYCL/DeviceLib/built-ins/nan_aspect_fp64.cpp diff --git a/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect-fp64.cpp rename to SYCL/ESIMD/api/bin_and_cmp_ops_heavy_aspect_fp64.cpp diff --git a/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp b/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect-fp64.cpp rename to SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect_fp64.cpp diff --git a/SYCL/ESIMD/api/replicate_smoke_aspect-fp64.cpp b/SYCL/ESIMD/api/replicate_smoke_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/api/replicate_smoke_aspect-fp64.cpp rename to SYCL/ESIMD/api/replicate_smoke_aspect_fp64.cpp diff --git a/SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp b/SYCL/ESIMD/api/saturation_smoke_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/api/saturation_smoke_aspect-fp64.cpp rename to SYCL/ESIMD/api/saturation_smoke_aspect_fp64.cpp diff --git 
a/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp b/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/api/simd_view_select_2d_fp_aspect-fp64.cpp rename to SYCL/ESIMD/api/simd_view_select_2d_fp_aspect_fp64.cpp diff --git a/SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp b/SYCL/ESIMD/api/unary_ops_heavy_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/api/unary_ops_heavy_aspect-fp64.cpp rename to SYCL/ESIMD/api/unary_ops_heavy_aspect_fp64.cpp diff --git a/SYCL/ESIMD/ext_math_aspect-fp64.cpp b/SYCL/ESIMD/ext_math_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/ext_math_aspect-fp64.cpp rename to SYCL/ESIMD/ext_math_aspect_fp64.cpp diff --git a/SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_8x8_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/regression/dgetrf_8x8_aspect-fp64.cpp rename to SYCL/ESIMD/regression/dgetrf_8x8_aspect_fp64.cpp diff --git a/SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/regression/dgetrf_aspect-fp64.cpp rename to SYCL/ESIMD/regression/dgetrf_aspect_fp64.cpp diff --git a/SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp b/SYCL/ESIMD/regression/dgetrf_ref_aspect_fp64.cpp similarity index 100% rename from SYCL/ESIMD/regression/dgetrf_ref_aspect-fp64.cpp rename to SYCL/ESIMD/regression/dgetrf_ref_aspect_fp64.cpp diff --git a/SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp b/SYCL/GroupAlgorithm/SYCL2020/sort_aspect_fp64.cpp similarity index 100% rename from SYCL/GroupAlgorithm/SYCL2020/sort_aspect-fp64.cpp rename to SYCL/GroupAlgorithm/SYCL2020/sort_aspect_fp64.cpp diff --git a/SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp b/SYCL/InlineAsm/asm_float_add_aspect_fp64.cpp similarity index 100% rename from SYCL/InlineAsm/asm_float_add_aspect-fp64.cpp rename to SYCL/InlineAsm/asm_float_add_aspect_fp64.cpp diff --git 
a/SYCL/InlineAsm/asm_float_imm_arg_aspect-fp64.cpp b/SYCL/InlineAsm/asm_float_imm_arg_aspect_fp64.cpp similarity index 100% rename from SYCL/InlineAsm/asm_float_imm_arg_aspect-fp64.cpp rename to SYCL/InlineAsm/asm_float_imm_arg_aspect_fp64.cpp diff --git a/SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp b/SYCL/KernelParams/union_kernel_param_aspect_fp64.cpp similarity index 100% rename from SYCL/KernelParams/union_kernel_param_aspect-fp64.cpp rename to SYCL/KernelParams/union_kernel_param_aspect_fp64.cpp diff --git a/SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp b/SYCL/SpecConstants/2020/handler-api_aspect_fp64.cpp similarity index 100% rename from SYCL/SpecConstants/2020/handler-api_aspect-fp64.cpp rename to SYCL/SpecConstants/2020/handler-api_aspect_fp64.cpp diff --git a/SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp b/SYCL/SpecConstants/2020/kernel-bundle-api_aspect_fp64.cpp similarity index 100% rename from SYCL/SpecConstants/2020/kernel-bundle-api_aspect-fp64.cpp rename to SYCL/SpecConstants/2020/kernel-bundle-api_aspect_fp64.cpp diff --git a/SYCL/SubGroup/barrier_aspect-fp64.cpp b/SYCL/SubGroup/barrier_aspect_fp64.cpp similarity index 100% rename from SYCL/SubGroup/barrier_aspect-fp64.cpp rename to SYCL/SubGroup/barrier_aspect_fp64.cpp diff --git a/SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp b/SYCL/SubGroup/generic-shuffle_aspect_fp64.cpp similarity index 100% rename from SYCL/SubGroup/generic-shuffle_aspect-fp64.cpp rename to SYCL/SubGroup/generic-shuffle_aspect_fp64.cpp diff --git a/SYCL/SubGroup/info_aspect-fp64.cpp b/SYCL/SubGroup/info_aspect_fp64.cpp similarity index 100% rename from SYCL/SubGroup/info_aspect-fp64.cpp rename to SYCL/SubGroup/info_aspect_fp64.cpp diff --git a/SYCL/SubGroup/load_store_aspect-fp64.cpp b/SYCL/SubGroup/load_store_aspect_fp64.cpp similarity index 100% rename from SYCL/SubGroup/load_store_aspect-fp64.cpp rename to SYCL/SubGroup/load_store_aspect_fp64.cpp diff --git a/SYCL/USM/copy_aspect-fp64.cpp 
b/SYCL/USM/copy_aspect_fp64.cpp similarity index 100% rename from SYCL/USM/copy_aspect-fp64.cpp rename to SYCL/USM/copy_aspect_fp64.cpp diff --git a/SYCL/USM/fill_aspect-fp64.cpp b/SYCL/USM/fill_aspect_fp64.cpp similarity index 100% rename from SYCL/USM/fill_aspect-fp64.cpp rename to SYCL/USM/fill_aspect_fp64.cpp From 2fbfbbe565228857366caf3a7d07f87ecb77b791 Mon Sep 17 00:00:00 2001 From: Yanliang Mu Date: Wed, 31 Aug 2022 09:27:37 +0800 Subject: [PATCH 35/35] also to rename the file markup --- .../functions/functions_select_2d_core_aspect_fp64.cpp | 2 +- SYCL/ESIMD/api/saturation_smoke_aspect_fp64.cpp | 2 +- SYCL/ESIMD/api/simd_view_select_2d_fp_aspect_fp64.cpp | 2 +- SYCL/ESIMD/api/unary_ops_heavy_aspect_fp64.cpp | 2 +- SYCL/ESIMD/ext_math_aspect_fp64.cpp | 2 +- SYCL/ESIMD/regression/dgetrf_8x8_aspect_fp64.cpp | 2 +- SYCL/ESIMD/regression/dgetrf_aspect_fp64.cpp | 2 +- SYCL/ESIMD/regression/dgetrf_ref_aspect_fp64.cpp | 2 +- SYCL/SubGroup/barrier_aspect_fp64.cpp | 2 +- SYCL/SubGroup/load_store_aspect_fp64.cpp | 2 +- SYCL/USM/fill_aspect_fp64.cpp | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect_fp64.cpp b/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect_fp64.cpp index f1a57bbe72..73071ff8ca 100644 --- a/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect_fp64.cpp +++ b/SYCL/ESIMD/api/functional/functions/functions_select_2d_core_aspect_fp64.cpp @@ -1,4 +1,4 @@ -//==- functions_select_2d_core_aspect-fp64.cpp - DPC++ ESIMD on-device test ==// +//==- functions_select_2d_core_aspect_fp64.cpp - DPC++ ESIMD on-device test ==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/SYCL/ESIMD/api/saturation_smoke_aspect_fp64.cpp b/SYCL/ESIMD/api/saturation_smoke_aspect_fp64.cpp index 1edf0ff409..c1c0180911 100644 --- a/SYCL/ESIMD/api/saturation_smoke_aspect_fp64.cpp +++ b/SYCL/ESIMD/api/saturation_smoke_aspect_fp64.cpp @@ -1,4 +1,4 @@ -//==---- saturation_smoke_aspect-fp64.cpp - DPC++ ESIMD on-device test ----==// +//==---- saturation_smoke_aspect_fp64.cpp - DPC++ ESIMD on-device test ----==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect_fp64.cpp b/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect_fp64.cpp index 17a7fe90d1..5b46c63cdf 100644 --- a/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect_fp64.cpp +++ b/SYCL/ESIMD/api/simd_view_select_2d_fp_aspect_fp64.cpp @@ -1,4 +1,4 @@ -//==- simd_view_select_2d_fp_aspect-fp64.cpp - DPC++ ESIMD on-device test -==// +//==- simd_view_select_2d_fp_aspect_fp64.cpp - DPC++ ESIMD on-device test -==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/ESIMD/api/unary_ops_heavy_aspect_fp64.cpp b/SYCL/ESIMD/api/unary_ops_heavy_aspect_fp64.cpp index 99701b1298..dbf71de392 100644 --- a/SYCL/ESIMD/api/unary_ops_heavy_aspect_fp64.cpp +++ b/SYCL/ESIMD/api/unary_ops_heavy_aspect_fp64.cpp @@ -1,4 +1,4 @@ -//==------ unary_ops_heavy_aspect-fp64.cpp - DPC++ ESIMD on-device test ---==// +//==------ unary_ops_heavy_aspect_fp64.cpp - DPC++ ESIMD on-device test ---==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/SYCL/ESIMD/ext_math_aspect_fp64.cpp b/SYCL/ESIMD/ext_math_aspect_fp64.cpp index 1b44dda5b1..e519e916ea 100644 --- a/SYCL/ESIMD/ext_math_aspect_fp64.cpp +++ b/SYCL/ESIMD/ext_math_aspect_fp64.cpp @@ -1,4 +1,4 @@ -//==----- ext_math_aspect-fp64.cpp - DPC++ ESIMD extended math test -------==// +//==----- ext_math_aspect_fp64.cpp - DPC++ ESIMD extended math test -------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/ESIMD/regression/dgetrf_8x8_aspect_fp64.cpp b/SYCL/ESIMD/regression/dgetrf_8x8_aspect_fp64.cpp index 43403c5d7b..6adfc7d568 100644 --- a/SYCL/ESIMD/regression/dgetrf_8x8_aspect_fp64.cpp +++ b/SYCL/ESIMD/regression/dgetrf_8x8_aspect_fp64.cpp @@ -1,4 +1,4 @@ -//==------- dgetrf_8x8_aspect-fp64.cpp - DPC++ ESIMD on-device test -------==// +//==------- dgetrf_8x8_aspect_fp64.cpp - DPC++ ESIMD on-device test -------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/ESIMD/regression/dgetrf_aspect_fp64.cpp b/SYCL/ESIMD/regression/dgetrf_aspect_fp64.cpp index 0d3b0631cc..29c31249c1 100644 --- a/SYCL/ESIMD/regression/dgetrf_aspect_fp64.cpp +++ b/SYCL/ESIMD/regression/dgetrf_aspect_fp64.cpp @@ -1,4 +1,4 @@ -//==--------- dgetrf_aspect-fp64.cpp - DPC++ ESIMD on-device test ---------==// +//==--------- dgetrf_aspect_fp64.cpp - DPC++ ESIMD on-device test ---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/SYCL/ESIMD/regression/dgetrf_ref_aspect_fp64.cpp b/SYCL/ESIMD/regression/dgetrf_ref_aspect_fp64.cpp index c7040a716c..bb91041317 100644 --- a/SYCL/ESIMD/regression/dgetrf_ref_aspect_fp64.cpp +++ b/SYCL/ESIMD/regression/dgetrf_ref_aspect_fp64.cpp @@ -1,4 +1,4 @@ -//==------- dgetrf_ref_aspect-fp64.cpp - DPC++ ESIMD on-device test -------==// +//==------- dgetrf_ref_aspect_fp64.cpp - DPC++ ESIMD on-device test -------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/SubGroup/barrier_aspect_fp64.cpp b/SYCL/SubGroup/barrier_aspect_fp64.cpp index 1aed97757b..bb720a7087 100644 --- a/SYCL/SubGroup/barrier_aspect_fp64.cpp +++ b/SYCL/SubGroup/barrier_aspect_fp64.cpp @@ -9,7 +9,7 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -//==-- barrier_aspect-fp64.cpp - SYCL sub_group barrier test ---*- C++ -*---==// +//==-- barrier_aspect_fp64.cpp - SYCL sub_group barrier test ---*- C++ -*---==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/SYCL/SubGroup/load_store_aspect_fp64.cpp b/SYCL/SubGroup/load_store_aspect_fp64.cpp index b68d87dbbb..02b184d89e 100644 --- a/SYCL/SubGroup/load_store_aspect_fp64.cpp +++ b/SYCL/SubGroup/load_store_aspect_fp64.cpp @@ -13,7 +13,7 @@ // AMD // XFAIL: hip_amd // -//==----- load_store_aspect-fp64.cpp - SYCL sub_group load/store test ------==// +//==----- load_store_aspect_fp64.cpp - SYCL sub_group load/store test ------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/SYCL/USM/fill_aspect_fp64.cpp b/SYCL/USM/fill_aspect_fp64.cpp index ac602b709e..c542bd7c9e 100644 --- a/SYCL/USM/fill_aspect_fp64.cpp +++ b/SYCL/USM/fill_aspect_fp64.cpp @@ -1,4 +1,4 @@ -//==------- fill_aspect-fp64.cpp - USM fill test for double type -----------==// +//==------- fill_aspect_fp64.cpp - USM fill test for double type -----------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information.