From 0c51f6082760f15d268bb4320c832cebe91704e9 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Mon, 7 Aug 2023 17:37:39 -0700 Subject: [PATCH 01/24] [SYCL] Switch SPIR-V offload target to opaque pointers (#9828) Although there are a few tests failing due to this change, we need to go with this change to avoid future regressions and unblock changes removing typed pointers support. The regressions are supposed to be fixed by follow-up patches. --- llvm/CMakeLists.txt | 2 +- sycl/test-e2e/BFloat16/bfloat16_conversions.cpp | 3 +++ sycl/test-e2e/BFloat16/bfloat16_type.cpp | 4 +++- sycl/test-e2e/DeviceLib/string_test.cpp | 3 +++ sycl/test-e2e/ESIMD/ext_math.cpp | 3 +++ sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp | 3 +++ sycl/test-e2e/Reduction/reduction_usm.cpp | 3 +++ sycl/test-e2e/Regression/local-arg-align.cpp | 3 +++ 8 files changed, 22 insertions(+), 2 deletions(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 5f9099f793c0b..317997e9c3a8b 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -893,7 +893,7 @@ set(DPCPP_ENABLE_OPAQUE_POINTERS TRUE CACHE BOOL if (DPCPP_ENABLE_OPAQUE_POINTERS) add_definitions("-DENABLE_OPAQUE_POINTERS=1") endif(DPCPP_ENABLE_OPAQUE_POINTERS) -set(SPIRV_ENABLE_OPAQUE_POINTERS FALSE CACHE BOOL +set(SPIRV_ENABLE_OPAQUE_POINTERS TRUE CACHE BOOL "Enable opaque pointers for SPIR-V offload by default.") if(SPIRV_ENABLE_OPAQUE_POINTERS) add_definitions("-DSPIRV_ENABLE_OPAQUE_POINTERS=1") diff --git a/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp b/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp index 1e552a8aceeaa..85abf3303ec7c 100755 --- a/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp @@ -5,6 +5,9 @@ // software emulation. // UNSUPPORTED: accelerator +// FIXME: enable opaque pointers support on CPU. 
+// UNSUPPORTED: cpu + //==---------- bfloat16_conversions.cpp - SYCL bfloat16 type test ---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test-e2e/BFloat16/bfloat16_type.cpp b/sycl/test-e2e/BFloat16/bfloat16_type.cpp index 3d087d04ed898..db3a85fb670f7 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_type.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_type.cpp @@ -5,7 +5,9 @@ // TODO currently the feature isn't supported on FPGA. // UNSUPPORTED: accelerator -// + +// FIXME: enable opaque pointers support on CPU. +// UNSUPPORTED: cpu //==----------- bfloat16_type.cpp - SYCL bfloat16 type test ----------------==// // diff --git a/sycl/test-e2e/DeviceLib/string_test.cpp b/sycl/test-e2e/DeviceLib/string_test.cpp index 92377520fd4ce..be4e7ed38ca27 100644 --- a/sycl/test-e2e/DeviceLib/string_test.cpp +++ b/sycl/test-e2e/DeviceLib/string_test.cpp @@ -5,6 +5,9 @@ // RUN: %{build} -fno-builtin -fsycl-device-lib-jit-link -o %t.out // RUN: %if !gpu %{ %{run} %t.out %} +// FIXME: enable opaque pointers support on CPU. +// UNSUPPORTED: cpu + #include #include #include diff --git a/sycl/test-e2e/ESIMD/ext_math.cpp b/sycl/test-e2e/ESIMD/ext_math.cpp index 47a9e7b251532..d6aa4e5d19791 100644 --- a/sycl/test-e2e/ESIMD/ext_math.cpp +++ b/sycl/test-e2e/ESIMD/ext_math.cpp @@ -9,6 +9,9 @@ // RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out // RUN: %{run} %t.out +// FIXME: enable opaque pointers support +// REQUIRES: TEMPORARY_DISABLED + // This test checks extended math operations. 
Combinations of // - argument type - half, float // - math function - sin, cos, ..., div_ieee, pow diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp index e90f42d023616..b968b48af9497 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp @@ -2,6 +2,9 @@ // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: %{run} %t.out +// FIXME: enable opaque pointers support +// REQUIRES: TEMPORARY_DISABLED + // Test internalization of a nested array type. #include diff --git a/sycl/test-e2e/Reduction/reduction_usm.cpp b/sycl/test-e2e/Reduction/reduction_usm.cpp index 9a27956982117..eac92c670a7b1 100644 --- a/sycl/test-e2e/Reduction/reduction_usm.cpp +++ b/sycl/test-e2e/Reduction/reduction_usm.cpp @@ -7,6 +7,9 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows +// FIXME: enable opaque pointers support +// REQUIRES: TEMPORARY_DISABLED + // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with USM pointer. diff --git a/sycl/test-e2e/Regression/local-arg-align.cpp b/sycl/test-e2e/Regression/local-arg-align.cpp index d47dc375f6d6f..76c7ed1eef94f 100644 --- a/sycl/test-e2e/Regression/local-arg-align.cpp +++ b/sycl/test-e2e/Regression/local-arg-align.cpp @@ -2,6 +2,9 @@ // // RUN: %{run} %t.out +// FIXME: enable opaque pointers support +// REQUIRES: TEMPORARY_DISABLED + //==-- local-arg-align.cpp - Test for local argument alignmnent ------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
From e020f698e7436525654ff4cb860138f746db6df3 Mon Sep 17 00:00:00 2001 From: elizabethandrews Date: Tue, 8 Aug 2023 10:09:20 -0400 Subject: [PATCH 02/24] [SYCL] Ignore vec_type_hint attribute in SYCL 2020 (#10619) According to the SYCL 2020 spec, [[sycl::vec_type_hint()]] attribute should accept arguments of the type sycl::vec type. The attribute should also be accepted with non conforming lambda syntax. The current implementation in SYCL corresponds to the openCL version of this argument (with an additional spelling for SYCL), i.e. the attribute accepts extended vector type, floating point types and integral type. An error diagnostic is thrown for sycl:vec type. Since the attribute is deprecated and is not handled by any SYCL backend, and will be removed in a future version of the spec, to be minimally conformant with SYCL 2020 spec, this PR just ignores the attribute instead of adding support for sycl::vec type. Support was also added for non conforming lambda syntax --- clang/include/clang/Basic/Attr.td | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 3 +++ clang/lib/Sema/SemaDeclAttr.cpp | 7 +++-- clang/test/SemaSYCL/vec-type-hint-2.cpp | 26 +++++++++++++++++++ clang/test/SemaSYCL/vec-type-hint.cpp | 2 +- 5 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 clang/test/SemaSYCL/vec-type-hint-2.cpp diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 4c877707dbf7b..618530475e6b7 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4031,6 +4031,7 @@ def VecTypeHint : InheritableAttr { let Spellings = [GNU<"vec_type_hint">, CXX11<"sycl", "vec_type_hint">]; let Args = [TypeArgument<"TypeHint">]; let Subjects = SubjectList<[Function], ErrorDiag>; + let SupportsNonconformingLambdaSyntax = 1; let Documentation = [Undocumented]; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 3db8d2409c0b0..bf4d97dd5dba3 100644 
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11979,6 +11979,9 @@ def warn_ivdep_attribute_argument : Warning< def warn_attribute_spelling_deprecated : Warning< "attribute %0 is deprecated">, InGroup; +def warn_attribute_deprecated_ignored : Warning< + "attribute %0 is deprecated; attribute ignored">, + InGroup; def note_spelling_suggestion : Note< "did you mean to use %0 instead?">; def warn_attribute_requires_non_negative_integer_argument : diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index f512c5a9880cb..5c6fb8f6ff001 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -4550,8 +4550,11 @@ static void handleSYCLIntelLoopFuseAttr(Sema &S, Decl *D, const ParsedAttr &A) { static void handleVecTypeHint(Sema &S, Decl *D, const ParsedAttr &AL) { // This attribute is deprecated without replacement in SYCL 2020 mode. - if (S.LangOpts.getSYCLVersion() > LangOptions::SYCL_2017) - S.Diag(AL.getLoc(), diag::warn_attribute_spelling_deprecated) << AL; + // Ignore the attribute in SYCL 2020. + if (S.LangOpts.getSYCLVersion() > LangOptions::SYCL_2017) { + S.Diag(AL.getLoc(), diag::warn_attribute_deprecated_ignored) << AL; + return; + } // If the attribute is used with the [[sycl::vec_type_hint]] spelling in SYCL // 2017 mode, we want to warn about using the newer name in the older diff --git a/clang/test/SemaSYCL/vec-type-hint-2.cpp b/clang/test/SemaSYCL/vec-type-hint-2.cpp new file mode 100644 index 0000000000000..820ab2bc011ee --- /dev/null +++ b/clang/test/SemaSYCL/vec-type-hint-2.cpp @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -fsycl-is-device -sycl-std=2020 -internal-isystem %S/Inputs -fsyntax-only -verify %s + +// Test which verifies [[sycl::vec_type_hint()]] is accepted +// with non-conforming lambda syntax. + +// NOTE: This attribute is not supported in the SYCL backends. 
+// To be minimally conformant with SYCL2020, attribute is +// accepted by the Clang FE with a warning. No additional +// semantic handling or IR generation is done for this +// attribute. + +#include "sycl.hpp" + +struct test {}; + +using namespace sycl; +queue q; + +void bar() { + q.submit([&](handler &h) { + h.single_task( + // expected-warning@+1 {{attribute 'vec_type_hint' is deprecated; attribute ignored}} + []() [[sycl::vec_type_hint(test)]] {}); + }); +} + diff --git a/clang/test/SemaSYCL/vec-type-hint.cpp b/clang/test/SemaSYCL/vec-type-hint.cpp index d72d0e5ec91b4..e9fccb4ae2928 100644 --- a/clang/test/SemaSYCL/vec-type-hint.cpp +++ b/clang/test/SemaSYCL/vec-type-hint.cpp @@ -7,4 +7,4 @@ // __attribute__((vec_type_hint)) is deprecated without replacement in SYCL // 2020 mode, but is allowed in SYCL 2017 and OpenCL modes. -KERNEL __attribute__((vec_type_hint(int))) void foo() {} // sycl-2020-warning {{attribute 'vec_type_hint' is deprecated}} +KERNEL __attribute__((vec_type_hint(int))) void foo() {} // sycl-2020-warning {{attribute 'vec_type_hint' is deprecated; attribute ignored}} From 39506e44b9c9f36968a4551eb7d5ad483d544bbb Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Tue, 8 Aug 2023 07:40:23 -0700 Subject: [PATCH 03/24] [CI] Remove Nightly build configuration for opaque pointers (#10723) This mode is tested by the default configuration now. --- .github/workflows/sycl_nightly.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl_nightly.yml index cf4b1a8176111..6e6223679dba9 100644 --- a/.github/workflows/sycl_nightly.yml +++ b/.github/workflows/sycl_nightly.yml @@ -31,18 +31,6 @@ jobs: # prefer widespread gzip compression. 
artifact_archive_name: sycl_linux.tar.gz - ubuntu2204_opaque_pointers_build_test: - if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build_and_test.yml - needs: test_matrix - secrets: inherit - with: - build_cache_root: "/__w/" - build_cache_suffix: opaque_pointers - build_artifact_suffix: opaque_pointers - build_configure_extra_args: "--hip --cuda --enable-esimd-emulator --cmake-opt=-DSPIRV_ENABLE_OPAQUE_POINTERS=TRUE" - merge_ref: '' - windows_default: name: Windows if: github.repository == 'intel/llvm' From 319f06780dc113a25f9baa15c510633c91ec2da7 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 8 Aug 2023 07:51:06 -0700 Subject: [PATCH 04/24] [CI] Switch pre-commit to a new scheme (#10720) Use pull_request trigger (instead of pull_request_target) for everything except AWS CUDA E2E testing. The latter has to go to a separate workflow (workflow_run) in order to have access to the AWS EC key kept as a github secret. As part of the changes, I also stopped using matrix generator for the pre-commit task. Instead, the matrix is written directly inside the task's .yml file. The only minor difference in the behavior is that driver installation happens on an image with previous driver installed, not on a system without any driver. 
--- .github/workflows/sycl_exp_precommit.yml | 93 ---------------------- .github/workflows/sycl_precommit_aws.yml | 9 ++- .github/workflows/sycl_precommit_linux.yml | 88 ++++++++++---------- 3 files changed, 56 insertions(+), 134 deletions(-) delete mode 100644 .github/workflows/sycl_exp_precommit.yml diff --git a/.github/workflows/sycl_exp_precommit.yml b/.github/workflows/sycl_exp_precommit.yml deleted file mode 100644 index a0439088cca8a..0000000000000 --- a/.github/workflows/sycl_exp_precommit.yml +++ /dev/null @@ -1,93 +0,0 @@ -name: SYCL Experimental Pre-Commit - -on: - pull_request: - branches: - - sycl - paths: - - '.github/workflows/**' - -jobs: - detect_changes: - uses: ./.github/workflows/sycl_detect_changes.yml - - lint: - needs: [detect_changes] - if: | - github.event.pull_request.head.repo.full_name == 'intel/llvm' - runs-on: [Linux, build] - container: - image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:no-drivers - options: -u 1001:1001 - steps: - - uses: actions/checkout@v3 - with: - sparse-checkout: | - devops/actions/cached_checkout - devops/actions/clang-format - devops/actions/cleanup - - name: Register cleanup after job is finished - uses: ./devops/actions/cleanup - - name: 'PR commits + 2' - run: echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 2 ))" >> "${GITHUB_ENV}" - - uses: ./devops/actions/cached_checkout - with: - path: src - fetch-depth: ${{ env.PR_FETCH_DEPTH }} - ref: ${{ github.event.pull_request.head.sha }} - merge_ref: '' - cache_path: "/__w/repo_cache/" - - name: Run clang-format - uses: ./devops/actions/clang-format - with: - path: src - - build: - needs: [lint] - if: | - always() - && (success() || contains(github.event.pull_request.labels.*.name, 'ignore-lint')) - uses: ./.github/workflows/sycl_linux_build.yml - with: - build_ref: ${{ github.sha }} - merge_ref: '' - build_cache_root: "/__w/" - build_artifact_suffix: "default" - build_cache_suffix: "default" - changes: '[]' - - test: - needs: [build, 
detect_changes] - strategy: - fail-fast: false - matrix: - include: - - name: ESIMD Emu - runner: '["Linux", "x86-cpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest - image_options: -u 1001 - target_devices: ext_intel_esimd_emulator:gpu - - name: AMD/HIP - runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest - image_options: -u 1001 --device=/dev/dri --device=/dev/kfd - target_devices: ext_oneapi_hip:gpu - - name: Intel - runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest - image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN - target_devices: ext_oneapi_level_zero:gpu;opencl:gpu;opencl:cpu - reset_gpu: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} - uses: ./.github/workflows/sycl_linux_run_tests.yml - with: - name: ${{ matrix.name }} - runner: ${{ matrix. runner }} - image: ${{ matrix.image }} - image_options: ${{ matrix.image_options }} - target_devices: ${{ matrix.target_devices }} - ref: ${{ github.sha }} - merge_ref: '' - - sycl_toolchain_artifact: sycl_linux_default - sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} - sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} diff --git a/.github/workflows/sycl_precommit_aws.yml b/.github/workflows/sycl_precommit_aws.yml index 627c59c32dacc..f9b30a104e097 100644 --- a/.github/workflows/sycl_precommit_aws.yml +++ b/.github/workflows/sycl_precommit_aws.yml @@ -1,8 +1,15 @@ name: E2E on AWS CUDA +# We have to keep pre-commit AWS CUDA testing in a separate workflow because we +# need access to AWS secret and that isn't available on pull_request jobs for +# PRs from forks. And github's "require approval for all outside collaborators" +# is bypassed on pull_request_target. +# +# Also, we use commit status and not check suite/run (which, in theory, is more +# powerful) due to https://github.com/orgs/community/discussions/24616. 
on: workflow_run: - workflows: [SYCL Experimental Pre-Commit] + workflows: [SYCL Pre Commit on Linux] types: - completed diff --git a/.github/workflows/sycl_precommit_linux.yml b/.github/workflows/sycl_precommit_linux.yml index e1492b37e0400..7e2306f428d15 100644 --- a/.github/workflows/sycl_precommit_linux.yml +++ b/.github/workflows/sycl_precommit_linux.yml @@ -1,11 +1,13 @@ name: SYCL Pre Commit on Linux on: - pull_request_target: + # We rely on "Fork pull request workflows from outside collaborators" - + # "Require approval for all outside collaborators" at + # https://github.com/intel/llvm/settings/actions for security. + pull_request: branches: - sycl - sycl-devops-pr/** - - llvmspirv_pulldown # Do not run builds if changes are only in the following locations paths-ignore: - '.github/ISSUE_TEMPLATE/**' @@ -15,24 +17,12 @@ on: - 'clang/docs/**' - '**.md' - '**.rst' - # Changes in CI won't have any effect with pull_request_target - - '.github/workflows' - # For CI-related files we explicitly skip all the jobs below even if there - # were other (non-ignored) files modified in this PR. 
- - 'devops/*/**' - -permissions: - contents: read jobs: detect_changes: uses: ./.github/workflows/sycl_detect_changes.yml lint: - needs: [detect_changes] - if: | - github.event.pull_request.head.repo.full_name == 'intel/llvm' || - !contains(needs.detect_changes.outputs.filters, 'ci') runs-on: [Linux, build] container: image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:no-drivers @@ -40,7 +30,6 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.base_ref }} sparse-checkout: | devops/actions/cached_checkout devops/actions/clang-format @@ -61,36 +50,55 @@ jobs: with: path: src - # This job generates matrix of tests for SYCL End-to-End tests - test_matrix: - needs: [detect_changes] - if: | - github.event.pull_request.head.repo.full_name == 'intel/llvm' || - !contains(needs.detect_changes.outputs.filters, 'ci') - name: Generate Test Matrix - uses: ./.github/workflows/sycl_gen_test_matrix.yml - with: - ref: ${{ github.event.pull_request.head.sha }} - lts_config: "hip_amdgpu;lin_intel;esimd_emu;cuda_aws" - - linux_default: - name: Linux - # Only build and test patches, that have passed all linter checks, because - # the next commit is likely to be a follow-up on that job. 
- needs: [lint, test_matrix, detect_changes] + build: + needs: [lint, detect_changes] if: | always() && (success() || contains(github.event.pull_request.labels.*.name, 'ignore-lint')) - && (github.event.pull_request.head.repo.full_name == 'intel/llvm' - || !contains(needs.detect_changes.outputs.filters, 'ci')) - uses: ./.github/workflows/sycl_linux_build_and_test.yml - secrets: inherit + uses: ./.github/workflows/sycl_linux_build.yml with: - build_ref: ${{ github.event.pull_request.head.sha }} - merge_ref: ${{ github.event.pull_request.base.sha }} + build_ref: ${{ github.sha }} + merge_ref: '' build_cache_root: "/__w/" build_artifact_suffix: "default" build_cache_suffix: "default" - lts_matrix: ${{ needs.test_matrix.outputs.lts_lx_matrix }} - lts_aws_matrix: ${{ needs.test_matrix.outputs.lts_aws_matrix }} changes: ${{ needs.detect_changes.outputs.filters }} + + test: + needs: [build, detect_changes] + strategy: + fail-fast: false + matrix: + include: + - name: ESIMD Emu + runner: '["Linux", "x86-cpu"]' + image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image_options: -u 1001 + target_devices: ext_intel_esimd_emulator:gpu + - name: AMD/HIP + runner: '["Linux", "amdgpu"]' + image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image_options: -u 1001 --device=/dev/dri --device=/dev/kfd + target_devices: ext_oneapi_hip:gpu + - name: Intel + runner: '["Linux", "gen12"]' + image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + target_devices: ext_oneapi_level_zero:gpu;opencl:gpu;opencl:cpu + reset_gpu: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} + uses: ./.github/workflows/sycl_linux_run_tests.yml + with: + name: ${{ matrix.name }} + runner: ${{ matrix. 
runner }} + image: ${{ matrix.image }} + image_options: ${{ matrix.image_options }} + target_devices: ${{ matrix.target_devices }} + reset_gpu: ${{ matrix.reset_gpu }} + + ref: ${{ github.sha }} + merge_ref: '' + + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} + From 5eef8c76ec9004100a917d6091171da1aa8c6e40 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Tue, 8 Aug 2023 11:00:03 -0400 Subject: [PATCH 05/24] [SYCL][InvokeSimd] Allow callables to return uniform (#10714) The spec states that returning a `uniform` object is allowed: "Return values of type sycl::ext::oneapi::experimental::uniform are not anyhow converted, and broadcast to each work-item; every work-item in the sub-group receives the same value. NOTE: sycl::ext::oneapi::experimental::uniform return type is the way to return a uniform value of simd or simd_mask type." Update the compile-time error checking and ESIMD verifier to allow this. 
Signed-off-by: Sarnie, Nick --- llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp | 1 + .../ext/oneapi/experimental/invoke_simd.hpp | 3 +- .../InvokeSimd/Spec/uniform_retval.cpp | 30 +++++++++++++++++-- sycl/test/invoke_simd/return-type-uniform.cpp | 29 ++++++++++++++++++ 4 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 sycl/test/invoke_simd/return-type-uniform.cpp diff --git a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp index e884ee8f2c57b..4d0e84091f107 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp @@ -63,6 +63,7 @@ static const char *LegalSYCLFunctions[] = { "^sycl::_V1::ext::oneapi::sub_group::.+", "^sycl::_V1::ext::oneapi::experimental::spec_constant<.+>::.+", "^sycl::_V1::ext::oneapi::experimental::this_sub_group", + "^sycl::_V1::ext::oneapi::experimental::uniform<.+>::.+", "^sycl::_V1::ext::oneapi::bfloat16::.+", "^sycl::_V1::ext::oneapi::experimental::if_architecture_is"}; diff --git a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp index 40c0efff94021..24d06806b85d5 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp @@ -371,7 +371,8 @@ constexpr bool has_struct_arg(Ret (*)(Args...)) { template constexpr bool has_struct_ret(Ret (*)(Args...)) { - return std::is_class_v && !is_simd_or_mask_type::value; + return std::is_class_v && !is_simd_or_mask_type::value && + !is_uniform_type::value; } template diff --git a/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp b/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp index 3eea0623c9319..d92cb57f783cc 100644 --- a/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp +++ b/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp @@ -7,6 +7,12 @@ // // VISALTO enable run // RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} 
%t.out +// +// RUN: %{build} -DUNIFORM_RET_TYPE -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t2.out +// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t2.out +// +// VISALTO enable run +// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t2.out /* * Test case #1 @@ -98,17 +104,35 @@ template * returning the scalar as a SIMD type seems to work fine. */ template -__attribute__((always_inline)) T +__attribute__((always_inline)) +#ifdef UNIFORM_RET_TYPE +uniform +#else +T +#endif ESIMD_CALLEE_return_uniform_scalar(esimd::simd x, T n) SYCL_ESIMD_FUNCTION { +#ifdef UNIFORM_RET_TYPE + return uniform{n}; +#else return n; +#endif } template [[intel::device_indirectly_callable]] SYCL_EXTERNAL - T __regcall SIMD_CALLEE_return_uniform_scalar(simd x, - T n) SYCL_ESIMD_FUNCTION { +#ifdef UNIFORM_RET_TYPE + uniform +#else + T +#endif + __regcall SIMD_CALLEE_return_uniform_scalar(simd x, + T n) SYCL_ESIMD_FUNCTION { +#ifdef UNIFORM_RET_TYPE + uniform r = ESIMD_CALLEE_return_uniform_scalar(x, n); +#else T r = ESIMD_CALLEE_return_uniform_scalar(x, n); +#endif return r; } diff --git a/sycl/test/invoke_simd/return-type-uniform.cpp b/sycl/test/invoke_simd/return-type-uniform.cpp new file mode 100644 index 0000000000000..13be728f32d25 --- /dev/null +++ b/sycl/test/invoke_simd/return-type-uniform.cpp @@ -0,0 +1,29 @@ +// RUN: %clangxx -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %s -o /dev/null +#include +#include +#include + +using namespace sycl::ext::oneapi::experimental; +using namespace sycl; +namespace esimd = sycl::ext::intel::esimd; + +[[intel::device_indirectly_callable]] uniform +callee(simd) SYCL_ESIMD_FUNCTION { + return uniform(5); +} + +void foo() { + constexpr unsigned Size = 1024; + constexpr unsigned GroupSize = 64; + sycl::range<1> GlobalRange{Size}; + sycl::range<1> LocalRange{GroupSize}; + sycl::nd_range<1> Range(GlobalRange, LocalRange); + queue q; + auto e = 
q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](nd_item<1> ndi) { + uniform x = invoke_simd(ndi.get_sub_group(), callee, 0); + }); + }); +} + +int main() { foo(); } From bd81fc4d38a7cb505fe41f358518d6a4679a669b Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 8 Aug 2023 08:33:34 -0700 Subject: [PATCH 06/24] [SYCL] Use pair of native::sin/cos for sincos under __FAST_MATH__ (#10481) --- sycl/include/sycl/builtins.hpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index 6751ef20c902d..b2fcd558328e7 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -734,8 +734,8 @@ std::enable_if_t<__FAST_MATH_GENFLOAT(T), T> sin(T x) __NOEXC { // svgenfloat sincos (svgenfloat x, genfloatptr cosval) template -std::enable_if_t< - detail::is_svgenfloat::value && detail::is_genfloatptr::value, T> +std::enable_if_t<__FAST_MATH_GENFLOAT(T) && detail::is_genfloatptr::value, + T> sincos(T x, T2 cosval) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_sincos(x, cosval); @@ -2500,6 +2500,23 @@ std::enable_if_t::value, T> cos(T x) __NOEXC { return native::cos(x); } +// svgenfloat sincos (svgenfloat x, genfloatptr cosval) +// This is a performance optimization to ensure that sincos isn't slower than a +// pair of sin/cos executed separately. Theoretically, calling non-native sincos +// might be faster than calling native::sin plus native::cos separately and we'd +// need some kind of cost model to make the right decision (and move this +// entirely to the JIT/AOT compilers). However, in practice, this simpler +// solution seems to work just fine and matches how sin/cos above are optimized +// for the fast math path. 
+template +std::enable_if_t< + detail::is_svgenfloatf::value && detail::is_genfloatptr::value, T> +sincos(T x, T2 cosval) __NOEXC { + detail::check_vector_size(); + *cosval = native::cos(x); + return native::sin(x); +} + // svgenfloatf exp (svgenfloatf x) template std::enable_if_t::value, T> exp(T x) __NOEXC { From d075bd0a0fc693d856668bf6ff53846b98d23c4a Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 8 Aug 2023 15:18:55 -0700 Subject: [PATCH 07/24] [SYCL] Fix check-all after #10635 (#10744) Single triple targets are only meant for manual run and should not be included into check-all, only check-sycl-combined-triples should be. Otherwise we are running the same tests from multiple processes resulting in race conditions (beside unnecessary work). --- sycl/test/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt index 01908bcd9c003..843790c9ba289 100644 --- a/sycl/test/CMakeLists.txt +++ b/sycl/test/CMakeLists.txt @@ -83,6 +83,7 @@ add_lit_testsuite(check-sycl-spirv "Running device-agnostic SYCL regression test PARAMS "SYCL_TRIPLE=spir64-unknown-unknown" DEPENDS ${SYCL_TEST_DEPS} ${SYCL_TEST_EXCLUDE} + EXCLUDE_FROM_CHECK_ALL ) add_lit_testsuite(check-sycl-dumps "Running ABI dump tests only" @@ -100,6 +101,7 @@ if(SYCL_BUILD_PI_CUDA) PARAMS "SYCL_TRIPLE=nvptx64-nvidia-cuda" DEPENDS ${SYCL_TEST_DEPS} ${SYCL_TEST_EXCLUDE} + EXCLUDE_FROM_CHECK_ALL ) add_custom_target(check-sycl-cuda) @@ -115,6 +117,7 @@ if(SYCL_BUILD_PI_HIP) PARAMS "SYCL_TRIPLE=nvptx64-nvidia-cuda" DEPENDS ${SYCL_TEST_DEPS} ${SYCL_TEST_EXCLUDE} + EXCLUDE_FROM_CHECK_ALL ) add_dependencies(check-sycl-hip check-sycl-hip-ptx) @@ -125,6 +128,7 @@ if(SYCL_BUILD_PI_HIP) PARAMS "SYCL_TRIPLE=amdgcn-amd-amdhsa" DEPENDS ${SYCL_TEST_DEPS} ${SYCL_TEST_EXCLUDE} + EXCLUDE_FROM_CHECK_ALL ) add_dependencies(check-sycl-hip check-sycl-hip-gcn) From 4b44182081fe48194f7f10be7ada5da0c35f3fe0 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: 
Tue, 8 Aug 2023 15:39:24 -0700 Subject: [PATCH 08/24] [CI] Add run-name for sycl_precommit_aws.yml --- .github/workflows/sycl_precommit_aws.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sycl_precommit_aws.yml b/.github/workflows/sycl_precommit_aws.yml index f9b30a104e097..d625f9298d3fa 100644 --- a/.github/workflows/sycl_precommit_aws.yml +++ b/.github/workflows/sycl_precommit_aws.yml @@ -1,4 +1,5 @@ name: E2E on AWS CUDA +run-name: E2E on AWS CUDA - ${{ github.event.workflow_run.display_title }} # We have to keep pre-commit AWS CUDA testing in a separate workflow because we # need access to AWS secret and that isn't available on pull_request jobs for # PRs from forks. And github's "require approval for all outside collaborators" From a2265a6bafb7a78f075e8904f3507b9949742f78 Mon Sep 17 00:00:00 2001 From: Isaac Ault Date: Wed, 9 Aug 2023 10:43:51 +0100 Subject: [PATCH 09/24] [SYCL][Bindless] Fix Mipmap Tests (#10713) # Fix Mipmap Tests Fixing the computation of expected output values so that the tests also work with input sizes that are not powers of 2. 
--------- Co-authored-by: Dmitry Vodopyanov --- .../bindless_images/mipmap/mipmap_read_1D.cpp | 17 ++++++++++------- .../bindless_images/mipmap/mipmap_read_2D.cpp | 14 +++++++------- .../bindless_images/mipmap/mipmap_read_3D.cpp | 6 +++--- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp index 41ff725849ad7..6a1ebb3790ab1 100644 --- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp +++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp @@ -19,22 +19,25 @@ int main() { auto ctxt = q.get_context(); // declare image data - constexpr size_t N = 16; + constexpr size_t N = 15; std::vector out(N); std::vector expected(N); std::vector dataIn1(N); std::vector dataIn2(N / 2); std::vector copyOut(N / 2); - int j = 0; + for (int i = 0; i < N; i++) { - expected[i] = i + (j + 10); - if (i % 2) - j++; + // Populate input data (to-be mipmap image layers) dataIn1[i] = sycl::float4(i, i, i, i); if (i < (N / 2)) { dataIn2[i] = sycl::float4(i + 10, i + 10, i + 10, i + 10); copyOut[i] = sycl::float4{0, 0, 0, 0}; } + + // Calculate expected output data + float norm_coord = ((i + 0.5f) / (float)N); + int x = norm_coord * (N >> 1); + expected[i] = dataIn1[i][0] + dataIn2[x][0]; } try { @@ -85,8 +88,8 @@ int main() { // Extension: read mipmap level 0 with anisotropic filtering and level 1 // with LOD sycl::float4 px1 = - sycl::ext::oneapi::experimental::read_image( - mipHandle, x, 0.0f, 0.0f); + sycl::ext::oneapi::experimental::read_image(mipHandle, + x, 0.0f); sycl::float4 px2 = sycl::ext::oneapi::experimental::read_image(mipHandle, x, 1.0f); diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp index 9120dd694e58b..079847f0d1ab9 100644 --- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp +++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp @@ -40,13 +40,13 
@@ int main() { } // Expected each x and y will repeat twice // since mipmap level 1 is half in size - int jj = 0; - for (int i = 0; i < width - 1; i += 2) { - for (int j = 0; j < height - 1; j += 2, jj++) { - expected[j + (width * i)] = jj; - expected[j + (width * (i + 1))] = jj; - expected[(j + 1) + (width * i)] = jj; - expected[(j + 1) + (width * (i + 1))] = jj; + for (int i = 0; i < width; i++) { + for (int j = 0; j < height; j++) { + float norm_coord_x = ((i + 0.5f) / (float)width); + int x = norm_coord_x * (width >> 1); + float norm_coord_y = ((j + 0.5f) / (float)height); + int y = norm_coord_y * (height >> 1); + expected[j + (width * i)] = dataIn2[y + (width / 2 * x)][0]; } } diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp index bacd6e081e9cf..c858ac57f819b 100644 --- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp +++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp @@ -19,9 +19,9 @@ int main() { auto ctxt = q.get_context(); // declare image data - size_t width = 4; - size_t height = 4; - size_t depth = 4; + size_t width = 5; + size_t height = 5; + size_t depth = 5; size_t N = width * height * depth; std::vector out(N); std::vector expected(N); From 180a92ad707bd35df9e98c1474dc52a1e9b3dead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Wed, 9 Aug 2023 10:47:18 +0100 Subject: [PATCH 10/24] [SYCL][COMPAT][Doc] Add SYCLcompat documentation (#9646) This pull request introduces a new stand alone library, `SYCLcompat`: a simplified wrapper on top of SYCL, aiming to make it more accessible to developers familiar with other heterogeneous programming models. SYCLcompat has two primary goals: - Improve the adoption of SYCL. This library is designed to provide a familiar programming interface that resembles other popular heterogeneous programming models. 
By reducing the learning curve, it enables developers to leverage SYCL's power and features more easily. - Source-to-Source Translation Support. SYCLcompat is also designed to facilitate automatic source-to-source translation from other heterogeneous programming models to SYCL and offer a more standardized and consistent programming interface. This feature can significantly streamline the migration and integration of existing codebases into the SYCL ecosystem. The first commit of this PR includes the proposed library README, providing explanation of its motivation, public interface, usage guidelines, and code examples. A set of PRs will follow, including subsets of the current implementation including their tests. We are open to any suggestions, concerns, or improvements you may have, so please, let us know if you have any. Edit: Updated from extension to stand alone library. https://github.com/intel/llvm/pull/9976 `dims.hpp` and `defs.hpp` headers --------- Co-authored-by: Gordon Brown Co-authored-by: Joe Todd Co-authored-by: Pietro Ghiglio Co-authored-by: Ruyman Reyes Co-authored-by: Steffen Larsen Co-authored-by: aelovikov-intel Co-authored-by: Sami Hatna Co-authored-by: Joe Todd Co-authored-by: Alexey Bader --- sycl/doc/index.rst | 1 + sycl/doc/syclcompat/README.md | 1285 +++++++++++++++++++++++++++++++++ 2 files changed, 1286 insertions(+) create mode 100644 sycl/doc/syclcompat/README.md diff --git a/sycl/doc/index.rst b/sycl/doc/index.rst index 8f6e0854df8f5..579e6e81d44cf 100644 --- a/sycl/doc/index.rst +++ b/sycl/doc/index.rst @@ -13,6 +13,7 @@ Using oneAPI DPC++ for Application Development PreprocessorMacros cuda/contents Extensions + syclcompat/README.md FAQ User API Reference EnvironmentVariables diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md new file mode 100644 index 0000000000000..5b000cb52dbc3 --- /dev/null +++ b/sycl/doc/syclcompat/README.md @@ -0,0 +1,1285 @@ +# SYCLcompat + +SYCLcompat is a header-only library that 
intends to help developers familiar +with other heterogeneous programming models (such as OpenMP, CUDA or HIP) to +familiarize themselves with the SYCL programming API while porting their +existing codes. Compatibility tools can also benefit from the reduced API size +when converting legacy codebases. + +SYCLcompat provides: + +* A high-level API that provides closer semantics to other programming models, +simplifying line by line conversions. +* Alternative submission APIs that encapsulate SYCL-specific "queue" and +"event" APIs for easier reference. +* Ability to gradually introduce other SYCL concepts as the user familiarises +themselves with the core SYCL API. +* Clear distinction between core SYCL API and the compatibility interface via +separate namespaces. + +## Notice + +Copyright © 2023-2023 Codeplay Software Limited. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of +The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + +## Support + +SYCLcompat depends on specific oneAPI DPC++ compiler extensions that may not be +available to all the SYCL 2020 specification implementations. + +Specifically, this library depends on the following SYCL extensions: + +* [sycl_ext_oneapi_local_memory]( + ../extensions/supported/sycl_ext_oneapi_local_memory.asciidoc) +* [sycl_ext_oneapi_complex]( + ../extensions/experimental/sycl_ext_oneapi_complex.asciidoc) +* [sycl_ext_oneapi_free_function_queries]( + ../extensions/experimental/sycl_ext_oneapi_free_function_queries.asciidoc) +* [sycl_ext_oneapi_assert]( + ../extensions/supported/sycl_ext_oneapi_assert.asciidoc) +* [sycl_ext_oneapi_enqueue_barrier]( + ../extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc) + +## Usage + +All functionality is available under the `syclcompat::` namespace, imported +through the main header, `syclcompat.hpp`. Note that `syclcompat.hpp` does not +import the header. 
+ +``` cpp +#include +``` + +This document presents the public API under the [Features](#features) section, +and provides a working [Sample code](#sample-code) using this library. Refer to +those to learn to use the library. + +## Features + +### dim3 + +SYCLcompat provides a `dim3` class akin to that of CUDA or HIP programming +models. `dim3` encapsulates other languages iteration spaces that are +represented with coordinate letters (x, y, z). + +```cpp +namespace syclcompat { + +class dim3 { +public: + const size_t x, y, z; + constexpr dim3(const sycl::range<3> &r); + constexpr dim3(const sycl::range<2> &r); + constexpr dim3(const sycl::range<1> &r); + constexpr dim3(size_t x, size_t y = 1, size_t z = 1); + + constexpr size_t size(); + + operator sycl::range<3>(); + operator sycl::range<2>(); + operator sycl::range<1>(); +}; + +// Element-wise operators +dim3 operator*(const dim3 &a, const dim3 &b); +dim3 operator+(const dim3 &a, const dim3 &b); +dim3 operator-(const dim3 &a, const dim3 &b); + +} // syclcompat +``` + +In SYCL, the fastest-moving dimension is the one with the highest index, e.g. in +a SYCL 2D range iteration space, there are two dimensions, 0 and 1, and 1 will +be the one that "moves faster". The compatibility headers for SYCL offer a +number of convenience functions that help the mapping between xyz-based +coordinates to SYCL iteration spaces in the different scopes available. 
In +addition to the global range, the following helper functions are also provided: + +``` c++ +namespace syclcompat { + +namespace local_id { +size_t x(); +size_t y(); +size_t z(); +} // namespace local_id + +namespace local_range { +size_t x(); +size_t y(); +size_t z(); +} // namespace local_range + +namespace work_group_id { +size_t x(); +size_t y(); +size_t z(); +} // namespace work_group_id + +namespace work_group_range { +size_t x(); +size_t y(); +size_t z(); +} // namespace work_group_range + +namespace global_range { +size_t x(); +size_t y(); +size_t z(); +} // namespace global_range + +namespace global_id { +size_t x(); +size_t y(); +size_t z(); +} // namespace global_id + +} // syclcompat +``` + +These translate any kernel dimensions from one convention to the other. An +example of an equivalent SYCL call for a 3D kernel using `compat` is +`syclcompat::global_id::x() == get_global_id(2)`. + +### Local Memory + +When using `compat` functions, there are two distinct interfaces to allocate +device local memory. The first interface uses the _sycl_ext_oneapi_local_memory_ +extension to leverage local memory defined at compile time. +_sycl_ext_oneapi_local_memory_ is accessed through the following wrapper: + +``` c++ +namespace syclcompat { + +template auto *local_mem(); + +} // syclcompat +``` + +`syclcompat::local_mem()` can be used as illustrated in the example +below. + +```c++ +// Sample kernel +using namespace syclcompat; +template +void local_mem_2d(int *d_A) { + // Local memory extension wrapper, size defined at compile-time + auto As = local_mem(); + int id_x = local_id::x(); + int id_y = local_id::y(); + As[id_y][id_x] = id_x * BLOCK_SIZE + id_y; + wg_barrier(); + int val = As[BLOCK_SIZE - id_y - 1][BLOCK_SIZE - id_x - 1]; + d_A[global_id::y() * BLOCK_SIZE + global_id::x()] = val; +} +``` + +The second interface allows users to allocate device local memory at runtime. 
+SYCLcompat provides this functionality through its kernel launch interface, +`launch`, defined in the following section. + +### launch + +SYCLcompat provides a kernel `launch` interface which accepts a function that +executes on the device (a.k.a "kernel") instead of a lambda/functor. It can be +called either by using a pair of "teams"/"blocks" and "threads", from +OpenMP/CUDA terminology, or using a `sycl::nd_range`. The interface accepts a +device _function_ with the use of an `auto F` template parameter, and a variadic +`Args` for the function's arguments. + +Various overloads for `launch` exist to permit the user to launch on a +specific `queue`, or to define dynamically sized device local memory. + +``` c++ +namespace syclcompat { + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, + sycl::queue q, Args... args); + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, + sycl::queue q, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, size_t mem_size, + sycl::queue q, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, size_t mem_size, + Args... args); + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, + size_t mem_size, sycl::queue q, Args... args); + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, + size_t mem_size, Args... 
args); + +} // syclcompat +``` + +For example, if the user had an existing function named `vectorAdd` to execute +on a device such as follows: + +``` c++ +void vectorAdd(const float *A, const float *B, float *C, int n); +``` + +using SYCLcompat, the user can call it as follows: + +``` c++ +syclcompat::launch(blocksPerGrid, threadsPerBlock, d_A, d_B, d_C, n); +``` + +which would be equivalent to the following call using a `sycl::nd_range`: + +``` c++ +auto range = sycl::nd_range<3>{blocksPerGrid * threadsPerBlock, + threadsPerBlock}; +syclcompat::launch(range, d_A, d_B, d_C, n); +``` + +For dynamic local memory allocation, `launch` injects a pointer to a +local `char *` accessor of `mem_size` as the last argument of the kernel +function. For example, the previous function named `vectorAdd` can be modified +with the following signature, which adds a `char *` pointer to access local +memory inside the kernel: + +``` c++ +void vectorAdd(const float *A, const float *B, float *C, int n, + char *local_mem); +``` + +Then, `vectorAdd` can be launched like this: + +``` c++ +syclcompat::launch(blocksPerGrid, threadsPerBlock, mem_size, d_A, + d_B, d_C, n); +``` + +or this: + +``` c++ +auto range = sycl::nd_range<3>{globalSize, localSize}; +syclcompat::launch(range, mem_size, d_A, d_B, d_C, n); +``` + +This `launch` interface allows users to define an internal memory pool, or +scratchpad, that can then be reinterpreted as the datatype required by the user +within the kernel function. + +### Utilities + +SYCLcompat introduces a set of utility functions designed to streamline the +usage of the library and its `launch` mechanism. + +The first utility function is `syclcompat::wg_barrier()`, which provides a +concise work-group barrier. `syclcompat::wg_barrier()` uses the +_SYCL_INTEL_free_function_queries_ extension to provide this functionality. 
+ +The second utility function, `syclcompat::compute_nd_range`, ensures that the +provided global size and work group sizes are appropriate for a given +dimensionality, and that global size is rounded up to a multiple of the work +group size in each dimension. + +```c++ +namespace syclcompat { + +void wg_barrier(); + +template +sycl::nd_range compute_nd_range(sycl::range global_size_in, + sycl::range work_group_size); +sycl::nd_range<1> compute_nd_range(int global_size_in, int work_group_size); + +} // syclcompat +``` + +### Queues + +The design for this library assumes _in-order_ queues +(`sycl::property::queue::in_order()`). + +Many of the APIs accept an optional `queue` parameter, and this can be an +out-of-order queue, either created manually or retrieved via a call to +`syclcompat::create_queue()`, specifying `false` for the `in_order` parameter. + +```c++ +namespace syclcompat { + +sycl::queue create_queue(bool print_on_async_exceptions = false, + bool in_order = true); + +} // syclcompat +``` + +However, SYCLcompat does not implement any mechanisms to deal with this case. +The rationale for this is that a user wanting the full power of SYCL's +dependency management shouldn't be using this library. As such, support for +out-of-order queues is very limited. The only way to safely use an out-of-order +queue at present is to explicitly `q.wait()` or `e.wait()` where `e` is the +`sycl::event` returned through a `syclcompat::async` API. + +To facilitate machine translation from other heterogeneous programming models to +SYCL, SYCLcompat provides the following pointer aliases for `sycl::event` and +`sycl::queue`, and the function `destroy_event` which destroys an `event_ptr` +allocated on the heap. 
+ +``` c++ +namespace syclcompat { + +using event_ptr = sycl::event *; + +using queue_ptr = sycl::queue *; + +static void destroy_event(event_ptr event); + +} // syclcompat +``` + +### Memory Allocation + +This library provides interfaces to allocate memory to be accessed within kernel +functions and on the host. The `syclcompat::malloc` function allocates device +USM memory, the `syclcompat::malloc_host` function allocates host USM memory, +and the `syclcompat::malloc_shared` function allocates shared USM memory. + +In each case we provide a template and non-templated interface for allocating +memory, taking the number of elements or number of bytes respectively. + +The interface includes both synchronous and asynchronous `malloc`, `memcpy`, +`memset`, `fill`, and `free` operations. + +There is a helper class `pointer_attributes` to query allocation type for memory +pointers using SYCLcompat, through `sycl::usm::alloc` and +`sycl::get_pointer_device`. + +``` c++ +namespace syclcompat { + +// Expects number of elements +template +T *malloc(size_t count, sycl::queue q = get_default_queue()); +template +T *malloc_host(size_t count, sycl::queue q = get_default_queue()); +template +T *malloc_shared(size_t count, sycl::queue q = get_default_queue()); + +// Expects size of the memory in bytes +void *malloc(size_t num_bytes, sycl::queue q = get_default_queue()); +void *malloc_host(size_t num_bytes, sycl::queue q = get_default_queue()); +void *malloc_shared(size_t num_bytes, sycl::queue q = get_default_queue()); + +// 2D, 3D memory allocation wrappers +void *malloc(size_t &pitch, size_t x, size_t y, + sycl::queue q = get_default_queue()) +pitched_data malloc(sycl::range<3> size, sycl::queue q = get_default_queue()); + +// Blocking memcpy +void memcpy(void *to_ptr, const void *from_ptr, size_t size, + sycl::queue q = get_default_queue()); +void memcpy(T *to_ptr, const T *from_ptr, size_t count, + sycl::queue q = get_default_queue()); +void memcpy(void *to_ptr, size_t 
to_pitch, const void *from_ptr, + size_t from_pitch, size_t x, size_t y, + sycl::queue q = get_default_queue()); // 2D matrix +void memcpy(pitched_data to, sycl::id<3> to_pos, + pitched_data from, sycl::id<3> from_pos, + sycl::range<3> size, + sycl::queue q = get_default_queue()); // 3D matrix + +// Non-blocking memcpy +sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size, + sycl::queue q = get_default_queue()); +template +sycl::event memcpy_async(T *to_ptr, const T *from_ptr, size_t count, + sycl::queue q = get_default_queue()); +sycl::event memcpy_async(void *to_ptr, size_t to_pitch, + const void *from_ptr, size_t from_pitch, + size_t x, size_t y, + sycl::queue q = get_default_queue()); // 2D matrix +sycl::event memcpy_async(pitched_data to, sycl::id<3> to_pos, + pitched_data from, sycl::id<3> from_pos, + sycl::range<3> size, + sycl::queue q = get_default_queue()); // 3D matrix + +// Fill +template +void fill(void *dev_ptr, const T &pattern, size_t count, + sycl::queue q = get_default_queue()); +template +sycl::event fill_async(void *dev_ptr, const T &pattern, + size_t count, sycl::queue q = get_default_queue()); + +// Memset +void memset(void *dev_ptr, int value, size_t size, + sycl::queue q = get_default_queue()); +void memset(void *ptr, size_t pitch, int val, size_t x, size_t y, + sycl::queue q = get_default_queue()); // 2D matrix +void memset(pitched_data pitch, int val, sycl::range<3> size, + sycl::queue q = get_default_queue()); // 3D matrix +sycl::event memset_async(void *dev_ptr, int value, size_t size, + sycl::queue q = get_default_queue()); +sycl::event memset_async(void *ptr, size_t pitch, int val, + size_t x, size_t y, + sycl::queue q = get_default_queue()); // 2D matrix +sycl::event memset_async(pitched_data pitch, int val, + sycl::range<3> size, + sycl::queue q = get_default_queue()); // 3D matrix + +void free(void *ptr, sycl::queue q = get_default_queue()); +sycl::event free_async(const std::vector &pointers, + const std::vector 
&events, + sycl::queue q = get_default_queue()); + +// Queries pointer allocation type +class pointer_attributes { +public: + void init(const void *ptr, sycl::queue q = get_default_queue()); + sycl::usm::alloc get_memory_type(); + const void *get_device_pointer(); + const void *get_host_pointer(); + bool is_memory_shared(); + unsigned int get_device_id(); +}; + +} // syclcompat +``` + +Finally, the class `pitched_data`, which manages memory allocation for 3D +spaces, padded to avoid uncoalesced memory accesses. + +```c++ +namespace syclcompat { + +class pitched_data { +public: + pitched_data(); + pitched_data(void *data, size_t pitch, size_t x, size_t y); + + void *get_data_ptr(); + size_t get_pitch(); + size_t get_x(); + size_t get_y(); + + void set_data_ptr(void *data); + void set_pitch(size_t pitch); + void set_x(size_t x); + void set_y(size_t y); +}; + +} // syclcompat +``` + +There are various helper classes and aliases defined within SYCLcompat to +encapsulate and define memory operations and objects. These classes and aliases +are primarily designed to assist with machine translation from other +heterogeneous programming models. + +The wrapper class `device_memory` provides a unified representation for device +memory in various regions. The class provides methods to allocate memory for the +object (`init()`) and access the underlying memory in various ways (`get_ptr()`, +`get_access()`, `operator[]`). Aliases for global and USM shared specializations +are provided. + +The `memory_traits` class is provided as a traits helper for `device_memory`. +The `accessor` class template provides a 2D or 3D `sycl::accessor`-like wrapper +around raw pointers. 
+ +```c++ +namespace syclcompat { + +enum class memory_region { + global = 0, // device global memory + constant, // device read-only memory + local, // device local memory + usm_shared, // memory which can be accessed by host and device +}; + +using byte_t = uint8_t; + +enum class target { device, local }; + +template class memory_traits { +public: + static constexpr sycl::access::address_space asp = + (Memory == memory_region::local) + ? sycl::access::address_space::local_space + : sycl::access::address_space::global_space; + static constexpr target target = + (Memory == memory_region::local) + ? target::local + : target::device; + static constexpr sycl::access_mode mode = + (Memory == memory_region::constant) + ? sycl::access_mode::read + : sycl::access_mode::read_write; + static constexpr size_t type_size = sizeof(T); + using element_t = + typename std::conditional_t; + using value_t = typename std::remove_cv_t; + template + using accessor_t = typename std::conditional_t< + target == target::local, + sycl::local_accessor, + sycl::accessor>; + using pointer_t = T *; +}; + +template class device_memory { +public: + using accessor_t = + typename memory_traits::template accessor_t; + using value_t = typename memory_traits::value_t; + using syclcompat_accessor_t = + syclcompat::accessor; + + device_memory(); + + device_memory(const sycl::range &in_range, + std::initializer_list &&init_list); + + template + device_memory( + const typename std::enable_if>::type &in_range, + std::initializer_list> &&init_list); + + device_memory(const sycl::range &range_in); + + // Variadic constructor taking 1, 2 or 3 integers to be interpreted as a + // sycl::range. + template + device_memory(Args... Arguments); + + ~device_memory(); + + // Allocate memory with default queue, and init memory if has initial value. + void init(); + // Allocate memory with specified queue, and init memory if has initial + // value. 
+ void init(sycl::queue q); + + // The variable is assigned to a device pointer. + void assign(value_t *src, size_t size); + + // Get memory pointer of the memory object, which is virtual pointer when + // usm is not used, and device pointer when usm is used. + value_t *get_ptr(); + // Get memory pointer of the memory object, which is virtual pointer when + // usm is not used, and device pointer when usm is used. + value_t *get_ptr(sycl::queue q); + + // Get the device memory object size in bytes. + size_t get_size(); + + template + typename std::enable_if::type &operator[](size_t index); + + // Get accessor with dimension info for the device memory object + // when usm is used and dimension is greater than 1. + template + typename std::enable_if::type + get_access(sycl::handler &cgh); +}; + + +template +class device_memory : public device_memory { +public: + using base = device_memory; + using value_t = typename base::value_t; + using accessor_t = + typename memory_traits::template accessor_t<0>; + device_memory(const value_t &val); + device_memory(); +}; + +template +using global_memory = device_memory; +template +using constant_memory = detail::device_memory; +template +using shared_memory = device_memory; + + +template class accessor; + +template class accessor { +public: + using memory_t = memory_traits; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template accessor_t<3>; + + accessor(pointer_t data, const sycl::range<3> &in_range); + template + accessor(typename std::enable_if::type &acc); + accessor(const accessor_t &acc, const sycl::range<3> &in_range); + + accessor operator[](size_t index) const; + + pointer_t get_ptr() const; + +}; + +template class accessor { +public: + using memory_t = memory_traits; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template 
accessor_t<2>; + + accessor(pointer_t data, const sycl::range<2> &in_range); + template + accessor(typename std::enable_if::type &acc); + accessor(const accessor_t &acc, const sycl::range<2> &in_range); + + pointer_t operator[](size_t index); + + pointer_t get_ptr() const; +}; + +} // syclcompat +``` + +### Device Information + +`sycl::device` properties are encapsulated using the `device_info` helper class. +The class is meant to be constructed and used through the extended device +implemented in SYCLcompat. + +This is the synopsis of `device_info`: + +```c++ +class device_info { +public: + const char *get_name(); + char *get_name(); + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() const; + + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() const; + int get_major_version() const; + int get_minor_version() const; + int get_integrated() const; + int get_max_clock_frequency() const; + int get_max_compute_units() const; + int get_max_work_group_size() const; + int get_max_sub_group_size() const; + int get_max_work_items_per_compute_unit() const; + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size() const; + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size(); + size_t get_global_mem_size() const; + size_t get_local_mem_size() const; + +void set_name(const char *name); + void set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes); + void set_major_version(int major); + void set_minor_version(int minor); + void set_integrated(int integrated); + void set_max_clock_frequency(int frequency); + void set_max_compute_units(int max_compute_units); + void set_global_mem_size(size_t global_mem_size); + void set_local_mem_size(size_t local_mem_size); + void set_max_work_group_size(int max_work_group_size); + void set_max_sub_group_size(int max_sub_group_size); + void + set_max_work_items_per_compute_unit(int 
max_work_items_per_compute_unit); + void set_max_nd_range_size(int max_nd_range_size[]); +}; +``` + +### Device Management + +Multiple SYCL functionalities are exposed through utility functions to manage +the current `sycl::device`, `sycl::queue`, and `sycl::context`, exposed as +follows: + +```c++ +namespace syclcompat { + +// Util function to create a new queue for the current device +sycl::queue create_queue(bool print_on_async_exceptions = false, + bool in_order = true); + +// Util function to get the default queue of current device in +// device manager. +sycl::queue get_default_queue(); + +// Util function to wait for the queued kernels. +void wait(sycl::queue q = get_default_queue()); + +// Util function to wait for the queued kernels and throw unhandled errors. +void wait_and_throw(sycl::queue q = get_default_queue()); + +// Util function to get the id of current device in +// device manager. +unsigned int get_current_device_id(); + +// Util function to get the current device. +device_ext &get_current_device(); + +// Util function to get a device by id. +device_ext &get_device(unsigned int id); + +// Util function to get the context of the default queue of current +// device in device manager. +sycl::context get_default_context(); + +// Util function to get a CPU device. +device_ext &cpu_device(); + +// Util function to select a device by its id +unsigned int select_device(unsigned int id); + +} // syclcompat +``` + +The exposed functionalities include creation and destruction of queues, through +`syclcompat::create_queue` and `syclcompat::destroy_queue`, and providing the +ability to wait for submitted kernels using `syclcompat::wait` or +`syclcompat::wait_and_throw`. Any async errors will be output to `stderr` if +`print_on_async_exceptions`. Synchronous exceptions have to be managed by users +independently of what is set in this parameter. + +Devices are managed through a helper class, `device_ext`. 
The `device_ext` class +associates a vector of `sycl::queues` with its `sycl::device`. The `device_ext` +destructor waits on a set of `sycl::event` which can be added to via +`add_event`. This is used, for example, to implement `syclcompat::free_async` to +schedule release of memory after a kernel or `mempcy`. SYCL device properties +can be queried through `device_ext` as well. + +The class is exposed as follows: + +```c++ +namespace syclcompat { + +class device_ext : public sycl::device { + device_ext(); + device_ext(const sycl::device &base); + ~device_ext(); + + bool is_native_host_atomic_supported(); + int get_major_version(); + int get_minor_version(); + int get_max_compute_units(); + int get_max_clock_frequency(); + int get_integrated(); + void get_device_info(device_info &out); + + device_info get_device_info(); + void reset(); + + sycl::queue *default_queue(); + void queues_wait_and_throw(); + sycl::queue *create_queue(bool print_on_async_exceptions = false, + bool in_order = true); + void destroy_queue(sycl::queue *&queue); + void set_saved_queue(sycl::queue *q); + sycl::queue *get_saved_queue(); + sycl::context get_context(); +}; + +} // syclcompat +``` + +#### Multiple devices + +SYCLcompat allows you to manage multiple devices through +`syclcompat::select_device` and `syclcompat::create_queue`. The library uses the +default SYCL device (i.e. the device returned by `sycl::default_selector_v`) as +the default device, and exposes all other devices available on the system +through the `syclcompat::select_device(unsigned int id)` member function. + +The interface uses the `syclcompat::device_ext::get_current_device_id()` to get +the current CPU thread, and returns the associated device stored internally as a +map with that thread. The map is constructed using calls to +`syclcompat::select_device(unsigned int id)`. Any thread which hasn't used this +member function to select a device will be given the default device. 
Note that +this implies multiple threads on a single device by default. + +Be aware that targeting multiple devices may lead to unintended behavior caused +by developers, as SYCLcompat does not implement a mechanism to warn when the +wrong queue is used as an argument in any of the member functions of the +`syclcompat` namespace. + +#### Atomic Operations + +SYCLcompat provides an interface for common atomic operations (`add`, `sub`, +`and`, `or`, `xor`, `min`, `max`, `exchange`, `compare_exchange`). While SYCL +exposes atomic operations through member functions of `sycl::atomic_ref`, this +library provides access via functions taking a standard pointer argument. +Template arguments control the `sycl::memory_scope`, `sycl::memory_order` and +`sycl::access::address_space` of these atomic operations. SYCLcompat also +exposes overloads for these atomic functions which take a runtime memoryScope +argument. Every atomic operation is implemented via an API function taking a raw +pointer as the target. Additional overloads for +`syclcompat::compare_exchange_strong` are provided which take a +`sycl::multi_ptr` instead of a raw pointer. Addition and subtraction make use of +`arith_t` to differentiate between numeric and pointer arithmetic. 
+ +The available operations are exposed as follows: + +``` c++ +namespace syclcompat { + +template struct arith { + using type = std::conditional_t, std::ptrdiff_t, T>; +}; +template using arith_t = typename arith::type; + +template +T atomic_fetch_add(T *addr, arith_t operand); +template +T atomic_fetch_add(T *addr, arith_t operand, + sycl::memory_order memoryOrder); + +template +T atomic_fetch_sub(T *addr, arith_t operand); +template +T atomic_fetch_sub(T *addr, arith_t operand, + sycl::memory_order memoryOrder); + +template +T atomic_fetch_and(T *addr, T operand); +template +T atomic_fetch_and(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_or(T *addr, T operand); +template +T atomic_fetch_or(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_xor(T *addr, T operand); +template +T atomic_fetch_xor(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_min(T *addr, T operand); +template +T atomic_fetch_min(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_max(T *addr, T operand); +template +T atomic_fetch_max(T *addr, T operand, sycl::memory_order memoryOrder); + +template +unsigned int atomic_fetch_compare_inc(unsigned int *addr, + unsigned int operand); +template +unsigned int atomic_fetch_compare_inc(unsigned int *addr, + unsigned int operand, + sycl::memory_order memoryOrder); + +template +T atomic_exchange(T *addr, T operand); +template +T atomic_exchange(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_compare_exchange_strong( + sycl::multi_ptr addr, + T expected, T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed); +template +T atomic_compare_exchange_strong( + T *addr, T expected, T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed); + +} // namespace syclcompat 
+``` + +### Compatibility Utilities + +This library provides a number of small compatibility utilities which exist to +facilitate machine translation of code from other programming models to SYCL. +These functions are part of the public API, but they are not expected to be +useful to developers writing their own code. + +Functionality is provided to represent a pair of integers as a `double`. +`cast_ints_to_double(int, int)` returns a `double` containing the given integers +in the high & low 32-bits respectively. `cast_double_to_int` casts the high or +low 32-bits back into an integer. + +`syclcompat::fast_length` provides a wrapper to SYCL's +`fast_length(sycl::vec)` that accepts arguments for a C++ array and a +length. + +`vectorized_max` and `vectorized_min` are binary operations returning the +max/min of two arguments, where each argument is treated as a `sycl::vec` type. +`vectorized_isgreater` performs elementwise `isgreater`, treating each argument +as a vector of elements, and returning `0` for vector components for which +`isgreater` is false, and `-1` when true. + +`reverse_bits` reverses the bits of a 32-bit unsigned integer, `ffs` returns the +position of the first least significant set bit in an integer. +`byte_level_permute` returns a byte-permutation of two input unsigned integers, +with bytes selected according to a third unsigned integer argument. + +There is also an `experimental::logical_group` class which allows +`sycl::sub_group`s to be further subdivided into 'logical' groups to perform +sub-group level operations. This class provides methods to get the local & group +id and range. The functions `select_from_sub_group`, `shift_sub_group_left`, +`shift_sub_group_right` and `permute_sub_group_by_xor` provide equivalent +functionality to `sycl::select_from_group`, `sycl::shift_group_left`, +`sycl::shift_group_right` and `sycl::permute_group_by_xor`, respectively. 
+However, they provide an optional argument to represent the `logical_group` size +(default 32). + +The functions `cmul`,`cdiv`,`cabs`, and `conj` define complex math operations +which accept `sycl::vec` arguments representing complex values. + +```c++ +namespace syclcompat { + +inline int cast_double_to_int(double d, bool use_high32 = true); + +inline double cast_ints_to_double(int high32, int low32); + +inline float fast_length(const float *a, int len); + +template inline T vectorized_max(T a, T b); + +template inline T vectorized_min(T a, T b); + +template inline T vectorized_isgreater(T a, T b); + +template <> +inline unsigned vectorized_isgreater(unsigned a, + unsigned b); + +template inline T reverse_bits(T a); + +inline unsigned int byte_level_permute(unsigned int a, unsigned int b, + unsigned int s); + +template inline int ffs(T a); + +template +T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id, + int logical_sub_group_size = 32); + +template +T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta, + int logical_sub_group_size = 32); + +template +T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, + int logical_sub_group_size = 32); + +template +T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, + int logical_sub_group_size = 32); + +template +sycl::vec cmul(sycl::vec x, sycl::vec y); + +template +sycl::vec cdiv(sycl::vec x, sycl::vec y); + +template T cabs(sycl::vec x); + +template sycl::vec conj(sycl::vec x); + +} // namespace syclcompat +``` + +The function `experimental::nd_range_barrier` synchronizes work items from all +work groups within a SYCL kernel. This is not officially supported by the SYCL +spec, and so should be used with caution. 
+ +```c++ +namespace syclcompat { +namespace experimental { + +template +inline void nd_range_barrier( + sycl::nd_item item, + sycl::atomic_ref &counter); + +template <> +inline void nd_range_barrier( + sycl::nd_item<1> item, + sycl::atomic_ref &counter); + +class logical_group { +public: + logical_group(sycl::nd_item<3> item, sycl::group<3> parent_group, + uint32_t size); + uint32_t get_local_linear_id() const; + uint32_t get_group_linear_id() const; + uint32_t get_local_linear_range() const; + uint32_t get_group_linear_range() const; +}; + +} // namespace experimental +} // namespace syclcompat +``` + +To assist machine translation, helper aliases are provided for inlining and +alignment attributes. The class template declarations `sycl_compat_kernel_name` +and `sycl_compat_kernel_scalar` are used to assist automatic generation of +kernel names during machine translation. + +`get_sycl_language_version` returns an integer representing the version of the +SYCL spec supported by the current SYCL compiler. + +``` c++ +namespace syclcompat { + +#define __sycl_compat_align__(n) __attribute__((aligned(n))) +#define __sycl_compat_inline__ __inline__ __attribute__((always_inline)) + +#define __sycl_compat_noinline__ __attribute__((noinline)) + +template class sycl_compat_kernel_name; +template class sycl_compat_kernel_scalar; + +int get_sycl_language_version(); + +} // namespace syclcompat +``` + +#### Kernel Helper Functions + +Kernel helper functions provide a structure `kernel_function_info` to keep SYCL +kernel information, and provide a utility function `get_kernel_function_info()` +to get the kernel information. Overloads are provided to allow either returning +a `kernel_function_info` object, or to return by pointer argument. In the +current version, `kernel_function_info` describes only maximum work-group size. 
+ +``` c++ +namespace syclcompat { + +struct kernel_function_info { + int max_work_group_size = 0; +}; + +static void get_kernel_function_info(kernel_function_info *kernel_info, + const void *function); +static kernel_function_info get_kernel_function_info(const void *function); +} // namespace syclcompat +``` + +## Sample Code + +Below is a simple linear algebra sample, which computes `y = mx + b` implemented +using this library: + +``` c++ +#include +#include + +#include +#include + +/** + * Slope intercept form of a straight line equation: Y = m * X + b + */ +template +void slope_intercept(float *Y, float *X, float m, float b, size_t n) { + + // Block index + size_t bx = syclcompat::work_group_id::x(); + // Thread index + size_t tx = syclcompat::local_id::x(); + + size_t i = bx * BLOCK_SIZE + tx; + // or i = syclcompat::global_id::x(); + if (i < n) + Y[i] = m * X[i] + b; +} + +void check_memory(void *ptr, std::string msg) { + if (ptr == nullptr) { + std::cerr << "Failed to allocate memory: " << msg << std::endl; + exit(EXIT_FAILURE); + } +} + +/** + * Program main + */ +int main(int argc, char **argv) { + std::cout << "Simple Kernel example" << std::endl; + + constexpr size_t n_points = 32; + constexpr float m = 1.5f; + constexpr float b = 0.5f; + + int block_size = 32; + if (block_size > syclcompat::get_current_device() + .get_info()) + block_size = 16; + + std::cout << "block_size = " << block_size << ", n_points = " << n_points + << std::endl; + + // Allocate host memory for vectors X and Y + size_t mem_size = n_points * sizeof(float); + float *h_X = (float *)syclcompat::malloc_host(mem_size); + float *h_Y = (float *)syclcompat::malloc_host(mem_size); + check_memory(h_X, "h_X allocation failed."); + check_memory(h_Y, "h_Y allocation failed."); + + // Alternative templated allocation for the expected output + float *h_expected = syclcompat::malloc_host(n_points); + check_memory(h_expected, "Not enough for h_expected."); + + // Initialize host memory & expected 
output + for (size_t i = 0; i < n_points; i++) { + h_X[i] = i + 1; + h_expected[i] = m * h_X[i] + b; + } + + // Allocate device memory + float *d_X = (float *)syclcompat::malloc(mem_size); + float *d_Y = (float *)syclcompat::malloc(mem_size); + check_memory(d_X, "d_X allocation failed."); + check_memory(d_Y, "d_Y allocation failed."); + + // copy host memory to device + syclcompat::memcpy(d_X, h_X, mem_size); + + size_t threads = block_size; + size_t grid = n_points / block_size; + + std::cout << "Computing result using SYCL Kernel... "; + if (block_size == 16) { + syclcompat::launch>(grid, threads, d_Y, d_X, m, b, + n_points); + } else { + syclcompat::launch>(grid, threads, d_Y, d_X, m, b, + n_points); + } + syclcompat::wait(); + std::cout << "DONE" << std::endl; + + // Async copy result from device to host + syclcompat::memcpy_async(h_Y, d_Y, mem_size).wait(); + + // Check output + for (size_t i = 0; i < n_points; i++) { + assert(h_Y[i] - h_expected[i] < 1e6); + } + + // Clean up memory + syclcompat::free(h_X); + syclcompat::free(h_Y); + syclcompat::free(h_expected); + syclcompat::free(d_X); + syclcompat::free(d_Y); + + return 0; +} +``` + +## Maintainers + +To report problems with this library, please open a new issue with the [COMPAT] +tag at: + + + +## Contributors + +Alberto Cabrera, Codeplay \ +Gordon Brown, Codeplay \ +Joe Todd, Codeplay \ +Pietro Ghiglio, Codeplay \ +Ruyman Reyes, Codeplay/Intel + +## Contributions + +This library is licensed under the Apache 2.0 license. If you have an idea for a +new sample, different build system integration or even a fix for something that +is broken, please get in contact. From f7b00b752506b87414fd026b3010703e85e0d733 Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Wed, 9 Aug 2023 02:56:19 -0700 Subject: [PATCH 11/24] [SYCL] Fix sycl::vec unary ops (#10722) The recent sycl::vec changes (https://github.com/intel/llvm/pull/9492) broke they unary operations. 
This PR fixes them and adds some testing to avoid that in the future. --- sycl/include/sycl/types.hpp | 140 +++++++++++++++++++++----------- sycl/test/basic_tests/types.cpp | 63 ++++++++++++++ 2 files changed, 154 insertions(+), 49 deletions(-) diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp index d3f63637e3028..37ff4cf3d6438 100644 --- a/sycl/include/sycl/types.hpp +++ b/sycl/include/sycl/types.hpp @@ -583,13 +583,17 @@ template class vec { // vector extension. This is for MSVC compatibility, which has a max alignment // of 64 for direct params. If we drop MSVC, we can have alignment the same as // size and use vector extensions for all sizes. - static constexpr bool IsUsingArray = + static constexpr bool IsUsingArrayOnDevice = (IsHostHalf || IsSizeGreaterThanMaxAlign); #if defined(__SYCL_DEVICE_ONLY__) - static constexpr bool NativeVec = NumElements > 1 && !IsUsingArray; + static constexpr bool NativeVec = NumElements > 1 && !IsUsingArrayOnDevice; + static constexpr bool IsUsingArrayOnHost = + false; // we are not compiling for host. #else static constexpr bool NativeVec = false; + static constexpr bool IsUsingArrayOnHost = + true; // host always uses std::array. 
#endif static constexpr int getNumElements() { return NumElements; } @@ -770,6 +774,15 @@ template class vec { return *this; } + template + using EnableIfUsingArray = + typename std::enable_if_t; + + template + using EnableIfNotUsingArray = + typename std::enable_if_t; + #ifdef __SYCL_DEVICE_ONLY__ template using EnableIfNotHostHalf = typename std::enable_if_t; @@ -778,13 +791,15 @@ template class vec { using EnableIfHostHalf = typename std::enable_if_t; template - using EnableIfUsingArray = typename std::enable_if_t; + using EnableIfUsingArrayOnDevice = + typename std::enable_if_t; template - using EnableIfNotUsingArray = typename std::enable_if_t; + using EnableIfNotUsingArrayOnDevice = + typename std::enable_if_t; template - explicit constexpr vec(const EnableIfNotUsingArray &arg) + explicit constexpr vec(const EnableIfNotUsingArrayOnDevice &arg) : m_Data{DataType(vec_data::get(arg))} {} template @@ -792,13 +807,13 @@ template class vec { std::is_fundamental_v> || std::is_same_v, half>, vec &> - operator=(const EnableIfNotUsingArray &Rhs) { + operator=(const EnableIfNotUsingArrayOnDevice &Rhs) { m_Data = (DataType)vec_data::get(Rhs); return *this; } template - explicit constexpr vec(const EnableIfUsingArray &arg) + explicit constexpr vec(const EnableIfUsingArrayOnDevice &arg) : vec{detail::RepeatValue( static_cast>(arg)), std::make_index_sequence()} {} @@ -808,7 +823,7 @@ template class vec { std::is_fundamental_v> || std::is_same_v, half>, vec &> - operator=(const EnableIfUsingArray &Rhs) { + operator=(const EnableIfUsingArrayOnDevice &Rhs) { for (int i = 0; i < NumElements; ++i) { setValue(i, Rhs); } @@ -844,22 +859,22 @@ template class vec { std::is_convertible_v && NumElements == IdxNum, DataT>; template constexpr vec(const EnableIfMultipleElems<2, Ty> Arg0, - const EnableIfNotUsingArray Arg1) + const EnableIfNotUsingArrayOnDevice Arg1) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1)} {} template constexpr vec(const EnableIfMultipleElems<3, Ty> Arg0, - 
const EnableIfNotUsingArray Arg1, const DataT Arg2) + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), vec_data::get(Arg2)} {} template constexpr vec(const EnableIfMultipleElems<4, Ty> Arg0, - const EnableIfNotUsingArray Arg1, const DataT Arg2, + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, const Ty Arg3) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), vec_data::get(Arg2), vec_data::get(Arg3)} {} template constexpr vec(const EnableIfMultipleElems<8, Ty> Arg0, - const EnableIfNotUsingArray Arg1, const DataT Arg2, + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, const DataT Arg3, const DataT Arg4, const DataT Arg5, const DataT Arg6, const DataT Arg7) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), @@ -868,7 +883,7 @@ template class vec { vec_data::get(Arg6), vec_data::get(Arg7)} {} template constexpr vec(const EnableIfMultipleElems<16, Ty> Arg0, - const EnableIfNotUsingArray Arg1, const DataT Arg2, + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, const DataT Arg3, const DataT Arg4, const DataT Arg5, const DataT Arg6, const DataT Arg7, const DataT Arg8, const DataT Arg9, const DataT ArgA, const DataT ArgB, @@ -908,7 +923,7 @@ template class vec { std::is_same::value && !std::is_same::value>> constexpr vec(vector_t openclVector) { - if constexpr (!IsUsingArray) { + if constexpr (!IsUsingArrayOnDevice) { m_Data = openclVector; } else { m_Data = bit_cast(openclVector); @@ -916,7 +931,7 @@ template class vec { } operator vector_t() const { - if constexpr (!IsUsingArray) { + if constexpr (!IsUsingArrayOnDevice) { return m_Data; } else { auto ptr = bit_cast((&m_Data)->data()); @@ -1077,7 +1092,7 @@ template class vec { #ifdef __SYCL_DEVICE_ONLY__ #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ template \ - vec operator BINOP(const EnableIfNotUsingArray &Rhs) const { \ + vec operator BINOP(const EnableIfNotUsingArrayOnDevice &Rhs) const { \ vec Ret; \ Ret.m_Data = 
m_Data BINOP Rhs.m_Data; \ if constexpr (std::is_same::value && CONVERT) { \ @@ -1086,7 +1101,7 @@ template class vec { return Ret; \ } \ template \ - vec operator BINOP(const EnableIfUsingArray &Rhs) const { \ + vec operator BINOP(const EnableIfUsingArrayOnDevice &Rhs) const { \ vec Ret; \ for (size_t I = 0; I < NumElements; ++I) { \ Ret.setValue(I, (getValue(I) BINOP Rhs.getValue(I))); \ @@ -1240,67 +1255,94 @@ template class vec { __SYCL_UOP(--, -=) #undef __SYCL_UOP - // Available only when: dataT != cl_float && dataT != cl_double - // && dataT != cl_half + // operator~() available only when: dataT != float && dataT != double + // && dataT != half template - typename std::enable_if_t>, vec> + typename std::enable_if_t> && + (!IsUsingArrayOnDevice && !IsUsingArrayOnHost), + vec> operator~() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ vec Ret{(typename vec::DataType) ~m_Data}; if constexpr (std::is_same::value) { Ret.ConvertToDataT(); } return Ret; -#else + } + template + typename std::enable_if_t> && + (IsUsingArrayOnDevice || IsUsingArrayOnHost), + vec> + operator~() const { vec Ret{}; for (size_t I = 0; I < NumElements; ++I) { Ret.setValue(I, ~getValue(I)); } return Ret; -#endif } - vec operator!() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ - return vec{ - (typename vec::DataType) !m_Data}; -#else - vec Ret{}; + // operator! + template + EnableIfNotUsingArray> operator!() const { + return vec{(typename vec::DataType) !m_Data}; + } + + // std::byte neither supports ! unary op or casting, so special handling is + // needed. And, worse, Windows has a conflict with 'byte'. 
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + template + typename std::enable_if_t::value && + (IsUsingArrayOnDevice || IsUsingArrayOnHost), + vec> + operator!() const { + vec Ret{}; for (size_t I = 0; I < NumElements; ++I) { - Ret.setValue(I, !vec_data::get(getValue(I))); + Ret.setValue(I, std::byte{!vec_data::get(getValue(I))}); } return Ret; -#endif } - vec operator+() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ - return vec{+m_Data}; + template + typename std::enable_if_t::value && + (IsUsingArrayOnDevice || IsUsingArrayOnHost), + vec> + operator!() const { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) + Ret.setValue(I, !vec_data::get(getValue(I))); + return Ret; + } #else + template + EnableIfUsingArray> operator!() const { vec Ret{}; for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, vec_data::get(+vec_data::get(getValue(I)))); + Ret.setValue(I, !vec_data::get(getValue(I))); return Ret; + } #endif + + // operator + + template EnableIfNotUsingArray operator+() const { + return vec{+m_Data}; } - vec operator-() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ + template EnableIfUsingArray operator+() const { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) + Ret.setValue(I, vec_data::get(+vec_data::get(getValue(I)))); + return Ret; + } + + // operator - + template EnableIfNotUsingArray operator-() const { return vec{-m_Data}; -#else + } + + template EnableIfUsingArray operator-() const { vec Ret{}; for (size_t I = 0; I < NumElements; ++I) Ret.setValue(I, vec_data::get(-vec_data::get(getValue(I)))); return Ret; -#endif } // OP is: &&, || @@ -1316,7 +1358,7 @@ template class vec { template