diff --git a/.github/workflows/sycl_exp_precommit.yml b/.github/workflows/sycl_exp_precommit.yml deleted file mode 100644 index a0439088cca8a..0000000000000 --- a/.github/workflows/sycl_exp_precommit.yml +++ /dev/null @@ -1,93 +0,0 @@ -name: SYCL Experimental Pre-Commit - -on: - pull_request: - branches: - - sycl - paths: - - '.github/workflows/**' - -jobs: - detect_changes: - uses: ./.github/workflows/sycl_detect_changes.yml - - lint: - needs: [detect_changes] - if: | - github.event.pull_request.head.repo.full_name == 'intel/llvm' - runs-on: [Linux, build] - container: - image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:no-drivers - options: -u 1001:1001 - steps: - - uses: actions/checkout@v3 - with: - sparse-checkout: | - devops/actions/cached_checkout - devops/actions/clang-format - devops/actions/cleanup - - name: Register cleanup after job is finished - uses: ./devops/actions/cleanup - - name: 'PR commits + 2' - run: echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 2 ))" >> "${GITHUB_ENV}" - - uses: ./devops/actions/cached_checkout - with: - path: src - fetch-depth: ${{ env.PR_FETCH_DEPTH }} - ref: ${{ github.event.pull_request.head.sha }} - merge_ref: '' - cache_path: "/__w/repo_cache/" - - name: Run clang-format - uses: ./devops/actions/clang-format - with: - path: src - - build: - needs: [lint] - if: | - always() - && (success() || contains(github.event.pull_request.labels.*.name, 'ignore-lint')) - uses: ./.github/workflows/sycl_linux_build.yml - with: - build_ref: ${{ github.sha }} - merge_ref: '' - build_cache_root: "/__w/" - build_artifact_suffix: "default" - build_cache_suffix: "default" - changes: '[]' - - test: - needs: [build, detect_changes] - strategy: - fail-fast: false - matrix: - include: - - name: ESIMD Emu - runner: '["Linux", "x86-cpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest - image_options: -u 1001 - target_devices: ext_intel_esimd_emulator:gpu - - name: AMD/HIP - runner: '["Linux", "amdgpu"]' - image: 
ghcr.io/intel/llvm/ubuntu2204_build:latest - image_options: -u 1001 --device=/dev/dri --device=/dev/kfd - target_devices: ext_oneapi_hip:gpu - - name: Intel - runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest - image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN - target_devices: ext_oneapi_level_zero:gpu;opencl:gpu;opencl:cpu - reset_gpu: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} - uses: ./.github/workflows/sycl_linux_run_tests.yml - with: - name: ${{ matrix.name }} - runner: ${{ matrix. runner }} - image: ${{ matrix.image }} - image_options: ${{ matrix.image_options }} - target_devices: ${{ matrix.target_devices }} - ref: ${{ github.sha }} - merge_ref: '' - - sycl_toolchain_artifact: sycl_linux_default - sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} - sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} diff --git a/.github/workflows/sycl_linux_build.yml b/.github/workflows/sycl_linux_build.yml index 44a1358b82ac4..48cee3f83bda3 100644 --- a/.github/workflows/sycl_linux_build.yml +++ b/.github/workflows/sycl_linux_build.yml @@ -38,7 +38,7 @@ on: changes: type: string description: 'Filter matches for the changed files in the PR' - default: '[llvm, clang, sycl, llvm_spirv, xptifw, libclc, libdevice]' + default: '[mlir_sycl, polygeist, cgeist, clang]' required: false merge_ref: description: | @@ -52,6 +52,8 @@ on: default: 3 outputs: + build_conclusion: + value: ${{ jobs.build.outputs.build_conclusion }} artifact_archive_name: value: ${{ jobs.build.outputs.artifact_archive_name }} artifact_decompress_command: @@ -148,6 +150,7 @@ jobs: ref: ${{ inputs.build_ref || github.sha }} merge_ref: ${{ inputs.merge_ref }} cache_path: "/__w/repo_cache/" + default_branch: sycl-mlir - name: Configure env: CC: ${{ inputs.cc }} @@ -168,38 +171,43 @@ jobs: - name: Compile id: build run: cmake --build $GITHUB_WORKSPACE/build - - name: 
check-llvm - if: always() && !cancelled() && contains(inputs.changes, 'llvm') + - name: mlir-sycl-doc + if: always() && !cancelled() && contains(inputs.changes, 'mlir_sycl') run: | - cmake --build $GITHUB_WORKSPACE/build --target check-llvm - - name: check-clang - if: always() && !cancelled() && contains(inputs.changes, 'clang') + cmake --build $GITHUB_WORKSPACE/build --target mlir-sycl-doc + - name: polygeist-doc + if: always() && !cancelled() && contains(inputs.changes, 'polygeist') run: | - # Can we move this to Dockerfile? Hopefully, noop on Windows. - export XDG_CACHE_HOME=$GITHUB_WORKSPACE/os_cache - cmake --build $GITHUB_WORKSPACE/build --target check-clang - - name: check-sycl - if: always() && !cancelled() && contains(inputs.changes, 'sycl') + cmake --build $GITHUB_WORKSPACE/build --target polygeist-doc + # TODO allow to optionally disable in-tree checks + - name: check-mlir-sycl + shell: bash + if: always() && !cancelled() && contains(inputs.changes, 'mlir_sycl') run: | - # TODO consider moving this to Dockerfile. 
- export LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH - cmake --build $GITHUB_WORKSPACE/build --target check-sycl - - name: check-llvm-spirv - if: always() && !cancelled() && contains(inputs.changes, 'llvm_spirv') + cmake --build $GITHUB_WORKSPACE/build --target check-mlir-sycl + - name: check-polygeist + shell: bash + if: always() && !cancelled() && contains(inputs.changes, 'polygeist') run: | - cmake --build $GITHUB_WORKSPACE/build --target check-llvm-spirv - - name: check-xptifw - if: always() && !cancelled() && contains(inputs.changes, 'xptifw') + cmake --build $GITHUB_WORKSPACE/build --target check-polygeist + - name: check-polygeist-unit + shell: bash + if: always() && !cancelled() && contains(inputs.changes, 'polygeist') run: | - cmake --build $GITHUB_WORKSPACE/build --target check-xptifw - - name: check-libclc - if: always() && !cancelled() && contains(inputs.changes, 'libclc') + cmake --build $GITHUB_WORKSPACE/build --target check-polygeist-unit + - name: check-cgeist + shell: bash + if: always() && !cancelled() && contains(inputs.changes, 'cgeist') run: | - cmake --build $GITHUB_WORKSPACE/build --target check-libclc - - name: check-libdevice - if: always() && !cancelled() && contains(inputs.changes, 'libdevice') + if [ -e /runtimes/oneapi-tbb/env/vars.sh ]; then + source /runtimes/oneapi-tbb/env/vars.sh; + fi + cmake --build $GITHUB_WORKSPACE/build --target check-cgeist + - name: check-clang-driver + shell: bash + if: always() && !cancelled() && contains(inputs.changes, 'clang') run: | - cmake --build $GITHUB_WORKSPACE/build --target check-libdevice + cmake --build $GITHUB_WORKSPACE/build --target check-clang-driver - name: Install if: ${{ always() && !cancelled() && steps.build.conclusion == 'success' }} # TODO replace utility installation with a single CMake target diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl_nightly.yml index cf4b1a8176111..cabf159ff05dc 100644 --- 
a/.github/workflows/sycl_nightly.yml +++ b/.github/workflows/sycl_nightly.yml @@ -6,17 +6,9 @@ on: - cron: '0 3 * * *' jobs: - test_matrix: - if: github.repository == 'intel/llvm' - name: Generate Test Matrix - uses: ./.github/workflows/sycl_gen_test_matrix.yml - with: - lts_config: "hip_amdgpu;ocl_gen12;ocl_x64;l0_gen12;esimd_emu;cuda_aws;win_l0_gen12" - - ubuntu2204_build_test: + ubuntu2204_build: if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build_and_test.yml - needs: test_matrix + uses: ./.github/workflows/sycl_linux_build.yml secrets: inherit with: build_cache_root: "/__w/" @@ -24,24 +16,76 @@ jobs: build_configure_extra_args: '--hip --cuda --enable-esimd-emulator' merge_ref: '' retention-days: 90 - lts_matrix: ${{ needs.test_matrix.outputs.lts_lx_matrix }} - lts_aws_matrix: ${{ needs.test_matrix.outputs.lts_aws_matrix }} # We upload the build for people to download/use, override its name and # prefer widespread gzip compression. artifact_archive_name: sycl_linux.tar.gz - ubuntu2204_opaque_pointers_build_test: - if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build_and_test.yml - needs: test_matrix - secrets: inherit + ubuntu2204_test: + needs: [ubuntu2204_build] + if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }} + strategy: + fail-fast: false + matrix: + include: + - name: AMD/HIP + runner: '["Linux", "amdgpu"]' + image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image_options: -u 1001 --device=/dev/dri --device=/dev/kfd + target_devices: ext_oneapi_hip:gpu + + - name: Intel L0 GPU + runner: '["Linux", "gen12"]' + image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + target_devices: ext_oneapi_level_zero:gpu + reset_gpu: true + + - name: Intel OCL GPU + runner: '["Linux", "gen12"]' + image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image_options: -u 1001 
--device=/dev/dri --privileged --cap-add SYS_ADMIN + target_devices: opencl:gpu + reset_gpu: true + + - name: OCL CPU + runner: '["Linux", "x86-cpu"]' + image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image_options: -u 1001 + target_devices: opencl:cpu + + - name: ESIMD Emu + runner: '["Linux", "x86-cpu"]' + image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image_options: -u 1001 + target_devices: ext_intel_esimd_emulator:gpu + + - name: Self-hosted CUDA + runner: '["Linux", "cuda"]' + image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image_options: -u 1001 --gpus all --cap-add SYS_ADMIN + target_devices: ext_oneapi_cuda:gpu + uses: ./.github/workflows/sycl_linux_run_tests.yml with: - build_cache_root: "/__w/" - build_cache_suffix: opaque_pointers - build_artifact_suffix: opaque_pointers - build_configure_extra_args: "--hip --cuda --enable-esimd-emulator --cmake-opt=-DSPIRV_ENABLE_OPAQUE_POINTERS=TRUE" + name: ${{ matrix.name }} + runner: ${{ matrix.runner }} + image: ${{ matrix.image }} + image_options: ${{ matrix.image_options }} + target_devices: ${{ matrix.target_devices }} + reset_gpu: ${{ matrix.reset_gpu }} + ref: ${{ github.sha }} merge_ref: '' + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} + + + test_matrix: + if: github.repository == 'intel/llvm' + name: Generate Test Matrix + uses: ./.github/workflows/sycl_gen_test_matrix.yml + with: + lts_config: "win_l0_gen12" windows_default: name: Windows @@ -59,7 +103,7 @@ jobs: nightly_build_upload: name: Nightly Build Upload if: ${{ github.ref_name == 'sycl' }} - needs: [ubuntu2204_build_test, windows_default] + needs: [ubuntu2204_build, windows_default] runs-on: ubuntu-latest steps: - uses: actions/download-artifact@v3 @@ -92,7 +136,7 @@ jobs: ubuntu2204_docker_build_push: if: github.repository == 
'intel/llvm' runs-on: [Linux, build] - needs: ubuntu2204_build_test + needs: ubuntu2204_build steps: - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl_post_commit.yml index f0021e269a482..4db77b0a48728 100644 --- a/.github/workflows/sycl_post_commit.yml +++ b/.github/workflows/sycl_post_commit.yml @@ -7,19 +7,22 @@ on: - sycl-devops-pr/** - llvmspirv_pulldown + pull_request: + branches: + - sycl + - sycl-devops-pr/** + paths: + - .github/workflow/sycl_post_commit.yml + - .github/workflow/sycl_linux_build.yml + - .github/workflow/sycl_linux_run_tests.yml + - ./devops/actions/cleanup + - ./devops/actions/cached_checkout + jobs: - # This job generates matrix of tests for SYCL End-to-End tests - test_matrix: - name: Generate Test Matrix - if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_gen_test_matrix.yml - with: - lts_config: "l0_gen12;win_l0_gen12" - linux_self_prod: + build: name: Linux (Self build + shared libraries + no-assertions) if: github.repository == 'intel/llvm' - needs: test_matrix - uses: ./.github/workflows/sycl_linux_build_and_test.yml + uses: ./.github/workflows/sycl_linux_build.yml with: build_cache_root: "/__w/llvm" build_cache_suffix: sprod_shared @@ -29,11 +32,32 @@ jobs: build_image: "ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:build" cc: clang cxx: clang++ - lts_matrix: ${{ needs.test_matrix.outputs.lts_lx_matrix }} - cts_matrix: ${{ needs.test_matrix.outputs.cts_matrix }} - lts_aws_matrix: ${{ needs.test_matrix.outputs.lts_aws_matrix }} merge_ref: '' + test: + needs: [build] + if: ${{ always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' }} + uses: ./.github/workflows/sycl_linux_run_tests.yml + with: + name: SYCL E2E on Intel Linux L0 + runner: '["Linux", "gen12"]' + image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + 
target_devices: ext_oneapi_level_zero:gpu + ref: ${{ github.sha }} + merge_ref: '' + sycl_toolchain_artifact: sycl_linux_sprod_shared + sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} + + # This job generates matrix of tests for SYCL End-to-End tests on Windows + test_matrix: + name: Generate Test Matrix + if: github.repository == 'intel/llvm' + uses: ./.github/workflows/sycl_gen_test_matrix.yml + with: + lts_config: "win_l0_gen12" + windows_default: name: Windows needs: test_matrix diff --git a/.github/workflows/sycl_precommit_aws.yml b/.github/workflows/sycl_precommit_aws.yml index 627c59c32dacc..c1d60bf8911df 100644 --- a/.github/workflows/sycl_precommit_aws.yml +++ b/.github/workflows/sycl_precommit_aws.yml @@ -1,10 +1,20 @@ name: E2E on AWS CUDA +run-name: E2E on AWS CUDA - ${{ github.event.workflow_run.display_title }} +# We have to keep pre-commit AWS CUDA testing in a separate workflow because we +# need access to AWS secret and that isn't available on pull_request jobs for +# PRs from forks. And github's "require approval for all outside collaborators" +# is bypassed on pull_request_target. +# +# Also, we use commit status and not check suite/run (which, in theory, is more +# powerful) due to https://github.com/orgs/community/discussions/24616. on: workflow_run: - workflows: [SYCL Experimental Pre-Commit] + workflows: [SYCL Pre Commit on Linux] types: - completed + branches-ignore: + - sycl-mlir jobs: create-check: @@ -53,7 +63,9 @@ jobs: image: ghcr.io/intel/llvm/ubuntu2204_build:latest image_options: -u 1001 --gpus all --cap-add SYS_ADMIN target_devices: ext_oneapi_cuda:gpu - ref: ${{ github.sha }} + # No idea why but that seems to work and be in sync with the main + # pre-commit workflow. 
+ ref: ${{ github.event.workflow_run.referenced_workflows[0].sha }} merge_ref: '' sycl_toolchain_artifact: sycl_linux_default diff --git a/.github/workflows/sycl_precommit_linux.yml b/.github/workflows/sycl_precommit_linux.yml index 10cdd9d124199..0167f947f3124 100644 --- a/.github/workflows/sycl_precommit_linux.yml +++ b/.github/workflows/sycl_precommit_linux.yml @@ -1,6 +1,9 @@ name: SYCL MLIR Pre Commit on Linux on: + # We rely on "Fork pull request workflows from outside collaborators" - + # "Require approval for all outside collaborators" at + # https://github.com/intel/llvm/settings/actions for security. pull_request: branches: - sycl-mlir @@ -14,15 +17,11 @@ on: - '**.md' - '**.rst' -permissions: - contents: read - jobs: detect_changes: uses: ./.github/workflows/sycl_detect_changes.yml lint: - needs: [detect_changes] runs-on: [Linux, build] container: image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:no-drivers @@ -30,7 +29,6 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.base_ref }} sparse-checkout: | devops/actions/cached_checkout devops/actions/clang-format @@ -51,31 +49,46 @@ jobs: with: path: src - # This job generates matrix of tests for SYCL End-to-End tests - test_matrix: - needs: [detect_changes] - name: Generate Test Matrix - uses: ./.github/workflows/sycl_gen_test_matrix.yml - with: - ref: ${{ github.event.pull_request.head.sha }} - lts_config: "lin_intel" - - linux_default: - name: Linux - # Only build and test patches, that have passed all linter checks, because - # the next commit is likely to be a follow-up on that job. 
- needs: [lint, test_matrix, detect_changes] + build: + needs: [lint, detect_changes] if: | always() && (success() || contains(github.event.pull_request.labels.*.name, 'ignore-lint')) - uses: ./.github/workflows/sycl_linux_build_and_test.yml - secrets: inherit + uses: ./.github/workflows/sycl_linux_build.yml with: - build_ref: ${{ github.event.pull_request.head.sha }} - merge_ref: ${{ github.event.pull_request.base.sha }} + build_ref: ${{ github.sha }} + merge_ref: '' build_cache_root: "/__w/" build_artifact_suffix: "default" build_cache_suffix: "default" - lts_matrix: ${{ needs.test_matrix.outputs.lts_lx_matrix }} - lts_aws_matrix: ${{ needs.test_matrix.outputs.lts_aws_matrix }} changes: ${{ needs.detect_changes.outputs.filters }} + + test: + needs: [build, detect_changes] + strategy: + fail-fast: false + matrix: + include: + - name: Intel + runner: '["Linux", "gen12"]' + image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + target_devices: ext_oneapi_level_zero:gpu;opencl:gpu;opencl:cpu + reset_gpu: true + install_drivers: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} + uses: ./.github/workflows/sycl_linux_run_tests.yml + with: + name: ${{ matrix.name }} + runner: ${{ matrix. 
runner }} + image: ${{ matrix.image }} + image_options: ${{ matrix.image_options }} + target_devices: ${{ matrix.target_devices }} + reset_gpu: ${{ matrix.reset_gpu }} + + ref: ${{ github.sha }} + merge_ref: '' + + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} + diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 4c877707dbf7b..618530475e6b7 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4031,6 +4031,7 @@ def VecTypeHint : InheritableAttr { let Spellings = [GNU<"vec_type_hint">, CXX11<"sycl", "vec_type_hint">]; let Args = [TypeArgument<"TypeHint">]; let Subjects = SubjectList<[Function], ErrorDiag>; + let SupportsNonconformingLambdaSyntax = 1; let Documentation = [Undocumented]; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 3db8d2409c0b0..bf4d97dd5dba3 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11979,6 +11979,9 @@ def warn_ivdep_attribute_argument : Warning< def warn_attribute_spelling_deprecated : Warning< "attribute %0 is deprecated">, InGroup; +def warn_attribute_deprecated_ignored : Warning< + "attribute %0 is deprecated; attribute ignored">, + InGroup; def note_spelling_suggestion : Note< "did you mean to use %0 instead?">; def warn_attribute_requires_non_negative_integer_argument : diff --git a/clang/include/clang/Basic/SYCLNativeCPUHelpers.h b/clang/include/clang/Basic/SYCLNativeCPUHelpers.h deleted file mode 100644 index 6fa990fbf6415..0000000000000 --- a/clang/include/clang/Basic/SYCLNativeCPUHelpers.h +++ /dev/null @@ -1,7 +0,0 @@ -#include "clang/Basic/LangOptions.h" -#include -namespace clang { -inline std::string getNativeCPUHeaderName(const LangOptions 
&LangOpts) { - return LangOpts.SYCLIntHeader + ".hc"; -} -} // namespace clang diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index d58d6c2e0e4a7..f4a20f1914b6e 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -10,7 +10,6 @@ #include "clang/Basic/CodeGenOptions.h" #include "clang/Basic/Diagnostic.h" #include "clang/Basic/LangOptions.h" -#include "clang/Basic/SYCLNativeCPUHelpers.h" #include "clang/Basic/TargetOptions.h" #include "clang/Basic/Targets/SPIR.h" #include "clang/Frontend/FrontendDiagnostic.h" @@ -47,10 +46,10 @@ #include "llvm/Passes/StandardInstrumentations.h" #include "llvm/SYCLLowerIR/CompileTimePropertiesPass.h" #include "llvm/SYCLLowerIR/ESIMD/ESIMDVerifier.h" -#include "llvm/SYCLLowerIR/EmitSYCLNativeCPUHeader.h" #include "llvm/SYCLLowerIR/LowerWGLocalMemory.h" #include "llvm/SYCLLowerIR/MutatePrintfAddrspace.h" #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h" +#include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h" #include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h" #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h" #include "llvm/Support/BuryPointer.h" @@ -108,6 +107,10 @@ extern cl::opt DebugInfoCorrelate; static cl::opt ClSanitizeOnOptimizerEarlyEP( "sanitizer-early-opt-ep", cl::Optional, cl::desc("Insert sanitizers on OptimizerEarlyEP."), cl::init(false)); + +static cl::opt SYCLNativeCPURename( + "sycl-native-cpu-rename", cl::init(false), + cl::desc("Rename kernel functions for SYCL Native CPU")); } namespace { @@ -1048,6 +1051,8 @@ void EmitAssemblyHelper::RunOptimizationPipeline( MPM = PB.buildPerModuleDefaultPipeline(Level); } + if (SYCLNativeCPURename) + MPM.addPass(RenameKernelSYCLNativeCPUPass()); if (LangOpts.SYCLIsDevice) { MPM.addPass(SYCLMutatePrintfAddrspacePass()); if (LangOpts.EnableDAEInSpirKernels) @@ -1078,8 +1083,6 @@ void EmitAssemblyHelper::RunOptimizationPipeline( MPM.addPass(CompileTimePropertiesPass()); if (LangOpts.SYCLIsNativeCPU) { - 
MPM.addPass( - EmitSYCLNativeCPUHeaderPass(getNativeCPUHeaderName(LangOpts))); MPM.addPass(PrepareSYCLNativeCPUPass()); } } diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index dac4aa454d33b..0e573d8ea51fd 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1211,23 +1211,19 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, checkSingleArgValidity(DeviceCodeSplit, {"per_kernel", "per_source", "auto", "off"}); - bool IsSYCLNativeCPU = isSYCLNativeCPU(C.getInputArgs()); Arg *SYCLForceTarget = getArgRequiringSYCLRuntime(options::OPT_fsycl_force_target_EQ); if (SYCLForceTarget) { StringRef Val(SYCLForceTarget->getValue()); llvm::Triple TT(MakeSYCLDeviceTriple(Val)); - // Todo: we skip the check for the valid SYCL target, because currently - // setting native_cpu as a target overrides all the other targets, - // re-enable the check once native_cpu can coexist. - if (!IsSYCLNativeCPU && !isValidSYCLTriple(TT)) + if (!isValidSYCLTriple(TT)) Diag(clang::diag::err_drv_invalid_sycl_target) << Val; } bool HasSYCLTargetsOption = SYCLTargets || SYCLLinkTargets || SYCLAddTargets; llvm::StringMap FoundNormalizedTriples; llvm::SmallVector UniqueSYCLTriplesVec; - if (!IsSYCLNativeCPU && HasSYCLTargetsOption) { + if (HasSYCLTargetsOption) { // At this point, we know we have a valid combination // of -fsycl*target options passed Arg *SYCLTargetsValues = SYCLTargets ? 
SYCLTargets : SYCLLinkTargets; @@ -1262,6 +1258,12 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, continue; } UserTargetName = "amdgcn-amd-amdhsa"; + } else if (Val == "native_cpu") { + const ToolChain *HostTC = + C.getSingleOffloadToolChain(); + llvm::Triple HostTriple = HostTC->getTriple(); + UniqueSYCLTriplesVec.push_back(HostTriple); + continue; } if (!isValidSYCLTriple(MakeSYCLDeviceTriple(UserTargetName))) { @@ -1329,11 +1331,6 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, Diag(clang::diag::warn_drv_empty_joined_argument) << SYCLAddTargets->getAsString(C.getInputArgs()); } - } else if (IsSYCLNativeCPU) { - const ToolChain *HostTC = C.getSingleOffloadToolChain(); - llvm::Triple HostTriple = HostTC->getTriple(); - UniqueSYCLTriplesVec.push_back(HostTriple); - addSYCLDefaultTriple(C, UniqueSYCLTriplesVec); } else { // If -fsycl is supplied without -fsycl-*targets we will assume SPIR-V // unless -fintelfpga is supplied, which uses SPIR-V with fpga AOT. @@ -5589,6 +5586,9 @@ class OffloadingActionBuilder final { bool isSpirvAOT = TT.getSubArch() == llvm::Triple::SPIRSubArch_fpga || TT.getSubArch() == llvm::Triple::SPIRSubArch_gen || TT.getSubArch() == llvm::Triple::SPIRSubArch_x86_64; + const bool isSYCLNativeCPU = + TC->getAuxTriple() && + driver::isSYCLNativeCPU(TT, *TC->getAuxTriple()); for (const auto &Input : LI) { if (TT.getSubArch() == llvm::Triple::SPIRSubArch_fpga && types::isFPGA(Input->getType())) { @@ -5810,26 +5810,14 @@ class OffloadingActionBuilder final { } else FullDeviceLinkAction = FullLinkObject; - bool IsSYCLNativeCPU = isSYCLNativeCPU(Args); - if (IsSYCLNativeCPU) { - // for SYCL Native CPU, we just take the linked device - // modules, lower them to an object file , and link it to the host - // object file. 
- auto *backendAct = C.MakeAction( - FullDeviceLinkAction, types::TY_PP_Asm); - auto *asmAct = - C.MakeAction(backendAct, types::TY_Object); - DA.add(*asmAct, *TC, BoundArch, Action::OFK_SYCL); - return; - } - // reflects whether current target is ahead-of-time and can't // support runtime setting of specialization constants - bool isAOT = isNVPTX || isAMDGCN || isSpirvAOT; + bool isAOT = isNVPTX || isAMDGCN || isSpirvAOT || isSYCLNativeCPU; // post link is not optional - even if not splitting, always need to // process specialization constants - types::ID PostLinkOutType = isSPIR ? types::TY_Tempfiletable + types::ID PostLinkOutType = isSPIR || isSYCLNativeCPU + ? types::TY_Tempfiletable : types::TY_LLVM_BC; auto createPostLinkAction = [&]() { // For SPIR-V targets, force TY_Tempfiletable. @@ -5839,6 +5827,20 @@ class OffloadingActionBuilder final { return TypedPostLinkAction; }; Action *PostLinkAction = createPostLinkAction(); + if (isSYCLNativeCPU) { + // for SYCL Native CPU, we just take the linked device + // modules, lower them to an object file , and link it to the host + // object file. 
+ auto *backendAct = C.MakeAction( + FullDeviceLinkAction, types::TY_PP_Asm); + auto *asmAct = + C.MakeAction(backendAct, types::TY_Object); + DA.add(*asmAct, *TC, BoundArch, Action::OFK_SYCL); + auto *DeviceWrappingAction = C.MakeAction( + PostLinkAction, types::TY_Object); + DA.add(*DeviceWrappingAction, *TC, BoundArch, Action::OFK_SYCL); + continue; + } if (isNVPTX && Args.hasArg(options::OPT_fsycl_embed_ir)) { // When compiling for Nvidia/CUDA devices and the user requested the // IR to be embedded in the application (via option), run the output @@ -6212,17 +6214,8 @@ class OffloadingActionBuilder final { bool GpuInitHasErrors = false; bool HasSYCLTargetsOption = SYCLAddTargets || SYCLTargets || SYCLLinkTargets; - bool IsSYCLNativeCPU = isSYCLNativeCPU(C.getInputArgs()); - // check if multiple targets are passed along with native_cpu: - // currently native_cpu overrides all the other targets, so we emit a - // warning - if (IsSYCLNativeCPU) { - auto *SYCLTargets = Args.getLastArg(options::OPT_fsycl_targets_EQ); - if (SYCLTargets->getNumValues() > 1) - C.getDriver().Diag(clang::diag::warn_drv_sycl_native_cpu_and_targets); - } - if (!IsSYCLNativeCPU && HasSYCLTargetsOption) { + if (HasSYCLTargetsOption) { if (SYCLTargets || SYCLLinkTargets) { Arg *SYCLTargetsValues = SYCLTargets ? 
SYCLTargets : SYCLLinkTargets; // Fill SYCLTripleList @@ -6256,6 +6249,12 @@ class OffloadingActionBuilder final { C.getDriver().MakeSYCLDeviceTriple("amdgcn-amd-amdhsa"), ValidDevice->data()); UserTargetName = "amdgcn-amd-amdhsa"; + } else if (Val == "native_cpu") { + const ToolChain *HostTC = + C.getSingleOffloadToolChain(); + llvm::Triple TT = HostTC->getTriple(); + SYCLTripleList.push_back(TT); + continue; } llvm::Triple TT(C.getDriver().MakeSYCLDeviceTriple(Val)); @@ -6342,14 +6341,6 @@ class OffloadingActionBuilder final { GpuArchList.emplace_back(TT, nullptr); } } - } else if (IsSYCLNativeCPU) { - const ToolChain *HostTC = - C.getSingleOffloadToolChain(); - llvm::Triple TT = HostTC->getTriple(); - auto TCIt = llvm::find_if( - ToolChains, [&](auto &TC) { return TT == TC->getTriple(); }); - SYCLTripleList.push_back(TT); - SYCLTargetInfoList.emplace_back(*TCIt, nullptr); } else if (HasValidSYCLRuntime) { // -fsycl is provided without -fsycl-*targets. bool SYCLfpga = C.getInputArgs().hasArg(options::OPT_fintelfpga); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4e60f396ff0ef..eb3aae227cca8 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4977,7 +4977,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, bool IsFPGASYCLOffloadDevice = IsSYCLOffloadDevice && Triple.getSubArch() == llvm::Triple::SPIRSubArch_fpga; - bool IsSYCLNativeCPU = isSYCLNativeCPU(Args); + const bool IsSYCLNativeCPU = isSYCLNativeCPU(TC, C.getDefaultToolChain()); // Perform the SYCL host compilation using an external compiler if the user // requested. 
@@ -5475,6 +5475,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-emit-obj"); CollectArgsForIntegratedAssembler(C, Args, CmdArgs, D); } + if (IsSYCLOffloadDevice && IsSYCLNativeCPU) { + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back("-sycl-native-cpu-rename"); + } // Also ignore explicit -force_cpusubtype_ALL option. (void)Args.hasArg(options::OPT_force__cpusubtype__ALL); @@ -9429,6 +9433,10 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA, TargetTripleOpt = ("llvm_" + TargetTripleOpt).str(); } + const bool IsSYCLNativeCPU = isSYCLNativeCPU(TC, C.getDefaultToolChain()); + if (IsSYCLNativeCPU) { + TargetTripleOpt = "native_cpu"; + } WrapperArgs.push_back( C.getArgs().MakeArgString(Twine("-target=") + TargetTripleOpt)); diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 7d2cf71619d79..faeed8936a4c3 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -641,65 +641,66 @@ void SYCL::gen::BackendCompiler::ConstructJob(Compilation &C, StringRef SYCL::gen::resolveGenDevice(StringRef DeviceName) { StringRef Device; - Device = llvm::StringSwitch(DeviceName) - .Cases("intel_gpu_bdw", "intel_gpu_8_0_0", "bdw") - .Cases("intel_gpu_skl", "intel_gpu_9_0_9", "skl") - .Cases("intel_gpu_kbl", "intel_gpu_9_1_9", "kbl") - .Cases("intel_gpu_cfl", "intel_gpu_9_2_9", "cfl") - .Cases("intel_gpu_apl", "intel_gpu_9_3_0", "apl") - .Cases("intel_gpu_glk", "intel_gpu_9_4_0", "glk") - .Cases("intel_gpu_whl", "intel_gpu_9_5_0", "whl") - .Cases("intel_gpu_aml", "intel_gpu_9_6_0", "aml") - .Cases("intel_gpu_cml", "intel_gpu_9_7_0", "cml") - .Cases("intel_gpu_icllp", "intel_gpu_11_0_0", "icllp") - .Cases("intel_gpu_tgllp", "intel_gpu_12_0_0", "tgllp") - .Case("intel_gpu_rkl", "rkl") - .Case("intel_gpu_adl_s", "adl_s") - .Case("intel_gpu_rpl_s", "rpl_s") - .Case("intel_gpu_adl_p", "adl_p") - .Case("intel_gpu_adl_n", "adl_n") - .Cases("intel_gpu_dg1", 
"intel_gpu_12_10_0", "dg1") - .Case("intel_gpu_acm_g10", "acm_g10") - .Case("intel_gpu_acm_g11", "acm_g11") - .Case("intel_gpu_acm_g12", "acm_g12") - .Case("intel_gpu_pvc", "pvc") - .Case("nvidia_gpu_sm_50", "sm_50") - .Case("nvidia_gpu_sm_52", "sm_52") - .Case("nvidia_gpu_sm_53", "sm_53") - .Case("nvidia_gpu_sm_60", "sm_60") - .Case("nvidia_gpu_sm_61", "sm_61") - .Case("nvidia_gpu_sm_62", "sm_62") - .Case("nvidia_gpu_sm_70", "sm_70") - .Case("nvidia_gpu_sm_72", "sm_72") - .Case("nvidia_gpu_sm_75", "sm_75") - .Case("nvidia_gpu_sm_80", "sm_80") - .Case("nvidia_gpu_sm_86", "sm_86") - .Case("nvidia_gpu_sm_87", "sm_87") - .Case("nvidia_gpu_sm_89", "sm_89") - .Case("nvidia_gpu_sm_90", "sm_90") - .Case("amd_gpu_gfx700", "gfx700") - .Case("amd_gpu_gfx701", "gfx701") - .Case("amd_gpu_gfx702", "gfx702") - .Case("amd_gpu_gfx801", "gfx801") - .Case("amd_gpu_gfx802", "gfx802") - .Case("amd_gpu_gfx803", "gfx803") - .Case("amd_gpu_gfx805", "gfx805") - .Case("amd_gpu_gfx810", "gfx810") - .Case("amd_gpu_gfx900", "gfx900") - .Case("amd_gpu_gfx902", "gfx902") - .Case("amd_gpu_gfx904", "gfx904") - .Case("amd_gpu_gfx906", "gfx906") - .Case("amd_gpu_gfx908", "gfx908") - .Case("amd_gpu_gfx90a", "gfx90a") - .Case("amd_gpu_gfx1010", "gfx1010") - .Case("amd_gpu_gfx1011", "gfx1011") - .Case("amd_gpu_gfx1012", "gfx1012") - .Case("amd_gpu_gfx1013", "gfx1013") - .Case("amd_gpu_gfx1030", "gfx1030") - .Case("amd_gpu_gfx1031", "gfx1031") - .Case("amd_gpu_gfx1032", "gfx1032") - .Case("amd_gpu_gfx1034", "gfx1034") - .Default(""); + Device = + llvm::StringSwitch(DeviceName) + .Cases("intel_gpu_bdw", "intel_gpu_8_0_0", "bdw") + .Cases("intel_gpu_skl", "intel_gpu_9_0_9", "skl") + .Cases("intel_gpu_kbl", "intel_gpu_9_1_9", "kbl") + .Cases("intel_gpu_cfl", "intel_gpu_9_2_9", "cfl") + .Cases("intel_gpu_apl", "intel_gpu_bxt", "intel_gpu_9_3_0", "apl") + .Cases("intel_gpu_glk", "intel_gpu_9_4_0", "glk") + .Cases("intel_gpu_whl", "intel_gpu_9_5_0", "whl") + .Cases("intel_gpu_aml", "intel_gpu_9_6_0", "aml") 
+ .Cases("intel_gpu_cml", "intel_gpu_9_7_0", "cml") + .Cases("intel_gpu_icllp", "intel_gpu_11_0_0", "icllp") + .Cases("intel_gpu_ehl", "intel_gpu_jsl", "ehl") + .Cases("intel_gpu_tgllp", "intel_gpu_12_0_0", "tgllp") + .Case("intel_gpu_rkl", "rkl") + .Cases("intel_gpu_adl_s", "intel_gpu_rpl_s", "adl_s") + .Case("intel_gpu_adl_p", "adl_p") + .Case("intel_gpu_adl_n", "adl_n") + .Cases("intel_gpu_dg1", "intel_gpu_12_10_0", "dg1") + .Cases("intel_gpu_acm_g10", "intel_gpu_dg2_g10", "acm_g10") + .Cases("intel_gpu_acm_g11", "intel_gpu_dg2_g11", "acm_g11") + .Cases("intel_gpu_acm_g12", "intel_gpu_dg2_g12", "acm_g12") + .Case("intel_gpu_pvc", "pvc") + .Case("nvidia_gpu_sm_50", "sm_50") + .Case("nvidia_gpu_sm_52", "sm_52") + .Case("nvidia_gpu_sm_53", "sm_53") + .Case("nvidia_gpu_sm_60", "sm_60") + .Case("nvidia_gpu_sm_61", "sm_61") + .Case("nvidia_gpu_sm_62", "sm_62") + .Case("nvidia_gpu_sm_70", "sm_70") + .Case("nvidia_gpu_sm_72", "sm_72") + .Case("nvidia_gpu_sm_75", "sm_75") + .Case("nvidia_gpu_sm_80", "sm_80") + .Case("nvidia_gpu_sm_86", "sm_86") + .Case("nvidia_gpu_sm_87", "sm_87") + .Case("nvidia_gpu_sm_89", "sm_89") + .Case("nvidia_gpu_sm_90", "sm_90") + .Case("amd_gpu_gfx700", "gfx700") + .Case("amd_gpu_gfx701", "gfx701") + .Case("amd_gpu_gfx702", "gfx702") + .Case("amd_gpu_gfx801", "gfx801") + .Case("amd_gpu_gfx802", "gfx802") + .Case("amd_gpu_gfx803", "gfx803") + .Case("amd_gpu_gfx805", "gfx805") + .Case("amd_gpu_gfx810", "gfx810") + .Case("amd_gpu_gfx900", "gfx900") + .Case("amd_gpu_gfx902", "gfx902") + .Case("amd_gpu_gfx904", "gfx904") + .Case("amd_gpu_gfx906", "gfx906") + .Case("amd_gpu_gfx908", "gfx908") + .Case("amd_gpu_gfx90a", "gfx90a") + .Case("amd_gpu_gfx1010", "gfx1010") + .Case("amd_gpu_gfx1011", "gfx1011") + .Case("amd_gpu_gfx1012", "gfx1012") + .Case("amd_gpu_gfx1013", "gfx1013") + .Case("amd_gpu_gfx1030", "gfx1030") + .Case("amd_gpu_gfx1031", "gfx1031") + .Case("amd_gpu_gfx1032", "gfx1032") + .Case("amd_gpu_gfx1034", "gfx1034") + .Default(""); return 
Device; } @@ -716,10 +717,10 @@ SmallString<64> SYCL::gen::getGenDeviceMacro(StringRef DeviceName) { .Case("aml", "INTEL_GPU_AML") .Case("cml", "INTEL_GPU_CML") .Case("icllp", "INTEL_GPU_ICLLP") + .Case("ehl", "INTEL_GPU_EHL") .Case("tgllp", "INTEL_GPU_TGLLP") .Case("rkl", "INTEL_GPU_RKL") .Case("adl_s", "INTEL_GPU_ADL_S") - .Case("rpl_s", "INTEL_GPU_RPL_S") .Case("adl_p", "INTEL_GPU_ADL_P") .Case("adl_n", "INTEL_GPU_ADL_N") .Case("dg1", "INTEL_GPU_DG1") @@ -810,7 +811,7 @@ void SYCL::x86_64::BackendCompiler::ConstructJob( SYCLToolChain::SYCLToolChain(const Driver &D, const llvm::Triple &Triple, const ToolChain &HostTC, const ArgList &Args) : ToolChain(D, Triple, Args), HostTC(HostTC), - IsSYCLNativeCPU(isSYCLNativeCPU(Args)) { + IsSYCLNativeCPU(Triple == HostTC.getTriple()) { // Lookup binaries into the driver directory, this is used to // discover the clang-offload-bundler executable. getProgramPaths().push_back(getDriver().Dir); diff --git a/clang/lib/Driver/ToolChains/SYCL.h b/clang/lib/Driver/ToolChains/SYCL.h index 7df2d760bb17c..2c39d5d1c4085 100644 --- a/clang/lib/Driver/ToolChains/SYCL.h +++ b/clang/lib/Driver/ToolChains/SYCL.h @@ -202,13 +202,13 @@ class LLVM_LIBRARY_VISIBILITY SYCLToolChain : public ToolChain { llvm::opt::ArgStringList &CC1Args) const override; const ToolChain &HostTC; + const bool IsSYCLNativeCPU; protected: Tool *buildBackendCompiler() const override; Tool *buildLinker() const override; private: - bool IsSYCLNativeCPU; void TranslateGPUTargetOpt(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, llvm::opt::OptSpecifier Opt_EQ) const; @@ -223,6 +223,14 @@ template bool isSYCLNativeCPU(const ArgListT &Args) { } return false; } + +inline bool isSYCLNativeCPU(const llvm::Triple HostT, const llvm::Triple DevT) { + return HostT == DevT; +} + +inline bool isSYCLNativeCPU(const ToolChain &TC1, const ToolChain &TC2) { + return isSYCLNativeCPU(TC1.getTriple(), TC2.getTriple()); +} } // end namespace driver } // end namespace 
clang diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index f512c5a9880cb..a9b7668a06c2c 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -4440,7 +4440,7 @@ SYCLIntelMaxGlobalWorkDimAttr *Sema::MergeSYCLIntelMaxGlobalWorkDimAttr( // to (1, 1, 1) in case the value of SYCLIntelMaxGlobalWorkDimAttr equals to // 0. const auto *MergeExpr = dyn_cast(A.getValue()); - if (MergeExpr->getResultAsAPSInt() == 0) { + if (MergeExpr && MergeExpr->getResultAsAPSInt() == 0) { if (checkWorkGroupSizeAttrExpr(*this, D, A) || checkWorkGroupSizeAttrExpr(*this, D, A)) @@ -4550,8 +4550,11 @@ static void handleSYCLIntelLoopFuseAttr(Sema &S, Decl *D, const ParsedAttr &A) { static void handleVecTypeHint(Sema &S, Decl *D, const ParsedAttr &AL) { // This attribute is deprecated without replacement in SYCL 2020 mode. - if (S.LangOpts.getSYCLVersion() > LangOptions::SYCL_2017) - S.Diag(AL.getLoc(), diag::warn_attribute_spelling_deprecated) << AL; + // Ignore the attribute in SYCL 2020. 
+ if (S.LangOpts.getSYCLVersion() > LangOptions::SYCL_2017) { + S.Diag(AL.getLoc(), diag::warn_attribute_deprecated_ignored) << AL; + return; + } // If the attribute is used with the [[sycl::vec_type_hint]] spelling in SYCL // 2017 mode, we want to warn about using the newer name in the older diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index ae27b08b2bf68..52be001737786 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -20,7 +20,6 @@ #include "clang/Basic/Attributes.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/Diagnostic.h" -#include "clang/Basic/SYCLNativeCPUHelpers.h" #include "clang/Basic/Version.h" #include "clang/Sema/Initialization.h" #include "clang/Sema/Sema.h" @@ -1028,15 +1027,6 @@ static QualType calculateKernelNameType(ASTContext &Ctx, return TAL->get(0).getAsType().getCanonicalType(); } -// Kernel names are currently mangled as type names which -// may collide (in the IR) with the "real" type names generated -// for RTTI etc when compiling host and device code together. -// Therefore the mangling of the kernel function is changed for -// NativeCPU to avoid such potential collision. -static void changeManglingForNativeCPU(std::string &Name) { - Name.append("_NativeCPUKernel"); -} - // Gets a name for the OpenCL kernel function, calculated from the first // template argument of the kernel caller function. static std::pair @@ -1054,12 +1044,18 @@ constructKernelName(Sema &S, const FunctionDecl *KernelCallerFunc, std::string StableName = SYCLUniqueStableNameExpr::ComputeName(S.getASTContext(), KernelNameType); - // When compiling for the SYCLNativeCPU device we need a C++ identifier - // as the kernel name and cannot use the name produced by some manglers - // including the MS mangler. + // For NativeCPU the kernel name is set to the stable GNU-mangled name + // because the default mangling may be different, for example on Windows. 
+ // This is needed for compiling kernels for multiple SYCL targets to ensure + // the same kernel name can be used for kernel lookup in different target + // binaries. This assumes that all SYCL targets use the same mangling + // produced for the stable name. + // Todo: Check if this assumption is valid, and if it would be better + // instead to always compile the NativeCPU device code in GNU mode which + // may cause issues when compiling headers with non-standard extensions + // written for compilers with different C++ ABIs (like MS VS). if (S.getLangOpts().SYCLIsNativeCPU) { MangledName = StableName; - changeManglingForNativeCPU(MangledName); } return {MangledName, StableName}; @@ -5682,16 +5678,6 @@ bool SYCLIntegrationFooter::emit(raw_ostream &OS) { } } - if (S.getLangOpts().SYCLIsNativeCPU) { - // This is a temporary workaround for the integration header file - // being emitted too early. - std::string HCName = getNativeCPUHeaderName(S.getLangOpts()); - - OS << "\n// including the kernel handlers calling the kernels\n"; - OS << "\n#include \""; - OS << HCName; - OS << "\"\n\n"; - } if (EmittedFirstSpecConstant) OS << "#include \n"; diff --git a/clang/test/CodeGenSYCL/native_cpu_basic.cpp b/clang/test/CodeGenSYCL/native_cpu_basic.cpp index 3ea3cab8fa06c..87552837cea4a 100644 --- a/clang/test/CodeGenSYCL/native_cpu_basic.cpp +++ b/clang/test/CodeGenSYCL/native_cpu_basic.cpp @@ -50,8 +50,8 @@ void gen() { } // Check name mangling -// CHECK-DAG: @_ZTS6init_aIiE_NativeCPUKernel_NativeCPUKernel({{.*}}) -// CHECK-DAG: @_ZTS6init_aIfE_NativeCPUKernel_NativeCPUKernel({{.*}}) +// CHECK-DAG: @_ZTS6init_aIiE.NativeCPUKernel({{.*}}) +// CHECK-DAG: @_ZTS6init_aIfE.NativeCPUKernel({{.*}}) // Check Native CPU module flag // CHECK-DAG: !{{[0-9]*}} = !{i32 1, !"is-native-cpu", i32 1} diff --git a/clang/test/Driver/sycl-native-cpu-fsycl.cpp b/clang/test/Driver/sycl-native-cpu-fsycl.cpp index 6646b29dc1d05..27b4598dbba1c 100644 --- a/clang/test/Driver/sycl-native-cpu-fsycl.cpp 
+++ b/clang/test/Driver/sycl-native-cpu-fsycl.cpp @@ -19,15 +19,20 @@ //CHECK_ACTIONS:| +- 11: linker, {5}, ir, (device-sycl) //CHECK_ACTIONS:| +- 12: backend, {11}, assembler, (device-sycl) //CHECK_ACTIONS:|- 13: assembler, {12}, object, (device-sycl) -//CHECK_ACTIONS:14: offload, "host-sycl ({{.*}})" {10}, "device-sycl ({{.*}})" {13}, image +//call sycl-post-link and clang-offload-wrapper +//CHECK_ACTIONS:| +- 14: sycl-post-link, {11}, tempfiletable, (device-sycl) +//CHECK_ACTIONS:|- 15: clang-offload-wrapper, {14}, object, (device-sycl) +//CHECK_ACTIONS:16: offload, "host-sycl ({{.*}})" {10}, "device-sycl ({{.*}})" {13}, "device-sycl ({{.*}})" {15}, image //CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["{{.*}}sycl-native-cpu-fsycl.cpp"], output: "[[KERNELIR:.*]].bc" //CHECK_BINDINGS:# "{{.*}}" - "SYCL::Linker", inputs: ["[[KERNELIR]].bc"], output: "[[KERNELLINK:.*]].bc" //CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["[[KERNELLINK]].bc"], output: "[[KERNELOBJ:.*]].o" +//CHECK_BINDINGS:# "{{.*}}" - "SYCL post link", inputs: ["[[KERNELLINK]].bc"], output: "[[TABLEFILE:.*]].table" +//CHECK_BINDINGS:# "{{.*}}" - "offload wrapper", inputs: ["[[TABLEFILE]].table"], output: "[[WRAPPEROBJ:.*]].o" //CHECK_BINDINGS:# "{{.*}}" - "Append Footer to source", inputs: ["{{.*}}sycl-native-cpu-fsycl.cpp"], output: "[[SRCWFOOTER:.*]].cpp" //CHECK_BINDINGS:# "{{.*}}" - "clang", inputs: ["[[SRCWFOOTER]].cpp", "[[KERNELIR]].bc"], output: "[[HOSTOBJ:.*]].o" -//CHECK_BINDINGS:# "{{.*}}" - "{{.*}}::Linker", inputs: ["[[HOSTOBJ]].o", "[[KERNELOBJ]].o"], output: "a.{{.*}}" +//CHECK_BINDINGS:# "{{.*}}" - "{{.*}}::Linker", inputs: ["[[HOSTOBJ]].o", "[[KERNELOBJ]].o", "[[WRAPPEROBJ]].o"], output: "a.{{.*}}" //CHECK_INVO:{{.*}}clang{{.*}}-fsycl-is-device{{.*}}"-fsycl-is-native-cpu" "-D" "__SYCL_NATIVE_CPU__" //CHECK_INVO:{{.*}}clang{{.*}}"-x" "ir" @@ -48,4 +53,6 @@ //CHECK_ACTIONS-AARCH64:| +- 11: linker, {5}, ir, (device-sycl) //CHECK_ACTIONS-AARCH64:| +- 12: backend, {11}, assembler, 
(device-sycl) //CHECK_ACTIONS-AARCH64:|- 13: assembler, {12}, object, (device-sycl) -//CHECK_ACTIONS-AARCH64:14: offload, "host-sycl (aarch64-unknown-linux-gnu)" {10}, "device-sycl (aarch64-unknown-linux-gnu)" {13}, image +//CHECK_ACTIONS-AARCH64:| +- 14: sycl-post-link, {11}, tempfiletable, (device-sycl) +//CHECK_ACTIONS-AARCH64:|- 15: clang-offload-wrapper, {14}, object, (device-sycl) +//CHECK_ACTIONS-AARCH64:16: offload, "host-sycl (aarch64-unknown-linux-gnu)" {10}, "device-sycl (aarch64-unknown-linux-gnu)" {13}, "device-sycl (aarch64-unknown-linux-gnu)" {15}, image diff --git a/clang/test/Driver/sycl-native-cpu-warn.cpp b/clang/test/Driver/sycl-native-cpu-warn.cpp deleted file mode 100644 index a5e249be86f0e..0000000000000 --- a/clang/test/Driver/sycl-native-cpu-warn.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets="native_cpu,spir64" -### %s 2>&1 | FileCheck %s - -// checks that we emit the correct warning when native_cpu is listed together with other sycl targets -// TODO: remove this test and the warning once native_cpu is supported alongside other targets -// CHECK: warning: -fsycl-targets=native_cpu overrides SYCL targets option [-Wsycl-native-cpu-targets] diff --git a/clang/test/Driver/sycl-oneapi-gpu.cpp b/clang/test/Driver/sycl-oneapi-gpu.cpp index 7be7c85f3c33b..5445f2bcf9173 100644 --- a/clang/test/Driver/sycl-oneapi-gpu.cpp +++ b/clang/test/Driver/sycl-oneapi-gpu.cpp @@ -20,6 +20,8 @@ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=apl -DMAC_STR=APL // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_9_3_0 -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=apl -DMAC_STR=APL +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_bxt -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=apl -DMAC_STR=APL // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_glk -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=glk -DMAC_STR=GLK // RUN: %clangxx 
-fsycl -fsycl-targets=intel_gpu_9_4_0 -### %s 2>&1 | \ @@ -42,6 +44,10 @@ // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_11_0_0 -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=icllp \ // RUN: -DMAC_STR=ICLLP +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_ehl -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=ehl -DMAC_STR=EHL +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_jsl -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=ehl -DMAC_STR=EHL // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=tgllp \ // RUN: -DMAC_STR=TGLLP @@ -54,8 +60,8 @@ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=adl_s \ // RUN: -DMAC_STR=ADL_S // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_rpl_s -### %s 2>&1 | \ -// RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=rpl_s \ -// RUN: -DMAC_STR=RPL_S +// RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=adl_s \ +// RUN: -DMAC_STR=ADL_S // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_adl_p -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=adl_p \ // RUN: -DMAC_STR=ADL_P @@ -69,12 +75,21 @@ // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_acm_g10 -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=acm_g10 \ // RUN: -DMAC_STR=ACM_G10 +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_dg2_g10 -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=acm_g10 \ +// RUN: -DMAC_STR=ACM_G10 // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_acm_g11 -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=acm_g11 \ // RUN: -DMAC_STR=ACM_G11 +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_dg2_g11 -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=acm_g11 \ +// RUN: -DMAC_STR=ACM_G11 // RUN: %clangxx -fsycl 
-fsycl-targets=intel_gpu_acm_g12 -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=acm_g12 \ // RUN: -DMAC_STR=ACM_G12 +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_dg2_g12 -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=acm_g12 \ +// RUN: -DMAC_STR=ACM_G12 // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -### %s 2>&1 | \ // RUN: FileCheck %s --check-prefixes=DEVICE,MACRO -DDEV_STR=pvc -DMAC_STR=PVC // MACRO: clang{{.*}} "-triple" "spir64_gen-unknown-unknown" diff --git a/clang/test/SemaSYCL/vec-type-hint-2.cpp b/clang/test/SemaSYCL/vec-type-hint-2.cpp new file mode 100644 index 0000000000000..820ab2bc011ee --- /dev/null +++ b/clang/test/SemaSYCL/vec-type-hint-2.cpp @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -fsycl-is-device -sycl-std=2020 -internal-isystem %S/Inputs -fsyntax-only -verify %s + +// Test which verifies [[sycl::vec_type_hint()]] is accepted +// with non-conforming lambda syntax. + +// NOTE: This attribute is not supported in the SYCL backends. +// To be minimally conformant with SYCL2020, attribute is +// accepted by the Clang FE with a warning. No additional +// semantic handling or IR generation is done for this +// attribute. + +#include "sycl.hpp" + +struct test {}; + +using namespace sycl; +queue q; + +void bar() { + q.submit([&](handler &h) { + h.single_task( + // expected-warning@+1 {{attribute 'vec_type_hint' is deprecated; attribute ignored}} + []() [[sycl::vec_type_hint(test)]] {}); + }); +} + diff --git a/clang/test/SemaSYCL/vec-type-hint.cpp b/clang/test/SemaSYCL/vec-type-hint.cpp index d72d0e5ec91b4..e9fccb4ae2928 100644 --- a/clang/test/SemaSYCL/vec-type-hint.cpp +++ b/clang/test/SemaSYCL/vec-type-hint.cpp @@ -7,4 +7,4 @@ // __attribute__((vec_type_hint)) is deprecated without replacement in SYCL // 2020 mode, but is allowed in SYCL 2017 and OpenCL modes. 
-KERNEL __attribute__((vec_type_hint(int))) void foo() {} // sycl-2020-warning {{attribute 'vec_type_hint' is deprecated}} +KERNEL __attribute__((vec_type_hint(int))) void foo() {} // sycl-2020-warning {{attribute 'vec_type_hint' is deprecated; attribute ignored}} diff --git a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp old mode 100644 new mode 100755 index c73d60936c00d..b0aedbb6d7519 --- a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp +++ b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp @@ -17,28 +17,33 @@ #include "clang/Basic/Version.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/TargetParser/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/TargetParser/Triple.h" #ifndef NDEBUG #include "llvm/IR/Verifier.h" #endif // NDEBUG #include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/SYCLLowerIR/SYCLUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" -#include "llvm/Support/LineIterator.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/LineIterator.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" @@ -598,6 +603,69 @@ class BinaryWrapper { return AutoGcBufs.back().get(); } + Function *addDeclarationForNativeCPU(StringRef Name) { + static FunctionType *NativeCPUFuncTy = FunctionType::get( + 
Type::getVoidTy(C), + {PointerType::getUnqual(C), PointerType::getUnqual(C)}, false); + static FunctionType *NativeCPUBuiltinTy = FunctionType::get( + PointerType::getUnqual(C), {PointerType::getUnqual(C)}, false); + FunctionType *FTy; + if (Name.starts_with("__dpcpp_nativecpu")) + FTy = NativeCPUBuiltinTy; + else + FTy = NativeCPUFuncTy; + auto FCalle = M.getOrInsertFunction( + sycl::utils::addSYCLNativeCPUSuffix(Name).str(), FTy); + Function *F = dyn_cast(FCalle.getCallee()); + if (F == nullptr) + report_fatal_error("Unexpected callee"); + return F; + } + + Expected> + addDeclarationsForNativeCPU(StringRef EntriesFile) { + Expected MBOrErr = loadFile(EntriesFile); + if (!MBOrErr) + return MBOrErr.takeError(); + MemoryBuffer *MB = *MBOrErr; + // the Native CPU PI Plug-in expects the BinaryStart field to point to an + // array of struct nativecpu_entry { + // char *kernelname; + // unsigned char *kernel_ptr; + // }; + StructType *NCPUEntryT = StructType::create( + {PointerType::getUnqual(C), PointerType::getUnqual(C)}, + "__nativecpu_entry"); + SmallVector NativeCPUEntries; + for (line_iterator LI(*MB); !LI.is_at_eof(); ++LI) { + auto *NewDecl = addDeclarationForNativeCPU(*LI); + NativeCPUEntries.push_back(ConstantStruct::get( + NCPUEntryT, + {addStringToModule(*LI, "__ncpu_function_name"), NewDecl})); + } + + // Add an empty entry that we use as end iterator + static auto *NativeCPUEndStr = + addStringToModule("__nativecpu_end", "__ncpu_end_str"); + auto *NullPtr = llvm::ConstantPointerNull::get(PointerType::getUnqual(C)); + NativeCPUEntries.push_back( + ConstantStruct::get(NCPUEntryT, {NativeCPUEndStr, NullPtr})); + + // Create the constant array containing the {kernel name, function pointers} + // pairs + ArrayType *ATy = ArrayType::get(NCPUEntryT, NativeCPUEntries.size()); + Constant *CA = ConstantArray::get(ATy, NativeCPUEntries); + auto *GVar = new GlobalVariable(M, CA->getType(), true, + GlobalVariable::InternalLinkage, CA, + "__sycl_native_cpu_decls"); + 
auto *Begin = ConstantExpr::getGetElementPtr(GVar->getValueType(), GVar, + getSizetConstPair(0u, 0u)); + auto *End = ConstantExpr::getGetElementPtr( + GVar->getValueType(), GVar, + getSizetConstPair(0u, NativeCPUEntries.size())); + return std::make_pair(Begin, End); + } + // Adds a global readonly variable that is initialized by given data to the // module. GlobalVariable *addGlobalArrayVariable(const Twine &Name, @@ -966,9 +1034,18 @@ class BinaryWrapper { // Adding ELF notes for STDIN is not supported yet. Bin = addELFNotes(Bin, Img.File); } - std::pair Fbin = addDeviceImageToModule( - ArrayRef(Bin->getBufferStart(), Bin->getBufferSize()), - Twine(OffloadKindTag) + Twine(ImgId) + Twine(".data"), Kind, Img.Tgt); + std::pair Fbin; + if (Img.Tgt == "native_cpu") { + auto FBinOrErr = addDeclarationsForNativeCPU(Img.EntriesFile); + if (!FBinOrErr) + return FBinOrErr.takeError(); + Fbin = *FBinOrErr; + } else { + Fbin = addDeviceImageToModule( + ArrayRef(Bin->getBufferStart(), Bin->getBufferSize()), + Twine(OffloadKindTag) + Twine(ImgId) + Twine(".data"), Kind, + Img.Tgt); + } if (Kind == OffloadKind::SYCL) { // For SYCL image offload entries are defined here, by wrapper, so diff --git a/llvm/include/llvm/SYCLLowerIR/EmitSYCLNativeCPUHeader.h b/llvm/include/llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h similarity index 50% rename from llvm/include/llvm/SYCLLowerIR/EmitSYCLNativeCPUHeader.h rename to llvm/include/llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h index cf8569193da05..086f9256ca7ea 100644 --- a/llvm/include/llvm/SYCLLowerIR/EmitSYCLNativeCPUHeader.h +++ b/llvm/include/llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h @@ -1,4 +1,4 @@ -//===---- EmitSYCLHCHeader.h - Emit SYCL Native CPU Helper Header Pass ----===// +//===-- RenameKernelSYCLNativeCPU.h - Kernel renaming for SYCL Native CPU--===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,10 +6,9 @@ // //===----------------------------------------------------------------------===// // -// This pass emits the SYCL Native CPU helper header. -// The header mainly contains the definition for the handler function which -// allows to call the kernel extracted by the device compiler from the host -// runtime. +// A transformation pass that renames the kernel names, making sure that the +// mangled name is a string with no particular semantics. +// //===----------------------------------------------------------------------===// #pragma once @@ -21,16 +20,10 @@ namespace llvm { class ModulePass; -class EmitSYCLNativeCPUHeaderPass - : public PassInfoMixin { +class RenameKernelSYCLNativeCPUPass + : public PassInfoMixin { public: PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); - EmitSYCLNativeCPUHeaderPass(const std::string &FileName) - : NativeCPUHeaderName(FileName) {} - EmitSYCLNativeCPUHeaderPass() = default; - -private: - std::string NativeCPUHeaderName; }; } // namespace llvm diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h b/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h index c9ebcdae53f4b..40450d291509c 100644 --- a/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h +++ b/llvm/include/llvm/SYCLLowerIR/SYCLUtils.h @@ -117,6 +117,12 @@ inline bool isSYCLExternalFunction(const Function *F) { return F->hasFnAttribute(ATTR_SYCL_MODULE_ID); } +constexpr char SYCLNATIVECPUSUFFIX[] = ".SYCLNCPU"; +inline llvm::Twine addSYCLNativeCPUSuffix(StringRef S) { + return llvm::Twine(S, SYCLNATIVECPUSUFFIX); +} +constexpr char SYCLNATIVECPURENAMEMD[] = "sycl-native-cpu-rename"; + } // namespace utils } // namespace sycl } // namespace llvm diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 255426bff2b88..0a7fba1b2fa59 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -70,7 +70,7 @@ add_llvm_component_library(LLVMSYCLLowerIR GlobalOffset.cpp TargetHelpers.cpp 
PrepareSYCLNativeCPU.cpp - EmitSYCLNativeCPUHeader.cpp + RenameKernelSYCLNativeCPU.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR diff --git a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp index e884ee8f2c57b..4d0e84091f107 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp @@ -63,6 +63,7 @@ static const char *LegalSYCLFunctions[] = { "^sycl::_V1::ext::oneapi::sub_group::.+", "^sycl::_V1::ext::oneapi::experimental::spec_constant<.+>::.+", "^sycl::_V1::ext::oneapi::experimental::this_sub_group", + "^sycl::_V1::ext::oneapi::experimental::uniform<.+>::.+", "^sycl::_V1::ext::oneapi::bfloat16::.+", "^sycl::_V1::ext::oneapi::experimental::if_architecture_is"}; diff --git a/llvm/lib/SYCLLowerIR/EmitSYCLNativeCPUHeader.cpp b/llvm/lib/SYCLLowerIR/EmitSYCLNativeCPUHeader.cpp deleted file mode 100644 index cc41483ff54a7..0000000000000 --- a/llvm/lib/SYCLLowerIR/EmitSYCLNativeCPUHeader.cpp +++ /dev/null @@ -1,141 +0,0 @@ -//===---- EmitSYCLHCHeader.cpp - Emit SYCL Native CPU Helper Header Pass --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Emits the SYCL Native CPU helper headers, containing the kernel definition -// and handlers. 
-//===----------------------------------------------------------------------===// - -#include "llvm/SYCLLowerIR/EmitSYCLNativeCPUHeader.h" - -#include "llvm/ADT/StringRef.h" -#include "llvm/IR/Constant.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/Type.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include -#include - -using namespace llvm; - -namespace { - -void emitSubKernelHandler(const Function *F, raw_ostream &O) { - O << "\nextern \"C\" void " << F->getName() << "subhandler("; - O << "const sycl::detail::NativeCPUArgDesc *MArgs, " - "__nativecpu_state *state);\n"; - return; -} - -// Todo: maybe we could use clang-offload-wrapper for this, -// the main thing that prevents use from using clang-offload-wrapper -// right now is the fact that we need the subhandler -// to figure out which args are used or not, and so the BinaryStart entry -// need to point to the subhandler, and I'm not sure how to do that in -// clang-offload-wrapper. If we figure out a better way to deal with unused -// kernel args, we can probably get rid of the subhandler and make BinaryStart -// point the the actual kernel function pointer, which should be doable in -// clang-offload-wrapper. 
-void emitSYCLRegisterLib(const Function *F, raw_ostream &O) { - auto KernelName = F->getName(); - std::string SubHandlerName = (KernelName + "subhandler").str(); - static const char *BinariesT = "pi_device_binaries_struct"; - static const char *BinaryT = "pi_device_binary_struct"; - static const char *OffloadEntryT = "_pi_offload_entry_struct"; - std::string Binaries = (BinariesT + KernelName).str(); - std::string Binary = (BinaryT + KernelName).str(); - std::string OffloadEntry = (OffloadEntryT + KernelName).str(); - // Fill in the offload entry struct for this kernel - O << "static " << OffloadEntryT << " " << OffloadEntry << "{" - << "(void*)&" << SubHandlerName << ", " // addr - << "const_cast(\"" << KernelName << "\"), " // name - << "1, " // size - << "0, " // flags - << "0 " // reserved - << "};\n"; - // Fill in the binary struct - O << "static " << BinaryT << " " << Binary << "{" - << "0, " // Version - << "4, " // Kind - << "0, " // Format - << "__SYCL_PI_DEVICE_BINARY_TARGET_UNKNOWN, " // Device target spec - << "nullptr, " // Compile options - << "nullptr, " // Link options - << "nullptr, " // Manifest start - << "nullptr, " // Manifest end - << "(unsigned char*)&" << SubHandlerName << ", " // BinaryStart - << "(unsigned char*)&" << SubHandlerName << " + 1, " // BinaryEnd - << "&" << OffloadEntry << ", " // EntriesBegin - << "&" << OffloadEntry << "+1, " // EntriesEnd - << "nullptr, " // PropertySetsBegin - << "nullptr " // PropertySetsEnd - << "};\n"; - // Fill in the binaries struct - O << "static " << BinariesT << " " << Binaries << "{" - << "0, " // Version - << "1, " // NumDeviceBinaries - << "&" << Binary << ", " // DeviceBinaries - << "nullptr, " // HostEntriesBegin - << "nullptr " // HostEntriesEnd - << "};\n"; - - // Define a struct and use its constructor to call __sycl_register_lib - std::string InitNativeCPU = ("init_native_cpu" + KernelName).str(); - std::string InitNativeCPUT = InitNativeCPU + "_t"; - O << "struct " << InitNativeCPUT << 
"{\n" - << "\t" << InitNativeCPUT << "(){\n" - << "\t\t" - << "__sycl_register_lib(&" << Binaries << ");\n" - << "\t}\n" - << "};\n" - << "static " << InitNativeCPUT << " " << InitNativeCPU << ";\n"; -} - -} // namespace - -PreservedAnalyses EmitSYCLNativeCPUHeaderPass::run(Module &M, - ModuleAnalysisManager &MAM) { - SmallVector Kernels; - for (auto &F : M) { - if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL) - Kernels.push_back(&F); - } - - // Emit native CPU helper header - if (NativeCPUHeaderName == "") { - report_fatal_error("No file name for Native CPU helper header specified", - false); - } - int HCHeaderFD = 0; - std::error_code EC = - llvm::sys::fs::openFileForWrite(NativeCPUHeaderName, HCHeaderFD); - if (EC) { - report_fatal_error(StringRef(EC.message()), false); - } - llvm::raw_fd_ostream O(HCHeaderFD, true); - O << "#pragma once\n"; - O << "#include \n"; - O << "#include \n"; - O << "extern \"C\" void __sycl_register_lib(pi_device_binaries desc);\n"; - - for (auto *F : Kernels) { - emitSubKernelHandler(F, O); - emitSYCLRegisterLib(F, O); - } - - return PreservedAnalyses::all(); -} diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp index c6d199dde5f8d..d80973cd3f55f 100644 --- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SYCLLowerIR/PrepareSYCLNativeCPU.h" +#include "llvm/SYCLLowerIR/SYCLUtils.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -103,7 +104,15 @@ void emitSubkernelForKernel(Function *F, Type *NativeCPUArgDescType, Type *NativeCPUArgDescPtrType = PointerType::getUnqual(NativeCPUArgDescType); // Create function signature - const std::string SubHandlerName = F->getName().str() + "subhandler"; + // Todo: we need to ensure that the kernel name is not mangled as a type + // name, otherwise this may 
lead to runtime failures due to *weird* + // codegen/linking behaviour, we change the name of the kernel, and the + // subhandler steals its name, we add a suffix to the subhandler later + // on when lowering the device module + std::string OldName = F->getName().str(); + std::string NewName = OldName + ".NativeCPUKernel"; + const auto SubHandlerName = OldName; + F->setName(NewName); FunctionType *FTy = FunctionType::get( Type::getVoidTy(Ctx), {NativeCPUArgDescPtrType, StatePtrType}, false); auto SubhFCallee = F->getParent()->getOrInsertFunction(SubHandlerName, FTy); @@ -142,6 +151,15 @@ void emitSubkernelForKernel(Function *F, Type *NativeCPUArgDescType, KernelArgs.push_back(SubhF->getArg(1)); Builder.CreateCall(KernelTy, F, KernelArgs); Builder.CreateRetVoid(); + + // Add sycl-module-id attribute + // Todo: we may want to copy other attributes to the subhandler, + // but we can't simply use setAttributes(F->getAttributes) since + // the function signatures are different + if (F->hasFnAttribute(sycl::utils::ATTR_SYCL_MODULE_ID)) { + Attribute MId = F->getFnAttribute(sycl::utils::ATTR_SYCL_MODULE_ID); + SubhF->addFnAttr("sycl-module-id", MId.getValueAsString()); + } } // Clones the function and returns a new function with a new argument on type T @@ -252,8 +270,6 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, StructType::create({PointerType::getUnqual(M.getContext())}); for (auto &NewK : NewKernels) { emitSubkernelForKernel(NewK, NativeCPUArgDescType, StatePtrType); - std::string NewName = NewK->getName().str() + "_NativeCPUKernel"; - NewK->setName(NewName); } // Then we iterate over all the supported builtins, find their uses and diff --git a/llvm/lib/SYCLLowerIR/RenameKernelSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/RenameKernelSYCLNativeCPU.cpp new file mode 100644 index 0000000000000..c96b1c091c547 --- /dev/null +++ b/llvm/lib/SYCLLowerIR/RenameKernelSYCLNativeCPU.cpp @@ -0,0 +1,28 @@ +//===- RenameKernelSYCLNativeCPU.cpp - Kernel renaming for SYCL 
Native CPU-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A transformation pass that renames the kernel names, to ensure the name +// doesn't clash with other names. +// +//===----------------------------------------------------------------------===// + +#include "llvm/SYCLLowerIR/RenameKernelSYCLNativeCPU.h" +#include "llvm/SYCLLowerIR/SYCLUtils.h" + +using namespace llvm; + +PreservedAnalyses +RenameKernelSYCLNativeCPUPass::run(Module &M, ModuleAnalysisManager &MAM) { + bool ModuleChanged = false; + for (auto &F : M) { + if (F.hasFnAttribute(sycl::utils::ATTR_SYCL_MODULE_ID)) { + F.setName(sycl::utils::addSYCLNativeCPUSuffix(F.getName())); + } + } + return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp index 4b37c267f2353..f43cf98f66ca8 100644 --- a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp @@ -316,11 +316,10 @@ getAspectUsageChain(const Function *F, const FunctionToAspectsMapTy &AspectsMap, } void createUsedAspectsMetadataForFunctions( - FunctionToAspectsMapTy &Map, const AspectsSetTy &ExcludeAspectVals) { - for (auto &[F, Aspects] : Map) { - if (Aspects.empty()) - continue; - + FunctionToAspectsMapTy &FunctionToUsedAspects, + FunctionToAspectsMapTy &FunctionToDeclaredAspects, + const AspectsSetTy &ExcludeAspectVals) { + for (auto &[F, Aspects] : FunctionToUsedAspects) { LLVMContext &C = F->getContext(); // Create a set of unique aspects. 
First we add the ones from the found @@ -330,6 +329,11 @@ void createUsedAspectsMetadataForFunctions( if (!ExcludeAspectVals.contains(A)) UniqueAspects.insert(A); + // The aspects that were propagated via declared aspects are always + // added to the metadata. + for (const int &A : FunctionToDeclaredAspects[F]) + UniqueAspects.insert(A); + // If there are no new aspects, we can just keep the old metadata. if (UniqueAspects.empty()) continue; @@ -547,7 +551,7 @@ void setSyclFixedTargetsMD(const std::vector &EntryPoints, } /// Returns a map of functions with corresponding used aspects. -FunctionToAspectsMapTy +std::pair buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects, const AspectValueToNameMapTy &AspectValues, const std::vector &EntryPoints, @@ -575,10 +579,9 @@ buildFunctionsToAspectsMap(Module &M, TypeToAspectsMapTy &TypesWithAspects, Visited.clear(); for (Function *F : EntryPoints) propagateAspectsThroughCG(F, CG, FunctionToDeclaredAspects, Visited); - for (const auto &It : FunctionToDeclaredAspects) - FunctionToUsedAspects[It.first].insert(It.second.begin(), It.second.end()); - return FunctionToUsedAspects; + return {std::move(FunctionToUsedAspects), + std::move(FunctionToDeclaredAspects)}; } } // anonymous namespace @@ -617,8 +620,9 @@ SYCLPropagateAspectsUsagePass::run(Module &M, ModuleAnalysisManager &MAM) { propagateAspectsToOtherTypesInModule(M, TypesWithAspects, AspectValues); - FunctionToAspectsMapTy FunctionToUsedAspects = buildFunctionsToAspectsMap( - M, TypesWithAspects, AspectValues, EntryPoints, ValidateAspectUsage); + auto [FunctionToUsedAspects, FunctionToDeclaredAspects] = + buildFunctionsToAspectsMap(M, TypesWithAspects, AspectValues, EntryPoints, + ValidateAspectUsage); // Create a set of excluded aspect values. 
AspectsSetTy ExcludedAspectVals; @@ -629,8 +633,8 @@ SYCLPropagateAspectsUsagePass::run(Module &M, ModuleAnalysisManager &MAM) { ExcludedAspectVals.insert(AspectValIter->second); } - createUsedAspectsMetadataForFunctions(FunctionToUsedAspects, - ExcludedAspectVals); + createUsedAspectsMetadataForFunctions( + FunctionToUsedAspects, FunctionToDeclaredAspects, ExcludedAspectVals); setSyclFixedTargetsMD(EntryPoints, TargetFixedAspects, AspectValues); diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll index 59c7964cd60ad..223ef0e803287 100644 --- a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/exclude-aspect.ll @@ -47,34 +47,38 @@ define spir_kernel void @kernel1() { ret void } -; funcE should get none of its explicitly declared aspects in its +; funcE should get its explicitly declared aspects even if excluded ; sycl_used_aspects -; CHECK: define spir_func void @funcE() !sycl_declared_aspects ![[#DA1:]] { +; CHECK: define spir_func void @funcE() !sycl_declared_aspects ![[#DA1:]] +; CHECK-SAME: !sycl_used_aspects ![[#DA1]] { define spir_func void @funcE() !sycl_declared_aspects !10 { ret void } ; funcF should have the same aspects as funcE -; CHECK-NOT: define spir_func void @funcF() {{.*}} !sycl_used_aspects +; CHECK: define spir_func void @funcF() !sycl_used_aspects ![[#DA1]] { define spir_func void @funcF() { call spir_func void @funcE() ret void } -; funcG only keeps one aspect, the rest are excluded -; CHECK: define spir_func void @funcG() !sycl_declared_aspects ![[#DA2:]] !sycl_used_aspects ![[#ID3:]] +; aspect1 is used but excluded, aspect2 and aspect4 are declared, so +; attached metadata is aspect2 and aspect4 +; CHECK: define spir_func void @funcG() !sycl_declared_aspects ![[#DA2:]] +; CHECK-SAME: !sycl_used_aspects ![[#DA2]] { define spir_func void @funcG() !sycl_declared_aspects !11 { + %tmp = alloca %B ret 
void } ; funcH should have the same aspects as funcG -; CHECK: define spir_func void @funcH() !sycl_used_aspects ![[#ID3]] +; CHECK: define spir_func void @funcH() !sycl_used_aspects ![[#DA2]] define spir_func void @funcH() { call spir_func void @funcG() ret void } -; CHECK: define spir_kernel void @kernel2() !sycl_used_aspects ![[#ID3]] +; CHECK: define spir_kernel void @kernel2() !sycl_used_aspects ![[#ID5:]] define spir_kernel void @kernel2() { call spir_func void @funcF() call spir_func void @funcH() @@ -100,7 +104,7 @@ define spir_func void @funcK() !sycl_used_aspects !11 { ret void } -; CHECK: define spir_func void @funcL() !sycl_used_aspects ![[#ID3]] +; CHECK: define spir_func void @funcL() !sycl_used_aspects ![[#ID3:]] define spir_func void @funcL() { call spir_func void @funcK() ret void @@ -128,12 +132,12 @@ define spir_kernel void @kernel3() { !9 = !{!"fp64", i32 5} !10 = !{i32 1} -!11 = !{i32 4, i32 2, i32 1} +!11 = !{i32 4, i32 2} ; CHECK-DAG: ![[#DA1]] = !{i32 1} -; CHECK-DAG: ![[#DA2]] = !{i32 4, i32 2, i32 1} +; CHECK-DAG: ![[#DA2]] = !{i32 4, i32 2} ; CHECK-DAG: ![[#ID0]] = !{i32 0} ; CHECK-DAG: ![[#ID1]] = !{i32 2, i32 0} ; CHECK-DAG: ![[#ID2]] = !{i32 0, i32 2, i32 3} ; CHECK-DAG: ![[#ID3]] = !{i32 2} -; CHECK-DAG: ![[#ID4]] = !{i32 2, i32 4, i32 1} +; CHECK-DAG: ![[#ID4]] = !{i32 2, i32 4} diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 6f5957ee467f6..9e53dd35b5cd4 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -193,30 +193,39 @@ include(AddBoostMp11Headers) file(GLOB_RECURSE HEADERS_IN_SYCL_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/sycl/*") file(GLOB_RECURSE HEADERS_IN_CL_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/CL/*") file(GLOB_RECURSE HEADERS_IN_STD_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/std/*") +file(GLOB_RECURSE HEADERS_IN_SYCLCOMPAT_DIR CONFIGURE_DEPENDS "${sycl_inc_dir}/syclcompat/*" "${sycl_inc_dir}/syclcompat.hpp") + string(REPLACE "${sycl_inc_dir}" "${SYCL_INCLUDE_BUILD_DIR}" OUT_HEADERS_IN_SYCL_DIR 
"${HEADERS_IN_SYCL_DIR}") string(REPLACE "${sycl_inc_dir}/CL" "${SYCL_INCLUDE_BUILD_DIR}/sycl/CL" OUT_HEADERS_IN_CL_DIR "${HEADERS_IN_CL_DIR}") string(REPLACE "${sycl_inc_dir}" "${SYCL_INCLUDE_BUILD_DIR}" OUT_HEADERS_IN_STD_DIR "${HEADERS_IN_STD_DIR}") +string(REPLACE "${sycl_inc_dir}" "${SYCL_INCLUDE_BUILD_DIR}" + OUT_HEADERS_IN_SYCLCOMPAT_DIR "${HEADERS_IN_SYCLCOMPAT_DIR}") # Copy SYCL headers from sources to build directory add_custom_target(sycl-headers DEPENDS ${OUT_HEADERS_IN_SYCL_DIR} ${OUT_HEADERS_IN_CL_DIR} ${OUT_HEADERS_IN_STD_DIR} + ${OUT_HEADERS_IN_SYCLCOMPAT_DIR} boost_mp11-headers) add_custom_command( OUTPUT ${OUT_HEADERS_IN_SYCL_DIR} ${OUT_HEADERS_IN_CL_DIR} ${OUT_HEADERS_IN_STD_DIR} + ${OUT_HEADERS_IN_SYCLCOMPAT_DIR} DEPENDS ${HEADERS_IN_SYCL_DIR} ${HEADERS_IN_CL_DIR} ${HEADERS_IN_STD_DIR} + ${HEADERS_IN_SYCLCOMPAT_DIR} COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/sycl ${SYCL_INCLUDE_BUILD_DIR}/sycl COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/CL ${SYCL_INCLUDE_BUILD_DIR}/sycl/CL COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/std ${SYCL_INCLUDE_BUILD_DIR}/std + COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir}/syclcompat ${SYCL_INCLUDE_BUILD_DIR}/syclcompat + COMMAND ${CMAKE_COMMAND} -E copy ${sycl_inc_dir}/syclcompat.hpp ${SYCL_INCLUDE_BUILD_DIR}/syclcompat.hpp COMMENT "Copying SYCL headers ...") # Copy SYCL headers from source to install directory @@ -224,6 +233,8 @@ install(DIRECTORY "${sycl_inc_dir}/sycl" DESTINATION ${SYCL_INCLUDE_DIR} COMPONE install(DIRECTORY "${sycl_inc_dir}/CL" DESTINATION ${SYCL_INCLUDE_DIR}/sycl COMPONENT sycl-headers) install(DIRECTORY "${sycl_inc_dir}/std" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers) install(DIRECTORY ${BOOST_MP11_DESTINATION_DIR} DESTINATION ${SYCL_INCLUDE_DIR}/sycl/detail COMPONENT boost_mp11-headers) +install(DIRECTORY "${sycl_inc_dir}/syclcompat" DESTINATION ${SYCL_INCLUDE_DIR}/syclcompat COMPONENT sycl-headers) +install(FILES 
"${sycl_inc_dir}/syclcompat.hpp" DESTINATION ${SYCL_INCLUDE_DIR} COMPONENT sycl-headers) if (WIN32) set(SYCL_RT_LIBS sycl${SYCL_MAJOR_VERSION}) diff --git a/sycl/cmake/modules/AddSYCLExecutable.cmake b/sycl/cmake/modules/AddSYCLExecutable.cmake index 69a666aaf3430..4aa3ffbbf5119 100644 --- a/sycl/cmake/modules/AddSYCLExecutable.cmake +++ b/sycl/cmake/modules/AddSYCLExecutable.cmake @@ -36,7 +36,10 @@ macro(add_sycl_executable ARG_TARGET_NAME) COMMAND_EXPAND_LISTS) add_dependencies(${ARG_TARGET_NAME}_exec sycl-toolchain) foreach(_lib ${ARG_LIBRARIES}) - add_dependencies(${ARG_TARGET_NAME}_exec _lib) + # Avoid errors when linking external targets such as dl + if(TARGET ${_lib}) + add_dependencies(${ARG_TARGET_NAME}_exec ${_lib}) + endif() endforeach() foreach(_dep ${ARG_DEPENDANTS}) diff --git a/sycl/cmake/modules/AddSYCLLibraryUnitTest.cmake b/sycl/cmake/modules/AddSYCLLibraryUnitTest.cmake new file mode 100644 index 0000000000000..291d55cc107ce --- /dev/null +++ b/sycl/cmake/modules/AddSYCLLibraryUnitTest.cmake @@ -0,0 +1,91 @@ +# add_sycl_library_unittest(test_suite_name sycl_extra_flags +# file1.cpp file2.cpp ...) +# +# sycl_extra_flags: Clang extra compiler flags, e.g. +# "-fsycl-unnamed-lambdas;-fsycl-device-code-split" +# +# Will compile the list of files together using clang. +# Produces a single binary using all the .cpp files +# named 'test_suite_name' at ${CMAKE_CURRENT_BINARY_DIR}. 
+macro(add_sycl_library_unittest test_suite_name) + cmake_parse_arguments(ARG + "" + "" + "SYCL_EXTRA_FLAGS;SOURCES" + ${ARGN}) + + set(CXX_COMPILER clang++) + if(MSVC) + set(CXX_COMPILER clang-cl.exe) + endif() + + set(DEVICE_COMPILER_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CXX_COMPILER}) + set(_OUTPUT_BIN ${CMAKE_CURRENT_BINARY_DIR}/${test_suite_name}Tests) + set(_TESTS_TARGET ${test_suite_name}Tests) + set(_BIN_TARGET ${_TESTS_TARGET}_bin) + set(_LLVM_TARGET_DEPENDENCIES + "llvm_gtest_main;llvm_gtest;LLVMTestingSupport;LLVMSupport;LLVMDemangle") + + foreach(_lib ${_LLVM_TARGET_DEPENDENCIES}) + list(APPEND _LIBRARIES $) + endforeach() + + # Enable exception handling on Windows + # Appends extra libraries not available in LIBPATH + if(WIN32) + set(_INTERNAL_LINKER_FLAGS /link /SUBSYSTEM:CONSOLE) + list(APPEND _INTERNAL_EXTRA_FLAGS "/EHs") + list(APPEND _LIBRARIES $) + list(APPEND _LIBRARIES ${LLVM_LIBRARY_OUTPUT_INTDIR}/sycl-devicelib-host.lib) + endif() + + if(UNIX) + foreach(_lib "pthread" "dl" "ncurses") + list(APPEND _LIBRARIES "-l${_lib}") + endforeach() + endif() + + get_target_property(GTEST_INCLUDES llvm_gtest INCLUDE_DIRECTORIES) + foreach(_dir ${GTEST_INCLUDES}) + # Avoid -I when _dir contains an empty generator expression. 
+ list(APPEND INCLUDE_COMPILER_STRING "$<$:-I${_dir}>") + endforeach() + + add_custom_target(${_BIN_TARGET} + COMMAND ${DEVICE_COMPILER_EXECUTABLE} -fsycl ${ARG_SOURCES} + -o ${_OUTPUT_BIN} + ${ARG_SYCL_EXTRA_FLAGS} + ${_INTERNAL_EXTRA_FLAGS} + ${INCLUDE_COMPILER_STRING} + ${_LIBRARIES} + ${_INTERNAL_LINKER_FLAGS} + BYPRODUCTS ${CMAKE_CURRENT_BINARY_DIR}/${_TESTS_TARGET} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMAND_EXPAND_LISTS) + + add_dependencies(${_BIN_TARGET} sycl) + foreach(_lib ${ARG_LIBRARIES}) + add_dependencies(${_BIN_TARGET} ${_TARGET_DEPENDENCIES}) + endforeach() + + add_dependencies(SYCLUnitTests ${_BIN_TARGET}) + + add_executable(${_TESTS_TARGET} IMPORTED GLOBAL) + set_target_properties(${_TESTS_TARGET} PROPERTIES + IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}) + + # Check target for Linux + if (UNIX) + add_custom_target(check-${test_suite_name} + ${CMAKE_COMMAND} -E + env LD_LIBRARY_PATH="${CMAKE_BINARY_DIR}/lib" + env SYCL_CONFIG_FILE_NAME=null.cfg + env SYCL_DEVICELIB_NO_FALLBACK=1 + env SYCL_CACHE_DIR="${CMAKE_BINARY_DIR}/sycl_cache" + ${CMAKE_CURRENT_BINARY_DIR}/${_TESTS_TARGET} + ) + add_dependencies(check-${test_suite_name} ${_BIN_TARGET}) + add_dependencies(check-sycl-unittests-libs check-${test_suite_name}) + endif() + +endmacro() diff --git a/sycl/doc/UsersManual.md b/sycl/doc/UsersManual.md index 9e4f20ef2fccc..41c16ff59b6c2 100644 --- a/sycl/doc/UsersManual.md +++ b/sycl/doc/UsersManual.md @@ -40,21 +40,24 @@ and not recommended to use in production environment. support are accepted, providing a streamlined interface for AOT. Only one of these values at a time is supported. 
* intel_gpu_pvc - Ponte Vecchio Intel graphics architecture - * intel_gpu_acm_g12 - Alchemist G12 Intel graphics architecture - * intel_gpu_acm_g11 - Alchemist G11 Intel graphics architecture - * intel_gpu_acm_g10 - Alchemist G10 Intel graphics architecture + * intel_gpu_acm_g12, intel_gpu_dg2_g12 - Alchemist G12 Intel graphics architecture + * intel_gpu_acm_g11, intel_gpu_dg2_g11 - Alchemist G11 Intel graphics architecture + * intel_gpu_acm_g10, intel_gpu_dg2_g10 - Alchemist G10 Intel graphics architecture * intel_gpu_dg1, intel_gpu_12_10_0 - DG1 Intel graphics architecture * intel_gpu_adl_n - Alder Lake N Intel graphics architecture * intel_gpu_adl_p - Alder Lake P Intel graphics architecture - * intel_gpu_rpl_s - Raptor Lake Intel graphics architecture + * intel_gpu_rpl_s - Raptor Lake Intel graphics architecture (equal to intel_gpu_adl_s) * intel_gpu_adl_s - Alder Lake S Intel graphics architecture * intel_gpu_rkl - Rocket Lake Intel graphics architecture * intel_gpu_tgllp, intel_gpu_12_0_0 - Tiger Lake Intel graphics architecture + * intel_gpu_jsl - Jasper Lake Intel graphics architecture (equal to intel_gpu_ehl) + * intel_gpu_ehl - Elkhart Lake Intel graphics architecture * intel_gpu_icllp, intel_gpu_11_0_0 - Ice Lake Intel graphics architecture * intel_gpu_cml, intel_gpu_9_7_0 - Comet Lake Intel graphics architecture * intel_gpu_aml, intel_gpu_9_6_0 - Amber Lake Intel graphics architecture * intel_gpu_whl, intel_gpu_9_5_0 - Whiskey Lake Intel graphics architecture * intel_gpu_glk, intel_gpu_9_4_0 - Gemini Lake Intel graphics architecture + * intel_gpu_bxt - Broxton Intel graphics architecture (equal to intel_gpu_apl) * intel_gpu_apl, intel_gpu_9_3_0 - Apollo Lake Intel graphics architecture * intel_gpu_cfl, intel_gpu_9_2_9 - Coffee Lake Intel graphics architecture * intel_gpu_kbl, intel_gpu_9_1_9 - Kaby Lake Intel graphics architecture diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md index 39930d34fd04b..7daa8f1c9a633 100644 
--- a/sycl/doc/design/SYCLNativeCPU.md +++ b/sycl/doc/design/SYCLNativeCPU.md @@ -93,20 +93,19 @@ entry: } ``` -For the Native CPU target, the device compiler needs to perform two main operations: -* Materialize the SPIRV builtins (such as `@__spirv_BuiltInGlobalInvocationId`), so that they can be correctly updated by the runtime when executing the kernel. This is performed by the [PrepareSYCLNativeCPU pass](llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp). -* Allow the SYCL runtime to call the kernel, by registering it to the SYCL runtime, operation performed by the [EmitSYCLNativeCPUHeader pass](llvm/lib/SYCLLowerIR/EmitSYCLNativeCPUHeader.cpp). +For the Native CPU target, the device compiler is in charge of materializing the SPIRV builtins (such as `@__spirv_BuiltInGlobalInvocationId`), so that they can be correctly updated by the runtime when executing the kernel. This is performed by the [PrepareSYCLNativeCPU pass](llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp). +The PrepareSYCLNativeCPUPass also emits a `subhandler` function, which receives the kernel arguments from the SYCL runtime (packed in a vector), unpacks them, and forwards only the used ones to the actual kernel. ## PrepareSYCLNativeCPU Pass -This pass will add a pointer to a `nativecpu_state` struct as kernel argument to all the kernel functions, and it will replace all the uses of SPIRV builtins with the return value of appropriately defined functions, which will read the requested information from the `nativecpu_state` struct. The `nativecpu_state` struct and the builtin functions are defined in [native_cpu.hpp](sycl/include/sycl/detail/native_cpu.hpp). +This pass will add a pointer to a `nativecpu_state` struct as kernel argument to all the kernel functions, and it will replace all the uses of SPIRV builtins with the return value of appropriately defined functions, which will read the requested information from the `__nativecpu_state` struct. 
The `__nativecpu_state` struct and the builtin functions are defined in [native_cpu.hpp](sycl/include/sycl/detail/native_cpu.hpp). The resulting IR is: ```llvm -define weak dso_local void @_Z6Sample(ptr noundef align 4 %0, ptr noundef align 4 %1, ptr noundef align 4 %2, ptr %3) local_unnamed_addr #3 !srcloc !74 !kernel_arg_buffer_location !75 !kernel_arg_type !76 !sycl_fixed_targets !49 !sycl_kernel_omit_args !77 { +define weak dso_local void @_Z6Sample.NativeCPUKernel(ptr noundef align 4 %0, ptr noundef align 4 %1, ptr noundef align 4 %2, ptr %3) local_unnamed_addr #3 !srcloc !74 !kernel_arg_buffer_location !75 !kernel_arg_type !76 !sycl_fixed_targets !49 !sycl_kernel_omit_args !77 { entry: %ncpu_builtin = call ptr @_Z13get_global_idmP15nativecpu_state(ptr %3) %4 = load i64, ptr %ncpu_builtin, align 32, !noalias !78 @@ -122,11 +121,12 @@ entry: ret void } ``` -This pass will also set the correct calling convention for the target, and handle calling convention-related function attributes, allowing to call the kernel from the runtime. \\ -Additionally, this pass emits the definition for a `subhandler` function, which unpacks the vector of kernel arguments coming from the SYCL runtime, and forwards only the used arguments to the kernel. For our example the `subhandler` IR is: +This pass will also set the correct calling convention for the target, and handle calling convention-related function attributes, allowing to call the kernel from the runtime. 
+ +The `subhandler` for the Native CPU kernel looks like: ```llvm -define weak void @_Z6Samplesubhandler(ptr %0, ptr %1) #4 { +define weak void @_Z6Sample(ptr %0, ptr %1) #4 { entry: %2 = getelementptr %0, ptr %0, i64 0 %3 = load ptr, ptr %2, align 8 @@ -136,33 +136,27 @@ entry: %7 = load ptr, ptr %6, align 8 %8 = getelementptr %0, ptr %0, i64 7 %9 = load ptr, ptr %8, align 8 - call void @_ZTS10SimpleVaddIiE_NativeCPUKernel(ptr %3, ptr %5, ptr %7, ptr %9, ptr %1) + call void @_ZTS10SimpleVaddIiE.NativeCPUKernel(ptr %3, ptr %5, ptr %7, ptr %9, ptr %1) ret void } ``` +As you can see, the `subhandler` steals the kernel's function name, and receives two pointer arguments: the first one points to the kernel arguments from the SYCL runtime, and the second one to the `__nativecpu_state` struct. -## EmitSYCLNativeCPUHeader pass - -This pass emits an additional integration header, that will be compiled by the host compiler during the host compilation step. This header is included by the main integration footer and does not need to be managed manually. Its main purpose is to enable the SYCL runtime to register kernels and to call kernels that had unused parameters removed by the optimizer. The header contains, for each kernel: -* The subhandler declaration as a C++ function. -* The definition of `_pi_offload_entry_struct`, `pi_device_binary_struct` and `pi_device_binaries_struct` variables, and a call to `__sycl_register_lib`, which allows to register the kernel to the sycl runtime (the call to `__sycl_register_lib` is performed at program startup via the constructor of a global). The Native CPU integration header is always named `.hc`. 
- -The Native CPU integration header for our example is: +## Kernel registration -```c++ -extern "C" void _Z6Samplesubhandler(const sycl::detail::NativeCPUArgDesc *MArgs, nativecpu_state *state); - -static _pi_offload_entry_struct _pi_offload_entry_struct_Z6Sample{(void*)&_Z6Samplesubhandler, const_cast("_Z6Sample"), 1, 0, 0 }; -static pi_device_binary_struct pi_device_binary_struct_Z6Sample{0, 4, 0, __SYCL_PI_DEVICE_BINARY_TARGET_UNKNOWN, nullptr, nullptr, nullptr, nullptr, (unsigned char*)&_Z6Samplesubhandler, (unsigned char*)&_Z6Samplesubhandler + 1, &_pi_offload_entry_struct_Z6Sample, &_pi_offload_entry_struct_Z6Sample+1, nullptr, nullptr }; -static pi_device_binaries_struct pi_device_binaries_struct_Z6Sample{0, 1, &pi_device_binary_struct_Z6Sample, nullptr, nullptr }; -struct init_native_cpu_Z6Sample_t{ - init_native_cpu_Z6Sample_t(){ - __sycl_register_lib(&pi_device_binaries_struct_Z6Sample); - } -}; -static init_native_cpu_Z6Sample_t init_native_cpu_Z6Sample; +In order to register the Native CPU kernels to the SYCL runtime, we applied a small change to the `clang-offload-wrapper` tool: normally, the `clang-offload-wrapper` bundles the offload binary in an LLVM-IR module. Instead of bundling the device code, for the Native CPU target we insert an array of function pointers to the `subhandler`s, and the `pi_device_binary_struct::BinaryStart` and `pi_device_binary_struct::BinaryEnd` fields, which normally point to the begin and end addresses of the offload binary, now point to the begin and end of the array. ``` + ------------------------------------------------------- + | "_Z6Sample" | other entries | "__nativecpu_end" | + | &_Z6Sample | | nullptr | + ------------------------------------------------------- + ^ ^ + | | + BinaryStart BinaryEnd +``` + +Each entry in the array contains the kernel name as a string, and a pointer to the `sunhandler` function declaration. 
Since the subhandler's signature has always the same arguments (two pointers in LLVM-IR), the `clang-offload-wrapper` can emit the function declarations given just the function names contained in the `.table` file emitted by `sycl-post-link`. The symbols are then resolved by the system's linker, which receives both the output from the offload wrapper and the lowered device module. ## Kernel lowering and execution diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc index 10de1c45407bf..dd27c1437af4c 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc @@ -104,11 +104,14 @@ enum class architecture : /* unspecified */ { intel_gpu_kbl, intel_gpu_cfl, intel_gpu_apl, + intel_gpu_bxt = intel_gpu_apl, intel_gpu_glk, intel_gpu_whl, intel_gpu_aml, intel_gpu_cml, intel_gpu_icllp, + intel_gpu_ehl, + intel_gpu_jsl = intel_gpu_ehl, intel_gpu_tgllp, intel_gpu_rkl, intel_gpu_adl_s, @@ -117,8 +120,11 @@ enum class architecture : /* unspecified */ { intel_gpu_adl_n, intel_gpu_dg1, intel_gpu_acm_g10, + intel_gpu_dg2_g10 = intel_gpu_acm_g10, intel_gpu_acm_g11, + intel_gpu_dg2_g11 = intel_gpu_acm_g11, intel_gpu_acm_g12, + intel_gpu_dg2_g12 = intel_gpu_acm_g12, intel_gpu_pvc, nvidia_gpu_sm_50, diff --git a/sycl/doc/index.rst b/sycl/doc/index.rst index 8f6e0854df8f5..579e6e81d44cf 100644 --- a/sycl/doc/index.rst +++ b/sycl/doc/index.rst @@ -13,6 +13,7 @@ Using oneAPI DPC++ for Application Development PreprocessorMacros cuda/contents Extensions + syclcompat/README.md FAQ User API Reference EnvironmentVariables diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md new file mode 100644 index 0000000000000..5b000cb52dbc3 --- /dev/null +++ b/sycl/doc/syclcompat/README.md @@ -0,0 +1,1285 @@ +# SYCLcompat + +SYCLcompat is a header-only 
library that intends to help developers familiar +with other heterogeneous programming models (such as OpenMP, CUDA or HIP) to +familiarize themselves with the SYCL programming API while porting their +existing codes. Compatibility tools can also benefit from the reduced API size +when converting legacy codebases. + +SYCLcompat provides: + +* A high-level API that provides closer semantics to other programming models, +simplifying line by line conversions. +* Alternative submission APIs that encapsulate SYCL-specific "queue" and +"event" APIs for easier reference. +* Ability to gradually introduce other SYCL concepts as the user familiarises +themselves with the core SYCL API. +* Clear distinction between core SYCL API and the compatibility interface via +separate namespaces. + +## Notice + +Copyright © 2023-2023 Codeplay Software Limited. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of +The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + +## Support + +SYCLcompat depends on specific oneAPI DPC++ compiler extensions that may not be +available to all the SYCL 2020 specification implementations. + +Specifically, this library depends on the following SYCL extensions: + +* [sycl_ext_oneapi_local_memory]( + ../extensions/supported/sycl_ext_oneapi_local_memory.asciidoc) +* [sycl_ext_oneapi_complex]( + ../extensions/experimental/sycl_ext_oneapi_complex.asciidoc) +* [sycl_ext_oneapi_free_function_queries]( + ../extensions/experimental/sycl_ext_oneapi_free_function_queries.asciidoc) +* [sycl_ext_oneapi_assert]( + ../extensions/supported/sycl_ext_oneapi_assert.asciidoc) +* [sycl_ext_oneapi_enqueue_barrier]( + ../extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc) + +## Usage + +All functionality is available under the `syclcompat::` namespace, imported +through the main header, `syclcompat.hpp`. Note that `syclcompat.hpp` does not +import the header.
+ +``` cpp +#include +``` + +This document presents the public API under the [Features](#features) section, +and provides a working [Sample code](#sample-code) using this library. Refer to +those to learn to use the library. + +## Features + +### dim3 + +SYCLcompat provides a `dim3` class akin to that of CUDA or HIP programming +models. `dim3` encapsulates other languages iteration spaces that are +represented with coordinate letters (x, y, z). + +```cpp +namespace syclcompat { + +class dim3 { +public: + const size_t x, y, z; + constexpr dim3(const sycl::range<3> &r); + constexpr dim3(const sycl::range<2> &r); + constexpr dim3(const sycl::range<1> &r); + constexpr dim3(size_t x, size_t y = 1, size_t z = 1); + + constexpr size_t size(); + + operator sycl::range<3>(); + operator sycl::range<2>(); + operator sycl::range<1>(); +}; + +// Element-wise operators +dim3 operator*(const dim3 &a, const dim3 &b); +dim3 operator+(const dim3 &a, const dim3 &b); +dim3 operator-(const dim3 &a, const dim3 &b); + +} // syclcompat +``` + +In SYCL, the fastest-moving dimension is the one with the highest index, e.g. in +a SYCL 2D range iteration space, there are two dimensions, 0 and 1, and 1 will +be the one that "moves faster". The compatibility headers for SYCL offer a +number of convenience functions that help the mapping between xyz-based +coordinates to SYCL iteration spaces in the different scopes available. 
In +addition to the global range, the following helper functions are also provided: + +``` c++ +namespace syclcompat { + +namespace local_id { +size_t x(); +size_t y(); +size_t z(); +} // namespace local_id + +namespace local_range { +size_t x(); +size_t y(); +size_t z(); +} // namespace local_range + +namespace work_group_id { +size_t x(); +size_t y(); +size_t z(); +} // namespace work_group_id + +namespace work_group_range { +size_t x(); +size_t y(); +size_t z(); +} // namespace work_group_range + +namespace global_range { +size_t x(); +size_t y(); +size_t z(); +} // namespace global_range + +namespace global_id { +size_t x(); +size_t y(); +size_t z(); +} // namespace global_id + +} // syclcompat +``` + +These translate any kernel dimensions from one convention to the other. An +example of an equivalent SYCL call for a 3D kernel using `compat` is +`syclcompat::global_id::x() == get_global_id(2)`. + +### Local Memory + +When using `compat` functions, there are two distinct interfaces to allocate +device local memory. The first interface uses the _sycl_ext_oneapi_local_memory_ +extension to leverage local memory defined at compile time. +_sycl_ext_oneapi_local_memory_ is accessed through the following wrapper: + +``` c++ +namespace syclcompat { + +template auto *local_mem(); + +} // syclcompat +``` + +`syclcompat::local_mem()` can be used as illustrated in the example +below. + +```c++ +// Sample kernel +using namespace syclcompat; +template +void local_mem_2d(int *d_A) { + // Local memory extension wrapper, size defined at compile-time + auto As = local_mem(); + int id_x = local_id::x(); + int id_y = local_id::y(); + As[id_y][id_x] = id_x * BLOCK_SIZE + id_y; + wg_barrier(); + int val = As[BLOCK_SIZE - id_y - 1][BLOCK_SIZE - id_x - 1]; + d_A[global_id::y() * BLOCK_SIZE + global_id::x()] = val; +} +``` + +The second interface allows users to allocate device local memory at runtime. 
+SYCLcompat provides this functionality through its kernel launch interface, +`launch`, defined in the following section. + +### launch + +SYCLcompat provides a kernel `launch` interface which accepts a function that +executes on the device (a.k.a "kernel") instead of a lambda/functor. It can be +called either by using a pair of "teams"/"blocks" and "threads", from +OpenMP/CUDA terminology, or using a `sycl::nd_range`. The interface accepts a +device _function_ with the use of an `auto F` template parameter, and a variadic +`Args` for the function's arguments. + +Various overloads for `launch` exist to permit the user to launch on a +specific `queue`, or to define dynamically sized device local memory. + +``` c++ +namespace syclcompat { + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, + sycl::queue q, Args... args); + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, + sycl::queue q, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, size_t mem_size, + sycl::queue q, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, size_t mem_size, + Args... args); + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, + size_t mem_size, sycl::queue q, Args... args); + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, + size_t mem_size, Args... 
args); + +} // syclcompat +``` + +For example, if the user had an existing function named `vectorAdd` to execute +on a device such as follows: + +``` c++ +void vectorAdd(const float *A, const float *B, float *C, int n); +``` + +using SYCLcompat, the user can call it as follows: + +``` c++ +syclcompat::launch(blocksPerGrid, threadsPerBlock, d_A, d_B, d_C, n); +``` + +which would be equivalent to the following call using a `sycl::nd_range`: + +``` c++ +auto range = sycl::nd_range<3>{blocksPerGrid * threadsPerBlock, + threadsPerBlock}; +syclcompat::launch(range, d_A, d_B, d_C, n); +``` + +For dynamic local memory allocation, `launch` injects a pointer to a +local `char *` accessor of `mem_size` as the last argument of the kernel +function. For example, the previous function named `vectorAdd` can be modified +with the following signature, which adds a `char *` pointer to access local +memory inside the kernel: + +``` c++ +void vectorAdd(const float *A, const float *B, float *C, int n, + char *local_mem); +``` + +Then, `vectorAdd` can be launched like this: + +``` c++ +syclcompat::launch(blocksPerGrid, threadsPerBlock, mem_size, d_A, + d_B, d_C, n); +``` + +or this: + +``` c++ +auto range = sycl::nd_range<3>{globalSize, localSize}; +syclcompat::launch(range, mem_size, d_A, d_B, d_C, n); +``` + +This `launch` interface allows users to define an internal memory pool, or +scratchpad, that can then be reinterpreted as the datatype required by the user +within the kernel function. + +### Utilities + +SYCLcompat introduces a set of utility functions designed to streamline the +usage of the library and its `launch` mechanism. + +The first utility function is `syclcompat::wg_barrier()`, which provides a +concise work-group barrier. `syclcompat::wg_barrier()` uses the +_SYCL_INTEL_free_function_queries_ extension to provide this functionality. 
+
+The second utility function, `syclcompat::compute_nd_range`, ensures that the
+provided global size and work group sizes are appropriate for a given
+dimensionality, and that global size is rounded up to a multiple of the work
+group size in each dimension.
+
+```c++
+namespace syclcompat {
+
+void wg_barrier();
+
+template
+sycl::nd_range compute_nd_range(sycl::range global_size_in,
+                                sycl::range work_group_size);
+sycl::nd_range<1> compute_nd_range(int global_size_in, int work_group_size);
+
+} // syclcompat
+```
+
+### Queues
+
+The design for this library assumes _in-order_ queues
+(`sycl::property::queue::in_order()`).
+
+Many of the APIs accept an optional `queue` parameter, and this can be an
+out-of-order queue, either created manually or retrieved via a call to
+`syclcompat::create_queue()`, specifying `false` for the `in_order` parameter.
+
+```c++
+namespace syclcompat {
+
+sycl::queue create_queue(bool print_on_async_exceptions = false,
+                         bool in_order = true);
+
+} // syclcompat
+```
+
+However, SYCLcompat does not implement any mechanisms to deal with this case.
+The rationale for this is that a user wanting the full power of SYCL's
+dependency management shouldn't be using this library. As such, support for
+out-of-order queues is very limited. The only way to safely use an out-of-order
+queue at present is to explicitly `q.wait()` or `e.wait()` where `e` is the
+`sycl::event` returned through a `syclcompat::async` API.
+
+To facilitate machine translation from other heterogeneous programming models to
+SYCL, SYCLcompat provides the following pointer aliases for `sycl::event` and
+`sycl::queue`, and the function `destroy_event` which destroys an `event_ptr`
+allocated on the heap.
+ +``` c++ +namespace syclcompat { + +using event_ptr = sycl::event *; + +using queue_ptr = sycl::queue *; + +static void destroy_event(event_ptr event); + +} // syclcompat +``` + +### Memory Allocation + +This library provides interfaces to allocate memory to be accessed within kernel +functions and on the host. The `syclcompat::malloc` function allocates device +USM memory, the `syclcompat::malloc_host` function allocates host USM memory, +and the `syclcompat::malloc_shared` function allocates shared USM memory. + +In each case we provide a template and non-templated interface for allocating +memory, taking the number of elements or number of bytes respectively. + +The interface includes both synchronous and asynchronous `malloc`, `memcpy`, +`memset`, `fill`, and `free` operations. + +There is a helper class `pointer_attributes` to query allocation type for memory +pointers using SYCLcompat, through `sycl::usm::alloc` and +`sycl::get_pointer_device`. + +``` c++ +namespace syclcompat { + +// Expects number of elements +template +T *malloc(size_t count, sycl::queue q = get_default_queue()); +template +T *malloc_host(size_t count, sycl::queue q = get_default_queue()); +template +T *malloc_shared(size_t count, sycl::queue q = get_default_queue()); + +// Expects size of the memory in bytes +void *malloc(size_t num_bytes, sycl::queue q = get_default_queue()); +void *malloc_host(size_t num_bytes, sycl::queue q = get_default_queue()); +void *malloc_shared(size_t num_bytes, sycl::queue q = get_default_queue()); + +// 2D, 3D memory allocation wrappers +void *malloc(size_t &pitch, size_t x, size_t y, + sycl::queue q = get_default_queue()) +pitched_data malloc(sycl::range<3> size, sycl::queue q = get_default_queue()); + +// Blocking memcpy +void memcpy(void *to_ptr, const void *from_ptr, size_t size, + sycl::queue q = get_default_queue()); +void memcpy(T *to_ptr, const T *from_ptr, size_t count, + sycl::queue q = get_default_queue()); +void memcpy(void *to_ptr, size_t 
to_pitch, const void *from_ptr,
+            size_t from_pitch, size_t x, size_t y,
+            sycl::queue q = get_default_queue()); // 2D matrix
+void memcpy(pitched_data to, sycl::id<3> to_pos,
+            pitched_data from, sycl::id<3> from_pos,
+            sycl::range<3> size,
+            sycl::queue q = get_default_queue()); // 3D matrix
+
+// Non-blocking memcpy
+sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size,
+                         sycl::queue q = get_default_queue());
+template
+sycl::event memcpy_async(T *to_ptr, const T *from_ptr, size_t count,
+                         sycl::queue q = get_default_queue());
+sycl::event memcpy_async(void *to_ptr, size_t to_pitch,
+                         const void *from_ptr, size_t from_pitch,
+                         size_t x, size_t y,
+                         sycl::queue q = get_default_queue()); // 2D matrix
+sycl::event memcpy_async(pitched_data to, sycl::id<3> to_pos,
+                         pitched_data from, sycl::id<3> from_pos,
+                         sycl::range<3> size,
+                         sycl::queue q = get_default_queue()); // 3D matrix
+
+// Fill
+template
+void fill(void *dev_ptr, const T &pattern, size_t count,
+          sycl::queue q = get_default_queue());
+template
+sycl::event fill_async(void *dev_ptr, const T &pattern,
+                       size_t count, sycl::queue q = get_default_queue());
+
+// Memset
+void memset(void *dev_ptr, int value, size_t size,
+            sycl::queue q = get_default_queue());
+void memset(void *ptr, size_t pitch, int val, size_t x, size_t y,
+            sycl::queue q = get_default_queue()); // 2D matrix
+void memset(pitched_data pitch, int val, sycl::range<3> size,
+            sycl::queue q = get_default_queue()); // 3D matrix
+sycl::event memset_async(void *dev_ptr, int value, size_t size,
+                         sycl::queue q = get_default_queue());
+sycl::event memset_async(void *ptr, size_t pitch, int val,
+                         size_t x, size_t y,
+                         sycl::queue q = get_default_queue()); // 2D matrix
+sycl::event memset_async(pitched_data pitch, int val,
+                         sycl::range<3> size,
+                         sycl::queue q = get_default_queue()); // 3D matrix
+
+void free(void *ptr, sycl::queue q = get_default_queue());
+sycl::event free_async(const std::vector &pointers,
+                       const std::vector
&events, + sycl::queue q = get_default_queue()); + +// Queries pointer allocation type +class pointer_attributes { +public: + void init(const void *ptr, sycl::queue q = get_default_queue()); + sycl::usm::alloc get_memory_type(); + const void *get_device_pointer(); + const void *get_host_pointer(); + bool is_memory_shared(); + unsigned int get_device_id(); +}; + +} // syclcompat +``` + +Finally, the class `pitched_data`, which manages memory allocation for 3D +spaces, padded to avoid uncoalesced memory accesses. + +```c++ +namespace syclcompat { + +class pitched_data { +public: + pitched_data(); + pitched_data(void *data, size_t pitch, size_t x, size_t y); + + void *get_data_ptr(); + size_t get_pitch(); + size_t get_x(); + size_t get_y(); + + void set_data_ptr(void *data); + void set_pitch(size_t pitch); + void set_x(size_t x); + void set_y(size_t y); +}; + +} // syclcompat +``` + +There are various helper classes and aliases defined within SYCLcompat to +encapsulate and define memory operations and objects. These classes and aliases +are primarily designed to assist with machine translation from other +heterogeneous programming models. + +The wrapper class `device_memory` provides a unified representation for device +memory in various regions. The class provides methods to allocate memory for the +object (`init()`) and access the underlying memory in various ways (`get_ptr()`, +`get_access()`, `operator[]`). Aliases for global and USM shared specializations +are provided. + +The `memory_traits` class is provided as a traits helper for `device_memory`. +The `accessor` class template provides a 2D or 3D `sycl::accessor`-like wrapper +around raw pointers. 
+ +```c++ +namespace syclcompat { + +enum class memory_region { + global = 0, // device global memory + constant, // device read-only memory + local, // device local memory + usm_shared, // memory which can be accessed by host and device +}; + +using byte_t = uint8_t; + +enum class target { device, local }; + +template class memory_traits { +public: + static constexpr sycl::access::address_space asp = + (Memory == memory_region::local) + ? sycl::access::address_space::local_space + : sycl::access::address_space::global_space; + static constexpr target target = + (Memory == memory_region::local) + ? target::local + : target::device; + static constexpr sycl::access_mode mode = + (Memory == memory_region::constant) + ? sycl::access_mode::read + : sycl::access_mode::read_write; + static constexpr size_t type_size = sizeof(T); + using element_t = + typename std::conditional_t; + using value_t = typename std::remove_cv_t; + template + using accessor_t = typename std::conditional_t< + target == target::local, + sycl::local_accessor, + sycl::accessor>; + using pointer_t = T *; +}; + +template class device_memory { +public: + using accessor_t = + typename memory_traits::template accessor_t; + using value_t = typename memory_traits::value_t; + using syclcompat_accessor_t = + syclcompat::accessor; + + device_memory(); + + device_memory(const sycl::range &in_range, + std::initializer_list &&init_list); + + template + device_memory( + const typename std::enable_if>::type &in_range, + std::initializer_list> &&init_list); + + device_memory(const sycl::range &range_in); + + // Variadic constructor taking 1, 2 or 3 integers to be interpreted as a + // sycl::range. + template + device_memory(Args... Arguments); + + ~device_memory(); + + // Allocate memory with default queue, and init memory if has initial value. + void init(); + // Allocate memory with specified queue, and init memory if has initial + // value. 
+ void init(sycl::queue q); + + // The variable is assigned to a device pointer. + void assign(value_t *src, size_t size); + + // Get memory pointer of the memory object, which is virtual pointer when + // usm is not used, and device pointer when usm is used. + value_t *get_ptr(); + // Get memory pointer of the memory object, which is virtual pointer when + // usm is not used, and device pointer when usm is used. + value_t *get_ptr(sycl::queue q); + + // Get the device memory object size in bytes. + size_t get_size(); + + template + typename std::enable_if::type &operator[](size_t index); + + // Get accessor with dimension info for the device memory object + // when usm is used and dimension is greater than 1. + template + typename std::enable_if::type + get_access(sycl::handler &cgh); +}; + + +template +class device_memory : public device_memory { +public: + using base = device_memory; + using value_t = typename base::value_t; + using accessor_t = + typename memory_traits::template accessor_t<0>; + device_memory(const value_t &val); + device_memory(); +}; + +template +using global_memory = device_memory; +template +using constant_memory = detail::device_memory; +template +using shared_memory = device_memory; + + +template class accessor; + +template class accessor { +public: + using memory_t = memory_traits; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template accessor_t<3>; + + accessor(pointer_t data, const sycl::range<3> &in_range); + template + accessor(typename std::enable_if::type &acc); + accessor(const accessor_t &acc, const sycl::range<3> &in_range); + + accessor operator[](size_t index) const; + + pointer_t get_ptr() const; + +}; + +template class accessor { +public: + using memory_t = memory_traits; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template 
accessor_t<2>; + + accessor(pointer_t data, const sycl::range<2> &in_range); + template + accessor(typename std::enable_if::type &acc); + accessor(const accessor_t &acc, const sycl::range<2> &in_range); + + pointer_t operator[](size_t index); + + pointer_t get_ptr() const; +}; + +} // syclcompat +``` + +### Device Information + +`sycl::device` properties are encapsulated using the `device_info` helper class. +The class is meant to be constructed and used through the extended device +implemented in SYCLcompat. + +This is the synopsis of `device_info`: + +```c++ +class device_info { +public: + const char *get_name(); + char *get_name(); + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() const; + + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() const; + int get_major_version() const; + int get_minor_version() const; + int get_integrated() const; + int get_max_clock_frequency() const; + int get_max_compute_units() const; + int get_max_work_group_size() const; + int get_max_sub_group_size() const; + int get_max_work_items_per_compute_unit() const; + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size() const; + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size(); + size_t get_global_mem_size() const; + size_t get_local_mem_size() const; + +void set_name(const char *name); + void set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes); + void set_major_version(int major); + void set_minor_version(int minor); + void set_integrated(int integrated); + void set_max_clock_frequency(int frequency); + void set_max_compute_units(int max_compute_units); + void set_global_mem_size(size_t global_mem_size); + void set_local_mem_size(size_t local_mem_size); + void set_max_work_group_size(int max_work_group_size); + void set_max_sub_group_size(int max_sub_group_size); + void + set_max_work_items_per_compute_unit(int 
max_work_items_per_compute_unit); + void set_max_nd_range_size(int max_nd_range_size[]); +}; +``` + +### Device Management + +Multiple SYCL functionalities are exposed through utility functions to manage +the current `sycl::device`, `sycl::queue`, and `sycl::context`, exposed as +follows: + +```c++ +namespace syclcompat { + +// Util function to create a new queue for the current device +sycl::queue create_queue(bool print_on_async_exceptions = false, + bool in_order = true); + +// Util function to get the default queue of current device in +// device manager. +sycl::queue get_default_queue(); + +// Util function to wait for the queued kernels. +void wait(sycl::queue q = get_default_queue()); + +// Util function to wait for the queued kernels and throw unhandled errors. +void wait_and_throw(sycl::queue q = get_default_queue()); + +// Util function to get the id of current device in +// device manager. +unsigned int get_current_device_id(); + +// Util function to get the current device. +device_ext &get_current_device(); + +// Util function to get a device by id. +device_ext &get_device(unsigned int id); + +// Util function to get the context of the default queue of current +// device in device manager. +sycl::context get_default_context(); + +// Util function to get a CPU device. +device_ext &cpu_device(); + +// Util function to select a device by its id +unsigned int select_device(unsigned int id); + +} // syclcompat +``` + +The exposed functionalities include creation and destruction of queues, through +`syclcompat::create_queue` and `syclcompat::destroy_queue`, and providing the +ability to wait for submitted kernels using `syclcompat::wait` or +`syclcompat::wait_and_throw`. Any async errors will be output to `stderr` if +`print_on_async_exceptions`. Synchronous exceptions have to be managed by users +independently of what is set in this parameter. + +Devices are managed through a helper class, `device_ext`. 
The `device_ext` class
+associates a vector of `sycl::queues` with its `sycl::device`. The `device_ext`
+destructor waits on a set of `sycl::event` which can be added to via
+`add_event`. This is used, for example, to implement `syclcompat::free_async` to
+schedule release of memory after a kernel or `memcpy`. SYCL device properties
+can be queried through `device_ext` as well.
+
+The class is exposed as follows:
+
+```c++
+namespace syclcompat {
+
+class device_ext : public sycl::device {
+  device_ext();
+  device_ext(const sycl::device &base);
+  ~device_ext();
+
+  bool is_native_host_atomic_supported();
+  int get_major_version();
+  int get_minor_version();
+  int get_max_compute_units();
+  int get_max_clock_frequency();
+  int get_integrated();
+  void get_device_info(device_info &out);
+
+  device_info get_device_info();
+  void reset();
+
+  sycl::queue *default_queue();
+  void queues_wait_and_throw();
+  sycl::queue *create_queue(bool print_on_async_exceptions = false,
+                            bool in_order = true);
+  void destroy_queue(sycl::queue *&queue);
+  void set_saved_queue(sycl::queue *q);
+  sycl::queue *get_saved_queue();
+  sycl::context get_context();
+};
+
+} // syclcompat
+```
+
+#### Multiple devices
+
+SYCLcompat allows you to manage multiple devices through
+`syclcompat::select_device` and `syclcompat::create_queue`. The library uses the
+default SYCL device (i.e. the device returned by `sycl::default_selector_v`) as
+the default device, and exposes all other devices available on the system
+through the `syclcompat::select_device(unsigned int id)` member function.
+
+The interface uses the `syclcompat::device_ext::get_current_device_id()` to get
+the current CPU thread, and returns the associated device stored internally as a
+map with that thread. The map is constructed using calls to
+`syclcompat::select_device(unsigned int id)`. Any thread which hasn't used this
+member function to select a device will be given the default device.
Note that
+this implies multiple threads on a single device by default.
+
+Be aware that targeting multiple devices may lead to unintended behavior caused
+by developers, as SYCLcompat does not implement a mechanism to warn when the
+wrong queue is used as an argument in any of the member functions of the
+`syclcompat` namespace.
+
+#### Atomic Operations
+
+SYCLcompat provides an interface for common atomic operations (`add`, `sub`,
+`and`, `or`, `xor`, `min`, `max`, `exchange`, `compare_exchange`). While SYCL
+exposes atomic operations through member functions of `sycl::atomic_ref`, this
+library provides access via functions taking a standard pointer argument.
+Template arguments control the `sycl::memory_scope`, `sycl::memory_order` and
+`sycl::access::address_space` of these atomic operations. SYCLcompat also
+exposes overloads for these atomic functions which take a runtime memoryScope
+argument. Every atomic operation is implemented via an API function taking a raw
+pointer as the target. Additional overloads for
+`syclcompat::compare_exchange_strong` are provided which take a
+`sycl::multi_ptr` instead of a raw pointer. Addition and subtraction make use of
+`arith_t` to differentiate between numeric and pointer arithmetics.
+ +The available operations are exposed as follows: + +``` c++ +namespace syclcompat { + +template struct arith { + using type = std::conditional_t, std::ptrdiff_t, T>; +}; +template using arith_t = typename arith::type; + +template +T atomic_fetch_add(T *addr, arith_t operand); +template +T atomic_fetch_add(T *addr, arith_t operand, + sycl::memory_order memoryOrder); + +template +T atomic_fetch_sub(T *addr, arith_t operand); +template +T atomic_fetch_sub(T *addr, arith_t operand, + sycl::memory_order memoryOrder); + +template +T atomic_fetch_and(T *addr, T operand); +template +T atomic_fetch_and(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_or(T *addr, T operand); +template +T atomic_fetch_or(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_xor(T *addr, T operand); +template +T atomic_fetch_xor(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_min(T *addr, T operand); +template +T atomic_fetch_min(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_max(T *addr, T operand); +template +T atomic_fetch_max(T *addr, T operand, sycl::memory_order memoryOrder); + +template +unsigned int atomic_fetch_compare_inc(unsigned int *addr, + unsigned int operand); +template +unsigned int atomic_fetch_compare_inc(unsigned int *addr, + unsigned int operand, + sycl::memory_order memoryOrder); + +template +T atomic_exchange(T *addr, T operand); +template +T atomic_exchange(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_compare_exchange_strong( + sycl::multi_ptr addr, + T expected, T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed); +template +T atomic_compare_exchange_strong( + T *addr, T expected, T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed); + +} // namespace syclcompat 
+``` + +### Compatibility Utilities + +This library provides a number of small compatibility utilities which exist to +facilitate machine translation of code from other programming models to SYCL. +These functions are part of the public API, but they are not expected to be +useful to developers writing their own code. + +Functionality is provided to represent a pair of integers as a `double`. +`cast_ints_to_double(int, int)` returns a `double` containing the given integers +in the high & low 32-bits respectively. `cast_double_to_int` casts the high or +low 32-bits back into an integer. + +`syclcompat::fast_length` provides a wrapper to SYCL's +`fast_length(sycl::vec)` that accepts arguments for a C++ array and a +length. + +`vectorized_max` and `vectorized_min` are binary operations returning the +max/min of two arguments, where each argument is treated as a `sycl::vec` type. +`vectorized_isgreater` performs elementwise `isgreater`, treating each argument +as a vector of elements, and returning `0` for vector components for which +`isgreater` is false, and `-1` when true. + +`reverse_bits` reverses the bits of a 32-bit unsigned integer, `ffs` returns the +position of the first least significant set bit in an integer. +`byte_level_permute` returns a byte-permutation of two input unsigned integers, +with bytes selected according to a third unsigned integer argument. + +There is also an `experimental::logical_group` class which allows +`sycl::sub_group`s to be further subdivided into 'logical' groups to perform +sub-group level operations. This class provides methods to get the local & group +id and range. The functions `select_from_sub_group`, `shift_sub_group_left`, +`shift_sub_group_right` and `permute_sub_group_by_xor` provide equivalent +functionality to `sycl::select_from_group`, `sycl::shift_group_left`, +`sycl::shift_group_right` and `sycl::permute_group_by_xor`, respectively. 
+However, they provide an optional argument to represent the `logical_group` size +(default 32). + +The functions `cmul`,`cdiv`,`cabs`, and `conj` define complex math operations +which accept `sycl::vec` arguments representing complex values. + +```c++ +namespace syclcompat { + +inline int cast_double_to_int(double d, bool use_high32 = true); + +inline double cast_ints_to_double(int high32, int low32); + +inline float fast_length(const float *a, int len); + +template inline T vectorized_max(T a, T b); + +template inline T vectorized_min(T a, T b); + +template inline T vectorized_isgreater(T a, T b); + +template <> +inline unsigned vectorized_isgreater(unsigned a, + unsigned b); + +template inline T reverse_bits(T a); + +inline unsigned int byte_level_permute(unsigned int a, unsigned int b, + unsigned int s); + +template inline int ffs(T a); + +template +T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id, + int logical_sub_group_size = 32); + +template +T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta, + int logical_sub_group_size = 32); + +template +T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, + int logical_sub_group_size = 32); + +template +T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, + int logical_sub_group_size = 32); + +template +sycl::vec cmul(sycl::vec x, sycl::vec y); + +template +sycl::vec cdiv(sycl::vec x, sycl::vec y); + +template T cabs(sycl::vec x); + +template sycl::vec conj(sycl::vec x); + +} // namespace syclcompat +``` + +The function `experimental::nd_range_barrier` synchronizes work items from all +work groups within a SYCL kernel. This is not officially supported by the SYCL +spec, and so should be used with caution. 
+ +```c++ +namespace syclcompat { +namespace experimental { + +template +inline void nd_range_barrier( + sycl::nd_item item, + sycl::atomic_ref &counter); + +template <> +inline void nd_range_barrier( + sycl::nd_item<1> item, + sycl::atomic_ref &counter); + +class logical_group { +public: + logical_group(sycl::nd_item<3> item, sycl::group<3> parent_group, + uint32_t size); + uint32_t get_local_linear_id() const; + uint32_t get_group_linear_id() const; + uint32_t get_local_linear_range() const; + uint32_t get_group_linear_range() const; +}; + +} // namespace experimental +} // namespace syclcompat +``` + +To assist machine translation, helper aliases are provided for inlining and +alignment attributes. The class template declarations `sycl_compat_kernel_name` +and `sycl_compat_kernel_scalar` are used to assist automatic generation of +kernel names during machine translation. + +`get_sycl_language_version` returns an integer representing the version of the +SYCL spec supported by the current SYCL compiler. + +``` c++ +namespace syclcompat { + +#define __sycl_compat_align__(n) __attribute__((aligned(n))) +#define __sycl_compat_inline__ __inline__ __attribute__((always_inline)) + +#define __sycl_compat_noinline__ __attribute__((noinline)) + +template class sycl_compat_kernel_name; +template class sycl_compat_kernel_scalar; + +int get_sycl_language_version(); + +} // namespace syclcompat +``` + +#### Kernel Helper Functions + +Kernel helper functions provide a structure `kernel_function_info` to keep SYCL +kernel information, and provide a utility function `get_kernel_function_info()` +to get the kernel information. Overloads are provided to allow either returning +a `kernel_function_info` object, or to return by pointer argument. In the +current version, `kernel_function_info` describes only maximum work-group size. 
+ +``` c++ +namespace syclcompat { + +struct kernel_function_info { + int max_work_group_size = 0; +}; + +static void get_kernel_function_info(kernel_function_info *kernel_info, + const void *function); +static kernel_function_info get_kernel_function_info(const void *function); +} // namespace syclcompat +``` + +## Sample Code + +Below is a simple linear algebra sample, which computes `y = mx + b` implemented +using this library: + +``` c++ +#include +#include + +#include +#include + +/** + * Slope intercept form of a straight line equation: Y = m * X + b + */ +template +void slope_intercept(float *Y, float *X, float m, float b, size_t n) { + + // Block index + size_t bx = syclcompat::work_group_id::x(); + // Thread index + size_t tx = syclcompat::local_id::x(); + + size_t i = bx * BLOCK_SIZE + tx; + // or i = syclcompat::global_id::x(); + if (i < n) + Y[i] = m * X[i] + b; +} + +void check_memory(void *ptr, std::string msg) { + if (ptr == nullptr) { + std::cerr << "Failed to allocate memory: " << msg << std::endl; + exit(EXIT_FAILURE); + } +} + +/** + * Program main + */ +int main(int argc, char **argv) { + std::cout << "Simple Kernel example" << std::endl; + + constexpr size_t n_points = 32; + constexpr float m = 1.5f; + constexpr float b = 0.5f; + + int block_size = 32; + if (block_size > syclcompat::get_current_device() + .get_info()) + block_size = 16; + + std::cout << "block_size = " << block_size << ", n_points = " << n_points + << std::endl; + + // Allocate host memory for vectors X and Y + size_t mem_size = n_points * sizeof(float); + float *h_X = (float *)syclcompat::malloc_host(mem_size); + float *h_Y = (float *)syclcompat::malloc_host(mem_size); + check_memory(h_X, "h_X allocation failed."); + check_memory(h_Y, "h_Y allocation failed."); + + // Alternative templated allocation for the expected output + float *h_expected = syclcompat::malloc_host(n_points); + check_memory(h_expected, "Not enough for h_expected."); + + // Initialize host memory & expected 
output + for (size_t i = 0; i < n_points; i++) { + h_X[i] = i + 1; + h_expected[i] = m * h_X[i] + b; + } + + // Allocate device memory + float *d_X = (float *)syclcompat::malloc(mem_size); + float *d_Y = (float *)syclcompat::malloc(mem_size); + check_memory(d_X, "d_X allocation failed."); + check_memory(d_Y, "d_Y allocation failed."); + + // copy host memory to device + syclcompat::memcpy(d_X, h_X, mem_size); + + size_t threads = block_size; + size_t grid = n_points / block_size; + + std::cout << "Computing result using SYCL Kernel... "; + if (block_size == 16) { + syclcompat::launch>(grid, threads, d_Y, d_X, m, b, + n_points); + } else { + syclcompat::launch>(grid, threads, d_Y, d_X, m, b, + n_points); + } + syclcompat::wait(); + std::cout << "DONE" << std::endl; + + // Async copy result from device to host + syclcompat::memcpy_async(h_Y, d_Y, mem_size).wait(); + + // Check output + for (size_t i = 0; i < n_points; i++) { + assert(h_Y[i] - h_expected[i] < 1e6); + } + + // Clean up memory + syclcompat::free(h_X); + syclcompat::free(h_Y); + syclcompat::free(h_expected); + syclcompat::free(d_X); + syclcompat::free(d_Y); + + return 0; +} +``` + +## Maintainers + +To report problems with this library, please open a new issue with the [COMPAT] +tag at: + + + +## Contributors + +Alberto Cabrera, Codeplay \ +Gordon Brown, Codeplay \ +Joe Todd, Codeplay \ +Pietro Ghiglio, Codeplay \ +Ruyman Reyes, Codeplay/Intel + +## Contributions + +This library is licensed under the Apache 2.0 license. If you have an idea for a +new sample, different build system integration or even a fix for something that +is broken, please get in contact. 
diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index 6751ef20c902d..b2fcd558328e7 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -734,8 +734,8 @@ std::enable_if_t<__FAST_MATH_GENFLOAT(T), T> sin(T x) __NOEXC { // svgenfloat sincos (svgenfloat x, genfloatptr cosval) template -std::enable_if_t< - detail::is_svgenfloat::value && detail::is_genfloatptr::value, T> +std::enable_if_t<__FAST_MATH_GENFLOAT(T) && detail::is_genfloatptr::value, + T> sincos(T x, T2 cosval) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_sincos(x, cosval); @@ -2500,6 +2500,23 @@ std::enable_if_t::value, T> cos(T x) __NOEXC { return native::cos(x); } +// svgenfloat sincos (svgenfloat x, genfloatptr cosval) +// This is a performance optimization to ensure that sincos isn't slower than a +// pair of sin/cos executed separately. Theoretically, calling non-native sincos +// might be faster than calling native::sin plus native::cos separately and we'd +// need some kind of cost model to make the right decision (and move this +// entirely to the JIT/AOT compilers). However, in practice, this simpler +// solution seems to work just fine and matches how sin/cos above are optimized +// for the fast math path. 
+template +std::enable_if_t< + detail::is_svgenfloatf::value && detail::is_genfloatptr::value, T> +sincos(T x, T2 cosval) __NOEXC { + detail::check_vector_size(); + *cosval = native::cos(x); + return native::sin(x); +} + // svgenfloatf exp (svgenfloatf x) template std::enable_if_t::value, T> exp(T x) __NOEXC { diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 4ad418b5ccbb7..60f0220d388e7 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -905,6 +905,7 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4; /// PTX 64-bit image <-> "nvptx64", 64-bit NVIDIA PTX device #define __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64 "nvptx64" #define __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN "amdgcn" +#define __SYCL_PI_DEVICE_BINARY_TARGET_NATIVE_CPU "native_cpu" /// Extension to denote native support of assert feature by an arbitrary device /// piDeviceGetInfo call should return this extension when the device supports diff --git a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp index 92fda8191124b..5eb0a301ce4b5 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp @@ -19,11 +19,14 @@ enum class architecture { intel_gpu_kbl, intel_gpu_cfl, intel_gpu_apl, + intel_gpu_bxt = intel_gpu_apl, intel_gpu_glk, intel_gpu_whl, intel_gpu_aml, intel_gpu_cml, intel_gpu_icllp, + intel_gpu_ehl, + intel_gpu_jsl = intel_gpu_ehl, intel_gpu_tgllp, intel_gpu_rkl, intel_gpu_adl_s, @@ -32,8 +35,11 @@ enum class architecture { intel_gpu_adl_n, intel_gpu_dg1, intel_gpu_acm_g10, + intel_gpu_dg2_g10 = intel_gpu_acm_g10, intel_gpu_acm_g11, + intel_gpu_dg2_g11 = intel_gpu_acm_g11, intel_gpu_acm_g12, + intel_gpu_dg2_g12 = intel_gpu_acm_g12, intel_gpu_pvc, // NVIDIA architectures nvidia_gpu_sm_50, @@ -128,6 +134,9 @@ static constexpr 
ext::oneapi::experimental::architecture max_architecture = #ifndef __SYCL_TARGET_INTEL_GPU_ICLLP__ #define __SYCL_TARGET_INTEL_GPU_ICLLP__ 0 #endif +#ifndef __SYCL_TARGET_INTEL_GPU_EHL__ +#define __SYCL_TARGET_INTEL_GPU_EHL__ 0 +#endif #ifndef __SYCL_TARGET_INTEL_GPU_TGLLP__ #define __SYCL_TARGET_INTEL_GPU_TGLLP__ 0 #endif @@ -137,9 +146,6 @@ static constexpr ext::oneapi::experimental::architecture max_architecture = #ifndef __SYCL_TARGET_INTEL_GPU_ADL_S__ #define __SYCL_TARGET_INTEL_GPU_ADL_S__ 0 #endif -#ifndef __SYCL_TARGET_INTEL_GPU_RPL_S__ -#define __SYCL_TARGET_INTEL_GPU_RPL_S__ 0 -#endif #ifndef __SYCL_TARGET_INTEL_GPU_ADL_P__ #define __SYCL_TARGET_INTEL_GPU_ADL_P__ 0 #endif @@ -287,10 +293,10 @@ static constexpr bool is_allowable_aot_mode = (__SYCL_TARGET_INTEL_GPU_AML__ == 1) || (__SYCL_TARGET_INTEL_GPU_CML__ == 1) || (__SYCL_TARGET_INTEL_GPU_ICLLP__ == 1) || + (__SYCL_TARGET_INTEL_GPU_EHL__ == 1) || (__SYCL_TARGET_INTEL_GPU_TGLLP__ == 1) || (__SYCL_TARGET_INTEL_GPU_RKL__ == 1) || (__SYCL_TARGET_INTEL_GPU_ADL_S__ == 1) || - (__SYCL_TARGET_INTEL_GPU_RPL_S__ == 1) || (__SYCL_TARGET_INTEL_GPU_ADL_P__ == 1) || (__SYCL_TARGET_INTEL_GPU_ADL_N__ == 1) || (__SYCL_TARGET_INTEL_GPU_DG1__ == 1) || @@ -364,14 +370,14 @@ struct IsAOTForArchitectureClass { __SYCL_TARGET_INTEL_GPU_CML__ == 1; arr[static_cast(arch::intel_gpu_icllp)] = __SYCL_TARGET_INTEL_GPU_ICLLP__ == 1; + arr[static_cast(arch::intel_gpu_ehl)] = + __SYCL_TARGET_INTEL_GPU_EHL__ == 1; arr[static_cast(arch::intel_gpu_tgllp)] = __SYCL_TARGET_INTEL_GPU_TGLLP__ == 1; arr[static_cast(arch::intel_gpu_rkl)] = __SYCL_TARGET_INTEL_GPU_RKL__ == 1; arr[static_cast(arch::intel_gpu_adl_s)] = __SYCL_TARGET_INTEL_GPU_ADL_S__ == 1; - arr[static_cast(arch::intel_gpu_rpl_s)] = - __SYCL_TARGET_INTEL_GPU_RPL_S__ == 1; arr[static_cast(arch::intel_gpu_adl_p)] = __SYCL_TARGET_INTEL_GPU_ADL_P__ == 1; arr[static_cast(arch::intel_gpu_adl_n)] = diff --git a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp 
b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp index 40c0efff94021..24d06806b85d5 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp @@ -371,7 +371,8 @@ constexpr bool has_struct_arg(Ret (*)(Args...)) { template constexpr bool has_struct_ret(Ret (*)(Args...)) { - return std::is_class_v && !is_simd_or_mask_type::value; + return std::is_class_v && !is_simd_or_mask_type::value && + !is_uniform_type::value; } template diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp index d3f63637e3028..37ff4cf3d6438 100644 --- a/sycl/include/sycl/types.hpp +++ b/sycl/include/sycl/types.hpp @@ -583,13 +583,17 @@ template class vec { // vector extension. This is for MSVC compatibility, which has a max alignment // of 64 for direct params. If we drop MSVC, we can have alignment the same as // size and use vector extensions for all sizes. - static constexpr bool IsUsingArray = + static constexpr bool IsUsingArrayOnDevice = (IsHostHalf || IsSizeGreaterThanMaxAlign); #if defined(__SYCL_DEVICE_ONLY__) - static constexpr bool NativeVec = NumElements > 1 && !IsUsingArray; + static constexpr bool NativeVec = NumElements > 1 && !IsUsingArrayOnDevice; + static constexpr bool IsUsingArrayOnHost = + false; // we are not compiling for host. #else static constexpr bool NativeVec = false; + static constexpr bool IsUsingArrayOnHost = + true; // host always uses std::array. 
#endif static constexpr int getNumElements() { return NumElements; } @@ -770,6 +774,15 @@ template class vec { return *this; } + template + using EnableIfUsingArray = + typename std::enable_if_t; + + template + using EnableIfNotUsingArray = + typename std::enable_if_t; + #ifdef __SYCL_DEVICE_ONLY__ template using EnableIfNotHostHalf = typename std::enable_if_t; @@ -778,13 +791,15 @@ template class vec { using EnableIfHostHalf = typename std::enable_if_t; template - using EnableIfUsingArray = typename std::enable_if_t; + using EnableIfUsingArrayOnDevice = + typename std::enable_if_t; template - using EnableIfNotUsingArray = typename std::enable_if_t; + using EnableIfNotUsingArrayOnDevice = + typename std::enable_if_t; template - explicit constexpr vec(const EnableIfNotUsingArray &arg) + explicit constexpr vec(const EnableIfNotUsingArrayOnDevice &arg) : m_Data{DataType(vec_data::get(arg))} {} template @@ -792,13 +807,13 @@ template class vec { std::is_fundamental_v> || std::is_same_v, half>, vec &> - operator=(const EnableIfNotUsingArray &Rhs) { + operator=(const EnableIfNotUsingArrayOnDevice &Rhs) { m_Data = (DataType)vec_data::get(Rhs); return *this; } template - explicit constexpr vec(const EnableIfUsingArray &arg) + explicit constexpr vec(const EnableIfUsingArrayOnDevice &arg) : vec{detail::RepeatValue( static_cast>(arg)), std::make_index_sequence()} {} @@ -808,7 +823,7 @@ template class vec { std::is_fundamental_v> || std::is_same_v, half>, vec &> - operator=(const EnableIfUsingArray &Rhs) { + operator=(const EnableIfUsingArrayOnDevice &Rhs) { for (int i = 0; i < NumElements; ++i) { setValue(i, Rhs); } @@ -844,22 +859,22 @@ template class vec { std::is_convertible_v && NumElements == IdxNum, DataT>; template constexpr vec(const EnableIfMultipleElems<2, Ty> Arg0, - const EnableIfNotUsingArray Arg1) + const EnableIfNotUsingArrayOnDevice Arg1) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1)} {} template constexpr vec(const EnableIfMultipleElems<3, Ty> Arg0, - 
const EnableIfNotUsingArray Arg1, const DataT Arg2) + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), vec_data::get(Arg2)} {} template constexpr vec(const EnableIfMultipleElems<4, Ty> Arg0, - const EnableIfNotUsingArray Arg1, const DataT Arg2, + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, const Ty Arg3) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), vec_data::get(Arg2), vec_data::get(Arg3)} {} template constexpr vec(const EnableIfMultipleElems<8, Ty> Arg0, - const EnableIfNotUsingArray Arg1, const DataT Arg2, + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, const DataT Arg3, const DataT Arg4, const DataT Arg5, const DataT Arg6, const DataT Arg7) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), @@ -868,7 +883,7 @@ template class vec { vec_data::get(Arg6), vec_data::get(Arg7)} {} template constexpr vec(const EnableIfMultipleElems<16, Ty> Arg0, - const EnableIfNotUsingArray Arg1, const DataT Arg2, + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, const DataT Arg3, const DataT Arg4, const DataT Arg5, const DataT Arg6, const DataT Arg7, const DataT Arg8, const DataT Arg9, const DataT ArgA, const DataT ArgB, @@ -908,7 +923,7 @@ template class vec { std::is_same::value && !std::is_same::value>> constexpr vec(vector_t openclVector) { - if constexpr (!IsUsingArray) { + if constexpr (!IsUsingArrayOnDevice) { m_Data = openclVector; } else { m_Data = bit_cast(openclVector); @@ -916,7 +931,7 @@ template class vec { } operator vector_t() const { - if constexpr (!IsUsingArray) { + if constexpr (!IsUsingArrayOnDevice) { return m_Data; } else { auto ptr = bit_cast((&m_Data)->data()); @@ -1077,7 +1092,7 @@ template class vec { #ifdef __SYCL_DEVICE_ONLY__ #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ template \ - vec operator BINOP(const EnableIfNotUsingArray &Rhs) const { \ + vec operator BINOP(const EnableIfNotUsingArrayOnDevice &Rhs) const { \ vec Ret; \ Ret.m_Data = 
m_Data BINOP Rhs.m_Data; \ if constexpr (std::is_same::value && CONVERT) { \ @@ -1086,7 +1101,7 @@ template class vec { return Ret; \ } \ template \ - vec operator BINOP(const EnableIfUsingArray &Rhs) const { \ + vec operator BINOP(const EnableIfUsingArrayOnDevice &Rhs) const { \ vec Ret; \ for (size_t I = 0; I < NumElements; ++I) { \ Ret.setValue(I, (getValue(I) BINOP Rhs.getValue(I))); \ @@ -1240,67 +1255,94 @@ template class vec { __SYCL_UOP(--, -=) #undef __SYCL_UOP - // Available only when: dataT != cl_float && dataT != cl_double - // && dataT != cl_half + // operator~() available only when: dataT != float && dataT != double + // && dataT != half template - typename std::enable_if_t>, vec> + typename std::enable_if_t> && + (!IsUsingArrayOnDevice && !IsUsingArrayOnHost), + vec> operator~() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ vec Ret{(typename vec::DataType) ~m_Data}; if constexpr (std::is_same::value) { Ret.ConvertToDataT(); } return Ret; -#else + } + template + typename std::enable_if_t> && + (IsUsingArrayOnDevice || IsUsingArrayOnHost), + vec> + operator~() const { vec Ret{}; for (size_t I = 0; I < NumElements; ++I) { Ret.setValue(I, ~getValue(I)); } return Ret; -#endif } - vec operator!() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ - return vec{ - (typename vec::DataType) !m_Data}; -#else - vec Ret{}; + // operator! + template + EnableIfNotUsingArray> operator!() const { + return vec{(typename vec::DataType) !m_Data}; + } + + // std::byte neither supports ! unary op or casting, so special handling is + // needed. And, worse, Windows has a conflict with 'byte'. 
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + template + typename std::enable_if_t::value && + (IsUsingArrayOnDevice || IsUsingArrayOnHost), + vec> + operator!() const { + vec Ret{}; for (size_t I = 0; I < NumElements; ++I) { - Ret.setValue(I, !vec_data::get(getValue(I))); + Ret.setValue(I, std::byte{!vec_data::get(getValue(I))}); } return Ret; -#endif } - vec operator+() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ - return vec{+m_Data}; + template + typename std::enable_if_t::value && + (IsUsingArrayOnDevice || IsUsingArrayOnHost), + vec> + operator!() const { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) + Ret.setValue(I, !vec_data::get(getValue(I))); + return Ret; + } #else + template + EnableIfUsingArray> operator!() const { vec Ret{}; for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, vec_data::get(+vec_data::get(getValue(I)))); + Ret.setValue(I, !vec_data::get(getValue(I))); return Ret; + } #endif + + // operator + + template EnableIfNotUsingArray operator+() const { + return vec{+m_Data}; } - vec operator-() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ + template EnableIfUsingArray operator+() const { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) + Ret.setValue(I, vec_data::get(+vec_data::get(getValue(I)))); + return Ret; + } + + // operator - + template EnableIfNotUsingArray operator-() const { return vec{-m_Data}; -#else + } + + template EnableIfUsingArray operator-() const { vec Ret{}; for (size_t I = 0; I < NumElements; ++I) Ret.setValue(I, vec_data::get(-vec_data::get(getValue(I)))); return Ret; -#endif } // OP is: &&, || @@ -1316,7 +1358,7 @@ template class vec { template