From 0c51f6082760f15d268bb4320c832cebe91704e9 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Mon, 7 Aug 2023 17:37:39 -0700 Subject: [PATCH 01/24] [SYCL] Switch SPIR-V offload target to opaque pointers (#9828) Although there are a few tests failing due to this change, we need to go with this change to avoid future regressions and unblock changes removing typed pointers support. The regressions are supposed to be fixed by follow-up patches. --- llvm/CMakeLists.txt | 2 +- sycl/test-e2e/BFloat16/bfloat16_conversions.cpp | 3 +++ sycl/test-e2e/BFloat16/bfloat16_type.cpp | 4 +++- sycl/test-e2e/DeviceLib/string_test.cpp | 3 +++ sycl/test-e2e/ESIMD/ext_math.cpp | 3 +++ sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp | 3 +++ sycl/test-e2e/Reduction/reduction_usm.cpp | 3 +++ sycl/test-e2e/Regression/local-arg-align.cpp | 3 +++ 8 files changed, 22 insertions(+), 2 deletions(-) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 5f9099f793c0b..317997e9c3a8b 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -893,7 +893,7 @@ set(DPCPP_ENABLE_OPAQUE_POINTERS TRUE CACHE BOOL if (DPCPP_ENABLE_OPAQUE_POINTERS) add_definitions("-DENABLE_OPAQUE_POINTERS=1") endif(DPCPP_ENABLE_OPAQUE_POINTERS) -set(SPIRV_ENABLE_OPAQUE_POINTERS FALSE CACHE BOOL +set(SPIRV_ENABLE_OPAQUE_POINTERS TRUE CACHE BOOL "Enable opaque pointers for SPIR-V offload by default.") if(SPIRV_ENABLE_OPAQUE_POINTERS) add_definitions("-DSPIRV_ENABLE_OPAQUE_POINTERS=1") diff --git a/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp b/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp index 1e552a8aceeaa..85abf3303ec7c 100755 --- a/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp @@ -5,6 +5,9 @@ // software emulation. // UNSUPPORTED: accelerator +// FIXME: enable opaque pointers support on CPU. 
+// UNSUPPORTED: cpu + //==---------- bfloat16_conversions.cpp - SYCL bfloat16 type test ---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test-e2e/BFloat16/bfloat16_type.cpp b/sycl/test-e2e/BFloat16/bfloat16_type.cpp index 3d087d04ed898..db3a85fb670f7 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_type.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_type.cpp @@ -5,7 +5,9 @@ // TODO currently the feature isn't supported on FPGA. // UNSUPPORTED: accelerator -// + +// FIXME: enable opaque pointers support on CPU. +// UNSUPPORTED: cpu //==----------- bfloat16_type.cpp - SYCL bfloat16 type test ----------------==// // diff --git a/sycl/test-e2e/DeviceLib/string_test.cpp b/sycl/test-e2e/DeviceLib/string_test.cpp index 92377520fd4ce..be4e7ed38ca27 100644 --- a/sycl/test-e2e/DeviceLib/string_test.cpp +++ b/sycl/test-e2e/DeviceLib/string_test.cpp @@ -5,6 +5,9 @@ // RUN: %{build} -fno-builtin -fsycl-device-lib-jit-link -o %t.out // RUN: %if !gpu %{ %{run} %t.out %} +// FIXME: enable opaque pointers support on CPU. +// UNSUPPORTED: cpu + #include #include #include diff --git a/sycl/test-e2e/ESIMD/ext_math.cpp b/sycl/test-e2e/ESIMD/ext_math.cpp index 47a9e7b251532..d6aa4e5d19791 100644 --- a/sycl/test-e2e/ESIMD/ext_math.cpp +++ b/sycl/test-e2e/ESIMD/ext_math.cpp @@ -9,6 +9,9 @@ // RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out // RUN: %{run} %t.out +// FIXME: enable opaque pointers support +// REQUIRES: TEMPORARY_DISABLED + // This test checks extended math operations. 
Combinations of // - argument type - half, float // - math function - sin, cos, ..., div_ieee, pow diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp index e90f42d023616..b968b48af9497 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp @@ -2,6 +2,9 @@ // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: %{run} %t.out +// FIXME: enable opaque pointers support +// REQUIRES: TEMPORARY_DISABLED + // Test internalization of a nested array type. #include diff --git a/sycl/test-e2e/Reduction/reduction_usm.cpp b/sycl/test-e2e/Reduction/reduction_usm.cpp index 9a27956982117..eac92c670a7b1 100644 --- a/sycl/test-e2e/Reduction/reduction_usm.cpp +++ b/sycl/test-e2e/Reduction/reduction_usm.cpp @@ -7,6 +7,9 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows +// FIXME: enable opaque pointers support +// REQUIRES: TEMPORARY_DISABLED + // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with USM pointer. diff --git a/sycl/test-e2e/Regression/local-arg-align.cpp b/sycl/test-e2e/Regression/local-arg-align.cpp index d47dc375f6d6f..76c7ed1eef94f 100644 --- a/sycl/test-e2e/Regression/local-arg-align.cpp +++ b/sycl/test-e2e/Regression/local-arg-align.cpp @@ -2,6 +2,9 @@ // // RUN: %{run} %t.out +// FIXME: enable opaque pointers support +// REQUIRES: TEMPORARY_DISABLED + //==-- local-arg-align.cpp - Test for local argument alignmnent ------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
From e020f698e7436525654ff4cb860138f746db6df3 Mon Sep 17 00:00:00 2001 From: elizabethandrews Date: Tue, 8 Aug 2023 10:09:20 -0400 Subject: [PATCH 02/24] [SYCL] Ignore vec_type_hint attribute in SYCL 2020 (#10619) According to the SYCL 2020 spec, [[sycl::vec_type_hint()]] attribute should accept arguments of the type sycl::vec type. The attribute should also be accepted with non conforming lambda syntax. The current implementation in SYCL corresponds to the openCL version of this argument (with an additional spelling for SYCL), i.e. the attribute accepts extended vector type, floating point types and integral type. An error diagnostic is thrown for sycl:vec type. Since the attribute is deprecated and is not handled by any SYCL backend, and will be removed in a future version of the spec, to be minimally conformant with SYCL 2020 spec, this PR just ignores the attribute instead of adding support for sycl::vec type. Support was also added for non conforming lambda syntax --- clang/include/clang/Basic/Attr.td | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 3 +++ clang/lib/Sema/SemaDeclAttr.cpp | 7 +++-- clang/test/SemaSYCL/vec-type-hint-2.cpp | 26 +++++++++++++++++++ clang/test/SemaSYCL/vec-type-hint.cpp | 2 +- 5 files changed, 36 insertions(+), 3 deletions(-) create mode 100644 clang/test/SemaSYCL/vec-type-hint-2.cpp diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 4c877707dbf7b..618530475e6b7 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4031,6 +4031,7 @@ def VecTypeHint : InheritableAttr { let Spellings = [GNU<"vec_type_hint">, CXX11<"sycl", "vec_type_hint">]; let Args = [TypeArgument<"TypeHint">]; let Subjects = SubjectList<[Function], ErrorDiag>; + let SupportsNonconformingLambdaSyntax = 1; let Documentation = [Undocumented]; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 3db8d2409c0b0..bf4d97dd5dba3 100644 
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11979,6 +11979,9 @@ def warn_ivdep_attribute_argument : Warning< def warn_attribute_spelling_deprecated : Warning< "attribute %0 is deprecated">, InGroup; +def warn_attribute_deprecated_ignored : Warning< + "attribute %0 is deprecated; attribute ignored">, + InGroup; def note_spelling_suggestion : Note< "did you mean to use %0 instead?">; def warn_attribute_requires_non_negative_integer_argument : diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index f512c5a9880cb..5c6fb8f6ff001 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -4550,8 +4550,11 @@ static void handleSYCLIntelLoopFuseAttr(Sema &S, Decl *D, const ParsedAttr &A) { static void handleVecTypeHint(Sema &S, Decl *D, const ParsedAttr &AL) { // This attribute is deprecated without replacement in SYCL 2020 mode. - if (S.LangOpts.getSYCLVersion() > LangOptions::SYCL_2017) - S.Diag(AL.getLoc(), diag::warn_attribute_spelling_deprecated) << AL; + // Ignore the attribute in SYCL 2020. + if (S.LangOpts.getSYCLVersion() > LangOptions::SYCL_2017) { + S.Diag(AL.getLoc(), diag::warn_attribute_deprecated_ignored) << AL; + return; + } // If the attribute is used with the [[sycl::vec_type_hint]] spelling in SYCL // 2017 mode, we want to warn about using the newer name in the older diff --git a/clang/test/SemaSYCL/vec-type-hint-2.cpp b/clang/test/SemaSYCL/vec-type-hint-2.cpp new file mode 100644 index 0000000000000..820ab2bc011ee --- /dev/null +++ b/clang/test/SemaSYCL/vec-type-hint-2.cpp @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -fsycl-is-device -sycl-std=2020 -internal-isystem %S/Inputs -fsyntax-only -verify %s + +// Test which verifies [[sycl::vec_type_hint()]] is accepted +// with non-conforming lambda syntax. + +// NOTE: This attribute is not supported in the SYCL backends. 
+// To be minimally conformant with SYCL2020, attribute is +// accepted by the Clang FE with a warning. No additional +// semantic handling or IR generation is done for this +// attribute. + +#include "sycl.hpp" + +struct test {}; + +using namespace sycl; +queue q; + +void bar() { + q.submit([&](handler &h) { + h.single_task( + // expected-warning@+1 {{attribute 'vec_type_hint' is deprecated; attribute ignored}} + []() [[sycl::vec_type_hint(test)]] {}); + }); +} + diff --git a/clang/test/SemaSYCL/vec-type-hint.cpp b/clang/test/SemaSYCL/vec-type-hint.cpp index d72d0e5ec91b4..e9fccb4ae2928 100644 --- a/clang/test/SemaSYCL/vec-type-hint.cpp +++ b/clang/test/SemaSYCL/vec-type-hint.cpp @@ -7,4 +7,4 @@ // __attribute__((vec_type_hint)) is deprecated without replacement in SYCL // 2020 mode, but is allowed in SYCL 2017 and OpenCL modes. -KERNEL __attribute__((vec_type_hint(int))) void foo() {} // sycl-2020-warning {{attribute 'vec_type_hint' is deprecated}} +KERNEL __attribute__((vec_type_hint(int))) void foo() {} // sycl-2020-warning {{attribute 'vec_type_hint' is deprecated; attribute ignored}} From 39506e44b9c9f36968a4551eb7d5ad483d544bbb Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Tue, 8 Aug 2023 07:40:23 -0700 Subject: [PATCH 03/24] [CI] Remove Nightly build configuration for opaque pointers (#10723) This mode is tested by the default configuration now. --- .github/workflows/sycl_nightly.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl_nightly.yml index cf4b1a8176111..6e6223679dba9 100644 --- a/.github/workflows/sycl_nightly.yml +++ b/.github/workflows/sycl_nightly.yml @@ -31,18 +31,6 @@ jobs: # prefer widespread gzip compression. 
artifact_archive_name: sycl_linux.tar.gz - ubuntu2204_opaque_pointers_build_test: - if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build_and_test.yml - needs: test_matrix - secrets: inherit - with: - build_cache_root: "/__w/" - build_cache_suffix: opaque_pointers - build_artifact_suffix: opaque_pointers - build_configure_extra_args: "--hip --cuda --enable-esimd-emulator --cmake-opt=-DSPIRV_ENABLE_OPAQUE_POINTERS=TRUE" - merge_ref: '' - windows_default: name: Windows if: github.repository == 'intel/llvm' From 319f06780dc113a25f9baa15c510633c91ec2da7 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 8 Aug 2023 07:51:06 -0700 Subject: [PATCH 04/24] [CI] Switch pre-commit to a new scheme (#10720) Use pull_request trigger (instead of pull_request_target) for everything except AWS CUDA E2E testing. The latter has to go to a separate workflow (workflow_run) in order to have access to the AWS EC key kept as a github secret. As part of the changes, I also stopped using matrix generator for the pre-commit task. Instead, the matrix is written directly inside the task's .yml file. The only minor difference in the behavior is that driver installation happens on an image with previous driver installed, not on a system without any driver. 
--- .github/workflows/sycl_exp_precommit.yml | 93 ---------------------- .github/workflows/sycl_precommit_aws.yml | 9 ++- .github/workflows/sycl_precommit_linux.yml | 88 ++++++++++---------- 3 files changed, 56 insertions(+), 134 deletions(-) delete mode 100644 .github/workflows/sycl_exp_precommit.yml diff --git a/.github/workflows/sycl_exp_precommit.yml b/.github/workflows/sycl_exp_precommit.yml deleted file mode 100644 index a0439088cca8a..0000000000000 --- a/.github/workflows/sycl_exp_precommit.yml +++ /dev/null @@ -1,93 +0,0 @@ -name: SYCL Experimental Pre-Commit - -on: - pull_request: - branches: - - sycl - paths: - - '.github/workflows/**' - -jobs: - detect_changes: - uses: ./.github/workflows/sycl_detect_changes.yml - - lint: - needs: [detect_changes] - if: | - github.event.pull_request.head.repo.full_name == 'intel/llvm' - runs-on: [Linux, build] - container: - image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:no-drivers - options: -u 1001:1001 - steps: - - uses: actions/checkout@v3 - with: - sparse-checkout: | - devops/actions/cached_checkout - devops/actions/clang-format - devops/actions/cleanup - - name: Register cleanup after job is finished - uses: ./devops/actions/cleanup - - name: 'PR commits + 2' - run: echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 2 ))" >> "${GITHUB_ENV}" - - uses: ./devops/actions/cached_checkout - with: - path: src - fetch-depth: ${{ env.PR_FETCH_DEPTH }} - ref: ${{ github.event.pull_request.head.sha }} - merge_ref: '' - cache_path: "/__w/repo_cache/" - - name: Run clang-format - uses: ./devops/actions/clang-format - with: - path: src - - build: - needs: [lint] - if: | - always() - && (success() || contains(github.event.pull_request.labels.*.name, 'ignore-lint')) - uses: ./.github/workflows/sycl_linux_build.yml - with: - build_ref: ${{ github.sha }} - merge_ref: '' - build_cache_root: "/__w/" - build_artifact_suffix: "default" - build_cache_suffix: "default" - changes: '[]' - - test: - needs: [build, 
detect_changes] - strategy: - fail-fast: false - matrix: - include: - - name: ESIMD Emu - runner: '["Linux", "x86-cpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest - image_options: -u 1001 - target_devices: ext_intel_esimd_emulator:gpu - - name: AMD/HIP - runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest - image_options: -u 1001 --device=/dev/dri --device=/dev/kfd - target_devices: ext_oneapi_hip:gpu - - name: Intel - runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest - image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN - target_devices: ext_oneapi_level_zero:gpu;opencl:gpu;opencl:cpu - reset_gpu: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} - uses: ./.github/workflows/sycl_linux_run_tests.yml - with: - name: ${{ matrix.name }} - runner: ${{ matrix. runner }} - image: ${{ matrix.image }} - image_options: ${{ matrix.image_options }} - target_devices: ${{ matrix.target_devices }} - ref: ${{ github.sha }} - merge_ref: '' - - sycl_toolchain_artifact: sycl_linux_default - sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} - sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} diff --git a/.github/workflows/sycl_precommit_aws.yml b/.github/workflows/sycl_precommit_aws.yml index 627c59c32dacc..f9b30a104e097 100644 --- a/.github/workflows/sycl_precommit_aws.yml +++ b/.github/workflows/sycl_precommit_aws.yml @@ -1,8 +1,15 @@ name: E2E on AWS CUDA +# We have to keep pre-commit AWS CUDA testing in a separate workflow because we +# need access to AWS secret and that isn't available on pull_request jobs for +# PRs from forks. And github's "require approval for all outside collaborators" +# is bypassed on pull_request_target. +# +# Also, we use commit status and not check suite/run (which, in theory, is more +# powerful) due to https://github.com/orgs/community/discussions/24616. 
on: workflow_run: - workflows: [SYCL Experimental Pre-Commit] + workflows: [SYCL Pre Commit on Linux] types: - completed diff --git a/.github/workflows/sycl_precommit_linux.yml b/.github/workflows/sycl_precommit_linux.yml index e1492b37e0400..7e2306f428d15 100644 --- a/.github/workflows/sycl_precommit_linux.yml +++ b/.github/workflows/sycl_precommit_linux.yml @@ -1,11 +1,13 @@ name: SYCL Pre Commit on Linux on: - pull_request_target: + # We rely on "Fork pull request workflows from outside collaborators" - + # "Require approval for all outside collaborators" at + # https://github.com/intel/llvm/settings/actions for security. + pull_request: branches: - sycl - sycl-devops-pr/** - - llvmspirv_pulldown # Do not run builds if changes are only in the following locations paths-ignore: - '.github/ISSUE_TEMPLATE/**' @@ -15,24 +17,12 @@ on: - 'clang/docs/**' - '**.md' - '**.rst' - # Changes in CI won't have any effect with pull_request_target - - '.github/workflows' - # For CI-related files we explicitly skip all the jobs below even if there - # were other (non-ignored) files modified in this PR. 
- - 'devops/*/**' - -permissions: - contents: read jobs: detect_changes: uses: ./.github/workflows/sycl_detect_changes.yml lint: - needs: [detect_changes] - if: | - github.event.pull_request.head.repo.full_name == 'intel/llvm' || - !contains(needs.detect_changes.outputs.filters, 'ci') runs-on: [Linux, build] container: image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:no-drivers @@ -40,7 +30,6 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.base_ref }} sparse-checkout: | devops/actions/cached_checkout devops/actions/clang-format @@ -61,36 +50,55 @@ jobs: with: path: src - # This job generates matrix of tests for SYCL End-to-End tests - test_matrix: - needs: [detect_changes] - if: | - github.event.pull_request.head.repo.full_name == 'intel/llvm' || - !contains(needs.detect_changes.outputs.filters, 'ci') - name: Generate Test Matrix - uses: ./.github/workflows/sycl_gen_test_matrix.yml - with: - ref: ${{ github.event.pull_request.head.sha }} - lts_config: "hip_amdgpu;lin_intel;esimd_emu;cuda_aws" - - linux_default: - name: Linux - # Only build and test patches, that have passed all linter checks, because - # the next commit is likely to be a follow-up on that job. 
- needs: [lint, test_matrix, detect_changes] + build: + needs: [lint, detect_changes] if: | always() && (success() || contains(github.event.pull_request.labels.*.name, 'ignore-lint')) - && (github.event.pull_request.head.repo.full_name == 'intel/llvm' - || !contains(needs.detect_changes.outputs.filters, 'ci')) - uses: ./.github/workflows/sycl_linux_build_and_test.yml - secrets: inherit + uses: ./.github/workflows/sycl_linux_build.yml with: - build_ref: ${{ github.event.pull_request.head.sha }} - merge_ref: ${{ github.event.pull_request.base.sha }} + build_ref: ${{ github.sha }} + merge_ref: '' build_cache_root: "/__w/" build_artifact_suffix: "default" build_cache_suffix: "default" - lts_matrix: ${{ needs.test_matrix.outputs.lts_lx_matrix }} - lts_aws_matrix: ${{ needs.test_matrix.outputs.lts_aws_matrix }} changes: ${{ needs.detect_changes.outputs.filters }} + + test: + needs: [build, detect_changes] + strategy: + fail-fast: false + matrix: + include: + - name: ESIMD Emu + runner: '["Linux", "x86-cpu"]' + image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image_options: -u 1001 + target_devices: ext_intel_esimd_emulator:gpu + - name: AMD/HIP + runner: '["Linux", "amdgpu"]' + image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image_options: -u 1001 --device=/dev/dri --device=/dev/kfd + target_devices: ext_oneapi_hip:gpu + - name: Intel + runner: '["Linux", "gen12"]' + image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + target_devices: ext_oneapi_level_zero:gpu;opencl:gpu;opencl:cpu + reset_gpu: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} + uses: ./.github/workflows/sycl_linux_run_tests.yml + with: + name: ${{ matrix.name }} + runner: ${{ matrix. 
runner }} + image: ${{ matrix.image }} + image_options: ${{ matrix.image_options }} + target_devices: ${{ matrix.target_devices }} + reset_gpu: ${{ matrix.reset_gpu }} + + ref: ${{ github.sha }} + merge_ref: '' + + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} + From 5eef8c76ec9004100a917d6091171da1aa8c6e40 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Tue, 8 Aug 2023 11:00:03 -0400 Subject: [PATCH 05/24] [SYCL][InvokeSimd] Allow callables to return uniform (#10714) The spec states that returning a `uniform` object is allowed: "Return values of type sycl::ext::oneapi::experimental::uniform are not anyhow converted, and broadcast to each work-item; every work-item in the sub-group receives the same value. NOTE: sycl::ext::oneapi::experimental::uniform return type is the way to return a uniform value of simd or simd_mask type." Update the compile-time error checking and ESIMD verifier to allow this. 
Signed-off-by: Sarnie, Nick --- llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp | 1 + .../ext/oneapi/experimental/invoke_simd.hpp | 3 +- .../InvokeSimd/Spec/uniform_retval.cpp | 30 +++++++++++++++++-- sycl/test/invoke_simd/return-type-uniform.cpp | 29 ++++++++++++++++++ 4 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 sycl/test/invoke_simd/return-type-uniform.cpp diff --git a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp index e884ee8f2c57b..4d0e84091f107 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDVerifier.cpp @@ -63,6 +63,7 @@ static const char *LegalSYCLFunctions[] = { "^sycl::_V1::ext::oneapi::sub_group::.+", "^sycl::_V1::ext::oneapi::experimental::spec_constant<.+>::.+", "^sycl::_V1::ext::oneapi::experimental::this_sub_group", + "^sycl::_V1::ext::oneapi::experimental::uniform<.+>::.+", "^sycl::_V1::ext::oneapi::bfloat16::.+", "^sycl::_V1::ext::oneapi::experimental::if_architecture_is"}; diff --git a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp index 40c0efff94021..24d06806b85d5 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp @@ -371,7 +371,8 @@ constexpr bool has_struct_arg(Ret (*)(Args...)) { template constexpr bool has_struct_ret(Ret (*)(Args...)) { - return std::is_class_v && !is_simd_or_mask_type::value; + return std::is_class_v && !is_simd_or_mask_type::value && + !is_uniform_type::value; } template diff --git a/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp b/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp index 3eea0623c9319..d92cb57f783cc 100644 --- a/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp +++ b/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp @@ -7,6 +7,12 @@ // // VISALTO enable run // RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} 
%t.out +// +// RUN: %{build} -DUNIFORM_RET_TYPE -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t2.out +// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t2.out +// +// VISALTO enable run +// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t2.out /* * Test case #1 @@ -98,17 +104,35 @@ template * returning the scalar as a SIMD type seems to work fine. */ template -__attribute__((always_inline)) T +__attribute__((always_inline)) +#ifdef UNIFORM_RET_TYPE +uniform +#else +T +#endif ESIMD_CALLEE_return_uniform_scalar(esimd::simd x, T n) SYCL_ESIMD_FUNCTION { +#ifdef UNIFORM_RET_TYPE + return uniform{n}; +#else return n; +#endif } template [[intel::device_indirectly_callable]] SYCL_EXTERNAL - T __regcall SIMD_CALLEE_return_uniform_scalar(simd x, - T n) SYCL_ESIMD_FUNCTION { +#ifdef UNIFORM_RET_TYPE + uniform +#else + T +#endif + __regcall SIMD_CALLEE_return_uniform_scalar(simd x, + T n) SYCL_ESIMD_FUNCTION { +#ifdef UNIFORM_RET_TYPE + uniform r = ESIMD_CALLEE_return_uniform_scalar(x, n); +#else T r = ESIMD_CALLEE_return_uniform_scalar(x, n); +#endif return r; } diff --git a/sycl/test/invoke_simd/return-type-uniform.cpp b/sycl/test/invoke_simd/return-type-uniform.cpp new file mode 100644 index 0000000000000..13be728f32d25 --- /dev/null +++ b/sycl/test/invoke_simd/return-type-uniform.cpp @@ -0,0 +1,29 @@ +// RUN: %clangxx -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %s -o /dev/null +#include +#include +#include + +using namespace sycl::ext::oneapi::experimental; +using namespace sycl; +namespace esimd = sycl::ext::intel::esimd; + +[[intel::device_indirectly_callable]] uniform +callee(simd) SYCL_ESIMD_FUNCTION { + return uniform(5); +} + +void foo() { + constexpr unsigned Size = 1024; + constexpr unsigned GroupSize = 64; + sycl::range<1> GlobalRange{Size}; + sycl::range<1> LocalRange{GroupSize}; + sycl::nd_range<1> Range(GlobalRange, LocalRange); + queue q; + auto e = 
q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](nd_item<1> ndi) { + uniform x = invoke_simd(ndi.get_sub_group(), callee, 0); + }); + }); +} + +int main() { foo(); } From bd81fc4d38a7cb505fe41f358518d6a4679a669b Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 8 Aug 2023 08:33:34 -0700 Subject: [PATCH 06/24] [SYCL] Use pair of native::sin/cos for sincos under __FAST_MATH__ (#10481) --- sycl/include/sycl/builtins.hpp | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index 6751ef20c902d..b2fcd558328e7 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -734,8 +734,8 @@ std::enable_if_t<__FAST_MATH_GENFLOAT(T), T> sin(T x) __NOEXC { // svgenfloat sincos (svgenfloat x, genfloatptr cosval) template -std::enable_if_t< - detail::is_svgenfloat::value && detail::is_genfloatptr::value, T> +std::enable_if_t<__FAST_MATH_GENFLOAT(T) && detail::is_genfloatptr::value, + T> sincos(T x, T2 cosval) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_sincos(x, cosval); @@ -2500,6 +2500,23 @@ std::enable_if_t::value, T> cos(T x) __NOEXC { return native::cos(x); } +// svgenfloat sincos (svgenfloat x, genfloatptr cosval) +// This is a performance optimization to ensure that sincos isn't slower than a +// pair of sin/cos executed separately. Theoretically, calling non-native sincos +// might be faster than calling native::sin plus native::cos separately and we'd +// need some kind of cost model to make the right decision (and move this +// entirely to the JIT/AOT compilers). However, in practice, this simpler +// solution seems to work just fine and matches how sin/cos above are optimized +// for the fast math path. 
+template +std::enable_if_t< + detail::is_svgenfloatf::value && detail::is_genfloatptr::value, T> +sincos(T x, T2 cosval) __NOEXC { + detail::check_vector_size(); + *cosval = native::cos(x); + return native::sin(x); +} + // svgenfloatf exp (svgenfloatf x) template std::enable_if_t::value, T> exp(T x) __NOEXC { From d075bd0a0fc693d856668bf6ff53846b98d23c4a Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 8 Aug 2023 15:18:55 -0700 Subject: [PATCH 07/24] [SYCL] Fix check-all after #10635 (#10744) Single triple targets are only meant for manual run and should not be included into check-all, only check-sycl-combined-triples should be. Otherwise we are running the same tests from multiple processes resulting in race conditions (beside unnecessary work). --- sycl/test/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt index 01908bcd9c003..843790c9ba289 100644 --- a/sycl/test/CMakeLists.txt +++ b/sycl/test/CMakeLists.txt @@ -83,6 +83,7 @@ add_lit_testsuite(check-sycl-spirv "Running device-agnostic SYCL regression test PARAMS "SYCL_TRIPLE=spir64-unknown-unknown" DEPENDS ${SYCL_TEST_DEPS} ${SYCL_TEST_EXCLUDE} + EXCLUDE_FROM_CHECK_ALL ) add_lit_testsuite(check-sycl-dumps "Running ABI dump tests only" @@ -100,6 +101,7 @@ if(SYCL_BUILD_PI_CUDA) PARAMS "SYCL_TRIPLE=nvptx64-nvidia-cuda" DEPENDS ${SYCL_TEST_DEPS} ${SYCL_TEST_EXCLUDE} + EXCLUDE_FROM_CHECK_ALL ) add_custom_target(check-sycl-cuda) @@ -115,6 +117,7 @@ if(SYCL_BUILD_PI_HIP) PARAMS "SYCL_TRIPLE=nvptx64-nvidia-cuda" DEPENDS ${SYCL_TEST_DEPS} ${SYCL_TEST_EXCLUDE} + EXCLUDE_FROM_CHECK_ALL ) add_dependencies(check-sycl-hip check-sycl-hip-ptx) @@ -125,6 +128,7 @@ if(SYCL_BUILD_PI_HIP) PARAMS "SYCL_TRIPLE=amdgcn-amd-amdhsa" DEPENDS ${SYCL_TEST_DEPS} ${SYCL_TEST_EXCLUDE} + EXCLUDE_FROM_CHECK_ALL ) add_dependencies(check-sycl-hip check-sycl-hip-gcn) From 4b44182081fe48194f7f10be7ada5da0c35f3fe0 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: 
Tue, 8 Aug 2023 15:39:24 -0700 Subject: [PATCH 08/24] [CI] Add run-name for sycl_precommit_aws.yml --- .github/workflows/sycl_precommit_aws.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sycl_precommit_aws.yml b/.github/workflows/sycl_precommit_aws.yml index f9b30a104e097..d625f9298d3fa 100644 --- a/.github/workflows/sycl_precommit_aws.yml +++ b/.github/workflows/sycl_precommit_aws.yml @@ -1,4 +1,5 @@ name: E2E on AWS CUDA +run-name: E2E on AWS CUDA - ${{ github.event.workflow_run.display_title }} # We have to keep pre-commit AWS CUDA testing in a separate workflow because we # need access to AWS secret and that isn't available on pull_request jobs for # PRs from forks. And github's "require approval for all outside collaborators" From a2265a6bafb7a78f075e8904f3507b9949742f78 Mon Sep 17 00:00:00 2001 From: Isaac Ault Date: Wed, 9 Aug 2023 10:43:51 +0100 Subject: [PATCH 09/24] [SYCL][Bindless] Fix Mipmap Tests (#10713) # Fix Mipmap Tests Fixing the computation of expected output values so that the tests also work with input sizes that are not powers of 2. 
--------- Co-authored-by: Dmitry Vodopyanov --- .../bindless_images/mipmap/mipmap_read_1D.cpp | 17 ++++++++++------- .../bindless_images/mipmap/mipmap_read_2D.cpp | 14 +++++++------- .../bindless_images/mipmap/mipmap_read_3D.cpp | 6 +++--- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp index 41ff725849ad7..6a1ebb3790ab1 100644 --- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp +++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp @@ -19,22 +19,25 @@ int main() { auto ctxt = q.get_context(); // declare image data - constexpr size_t N = 16; + constexpr size_t N = 15; std::vector out(N); std::vector expected(N); std::vector dataIn1(N); std::vector dataIn2(N / 2); std::vector copyOut(N / 2); - int j = 0; + for (int i = 0; i < N; i++) { - expected[i] = i + (j + 10); - if (i % 2) - j++; + // Populate input data (to-be mipmap image layers) dataIn1[i] = sycl::float4(i, i, i, i); if (i < (N / 2)) { dataIn2[i] = sycl::float4(i + 10, i + 10, i + 10, i + 10); copyOut[i] = sycl::float4{0, 0, 0, 0}; } + + // Calculate expected output data + float norm_coord = ((i + 0.5f) / (float)N); + int x = norm_coord * (N >> 1); + expected[i] = dataIn1[i][0] + dataIn2[x][0]; } try { @@ -85,8 +88,8 @@ int main() { // Extension: read mipmap level 0 with anisotropic filtering and level 1 // with LOD sycl::float4 px1 = - sycl::ext::oneapi::experimental::read_image( - mipHandle, x, 0.0f, 0.0f); + sycl::ext::oneapi::experimental::read_image(mipHandle, + x, 0.0f); sycl::float4 px2 = sycl::ext::oneapi::experimental::read_image(mipHandle, x, 1.0f); diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp index 9120dd694e58b..079847f0d1ab9 100644 --- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp +++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp @@ -40,13 +40,13 
@@ int main() { } // Expected each x and y will repeat twice // since mipmap level 1 is half in size - int jj = 0; - for (int i = 0; i < width - 1; i += 2) { - for (int j = 0; j < height - 1; j += 2, jj++) { - expected[j + (width * i)] = jj; - expected[j + (width * (i + 1))] = jj; - expected[(j + 1) + (width * i)] = jj; - expected[(j + 1) + (width * (i + 1))] = jj; + for (int i = 0; i < width; i++) { + for (int j = 0; j < height; j++) { + float norm_coord_x = ((i + 0.5f) / (float)width); + int x = norm_coord_x * (width >> 1); + float norm_coord_y = ((j + 0.5f) / (float)height); + int y = norm_coord_y * (height >> 1); + expected[j + (width * i)] = dataIn2[y + (width / 2 * x)][0]; } } diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp index bacd6e081e9cf..c858ac57f819b 100644 --- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp +++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp @@ -19,9 +19,9 @@ int main() { auto ctxt = q.get_context(); // declare image data - size_t width = 4; - size_t height = 4; - size_t depth = 4; + size_t width = 5; + size_t height = 5; + size_t depth = 5; size_t N = width * height * depth; std::vector out(N); std::vector expected(N); From 180a92ad707bd35df9e98c1474dc52a1e9b3dead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Wed, 9 Aug 2023 10:47:18 +0100 Subject: [PATCH 10/24] [SYCL][COMPAT][Doc] Add SYCLcompat documentation (#9646) This pull request introduces a new stand alone library, `SYCLcompat`: a simplified wrapper on top of SYCL, aiming to make it more accessible to developers familiar with other heterogeneous programming models. SYCLcompat has two primary goals: - Improve the adoption of SYCL. This library is designed to provide a familiar programming interface that resembles other popular heterogeneous programming models. 
By reducing the learning curve, it enables developers to leverage SYCL's power and features more easily. - Source-to-Source Translation Support. SYCLcompat is also designed to facilitate automatic source-to-source translation from other heterogeneous programming models to SYCL and offer a more standardized and consistent programming interface. This feature can significantly streamline the migration and integration of existing codebases into the SYCL ecosystem. The first commit of this PR includes the proposed library README, providing explanation of its motivation, public interface, usage guidelines, and code examples. A set of PRs will follow, including subsets of the current implementation including their tests. We are open to any suggestions, concerns, or improvements you may have, so please, let us know if you have any. Edit: Updated from extension to stand alone library. https://github.com/intel/llvm/pull/9976 `dims.hpp` and `defs.hpp` headers --------- Co-authored-by: Gordon Brown Co-authored-by: Joe Todd Co-authored-by: Pietro Ghiglio Co-authored-by: Ruyman Reyes Co-authored-by: Steffen Larsen Co-authored-by: aelovikov-intel Co-authored-by: Sami Hatna Co-authored-by: Joe Todd Co-authored-by: Alexey Bader --- sycl/doc/index.rst | 1 + sycl/doc/syclcompat/README.md | 1285 +++++++++++++++++++++++++++++++++ 2 files changed, 1286 insertions(+) create mode 100644 sycl/doc/syclcompat/README.md diff --git a/sycl/doc/index.rst b/sycl/doc/index.rst index 8f6e0854df8f5..579e6e81d44cf 100644 --- a/sycl/doc/index.rst +++ b/sycl/doc/index.rst @@ -13,6 +13,7 @@ Using oneAPI DPC++ for Application Development PreprocessorMacros cuda/contents Extensions + syclcompat/README.md FAQ User API Reference EnvironmentVariables diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md new file mode 100644 index 0000000000000..5b000cb52dbc3 --- /dev/null +++ b/sycl/doc/syclcompat/README.md @@ -0,0 +1,1285 @@ +# SYCLcompat + +SYCLcompat is a header-only library that 
intends to help developers familiar +with other heterogeneous programming models (such as OpenMP, CUDA or HIP) to +familiarize themselves with the SYCL programming API while porting their +existing codes. Compatibility tools can also benefit from the reduced API size +when converting legacy codebases. + +SYCLcompat provides: + +* A high-level API that provides closer semantics to other programming models, +simplifying line by line conversions. +* Alternative submission APIs that encapsulate SYCL-specific "queue" and +"event" APIs for easier reference. +* Ability to gradually introduce other SYCL concepts as the user familiarises +themselves with the core SYCL API. +* Clear distinction between core SYCL API and the compatibility interface via +separate namespaces. + +## Notice + +Copyright © 2023-2023 Codeplay Software Limited. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of +The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + +## Support + +SYCLcompat depends on specific oneAPI DPC++ compiler extensions that may not be +available to all the SYCL 2020 specification implementations. + +Specifically, this library depends on the following SYCL extensions: + +* [sycl_ext_oneapi_local_memory]( + ../extensions/supported/sycl_ext_oneapi_local_memory.asciidoc) +* [sycl_ext_oneapi_complex]( + ../extensions/experimental/sycl_ext_oneapi_complex.asciidoc) +* [sycl_ext_oneapi_free_function_queries]( + ../extensions/experimental/sycl_ext_oneapi_free_function_queries.asciidoc) +* [sycl_ext_oneapi_assert]( + ../extensions/supported/sycl_ext_oneapi_assert.asciidoc) +* [sycl_ext_oneapi_enqueue_barrier]( + ../extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc) + +## Usage + +All functionality is available under the `syclcompat::` namespace, imported +through the main header, `syclcompat.hpp`. Note that `syclcompat.hpp` does not +import the header. 
+ +``` cpp +#include +``` + +This document presents the public API under the [Features](#features) section, +and provides a working [Sample code](#sample-code) using this library. Refer to +those to learn to use the library. + +## Features + +### dim3 + +SYCLcompat provides a `dim3` class akin to that of CUDA or HIP programming +models. `dim3` encapsulates other languages iteration spaces that are +represented with coordinate letters (x, y, z). + +```cpp +namespace syclcompat { + +class dim3 { +public: + const size_t x, y, z; + constexpr dim3(const sycl::range<3> &r); + constexpr dim3(const sycl::range<2> &r); + constexpr dim3(const sycl::range<1> &r); + constexpr dim3(size_t x, size_t y = 1, size_t z = 1); + + constexpr size_t size(); + + operator sycl::range<3>(); + operator sycl::range<2>(); + operator sycl::range<1>(); +}; + +// Element-wise operators +dim3 operator*(const dim3 &a, const dim3 &b); +dim3 operator+(const dim3 &a, const dim3 &b); +dim3 operator-(const dim3 &a, const dim3 &b); + +} // syclcompat +``` + +In SYCL, the fastest-moving dimension is the one with the highest index, e.g. in +a SYCL 2D range iteration space, there are two dimensions, 0 and 1, and 1 will +be the one that "moves faster". The compatibility headers for SYCL offer a +number of convenience functions that help the mapping between xyz-based +coordinates to SYCL iteration spaces in the different scopes available. 
In +addition to the global range, the following helper functions are also provided: + +``` c++ +namespace syclcompat { + +namespace local_id { +size_t x(); +size_t y(); +size_t z(); +} // namespace local_id + +namespace local_range { +size_t x(); +size_t y(); +size_t z(); +} // namespace local_range + +namespace work_group_id { +size_t x(); +size_t y(); +size_t z(); +} // namespace work_group_id + +namespace work_group_range { +size_t x(); +size_t y(); +size_t z(); +} // namespace work_group_range + +namespace global_range { +size_t x(); +size_t y(); +size_t z(); +} // namespace global_range + +namespace global_id { +size_t x(); +size_t y(); +size_t z(); +} // namespace global_id + +} // syclcompat +``` + +These translate any kernel dimensions from one convention to the other. An +example of an equivalent SYCL call for a 3D kernel using `compat` is +`syclcompat::global_id::x() == get_global_id(2)`. + +### Local Memory + +When using `compat` functions, there are two distinct interfaces to allocate +device local memory. The first interface uses the _sycl_ext_oneapi_local_memory_ +extension to leverage local memory defined at compile time. +_sycl_ext_oneapi_local_memory_ is accessed through the following wrapper: + +``` c++ +namespace syclcompat { + +template auto *local_mem(); + +} // syclcompat +``` + +`syclcompat::local_mem()` can be used as illustrated in the example +below. + +```c++ +// Sample kernel +using namespace syclcompat; +template +void local_mem_2d(int *d_A) { + // Local memory extension wrapper, size defined at compile-time + auto As = local_mem(); + int id_x = local_id::x(); + int id_y = local_id::y(); + As[id_y][id_x] = id_x * BLOCK_SIZE + id_y; + wg_barrier(); + int val = As[BLOCK_SIZE - id_y - 1][BLOCK_SIZE - id_x - 1]; + d_A[global_id::y() * BLOCK_SIZE + global_id::x()] = val; +} +``` + +The second interface allows users to allocate device local memory at runtime. 
+SYCLcompat provides this functionality through its kernel launch interface, +`launch`, defined in the following section. + +### launch + +SYCLcompat provides a kernel `launch` interface which accepts a function that +executes on the device (a.k.a "kernel") instead of a lambda/functor. It can be +called either by using a pair of "teams"/"blocks" and "threads", from +OpenMP/CUDA terminology, or using a `sycl::nd_range`. The interface accepts a +device _function_ with the use of an `auto F` template parameter, and a variadic +`Args` for the function's arguments. + +Various overloads for `launch` exist to permit the user to launch on a +specific `queue`, or to define dynamically sized device local memory. + +``` c++ +namespace syclcompat { + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, + sycl::queue q, Args... args); + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, + sycl::queue q, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, size_t mem_size, + sycl::queue q, Args... args); + +template +sycl::event launch(const sycl::nd_range &range, size_t mem_size, + Args... args); + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, + size_t mem_size, sycl::queue q, Args... args); + +template +sycl::event launch(const dim3 &grid, const dim3 &threads, + size_t mem_size, Args... 
args); + +} // syclcompat +``` + +For example, if the user had an existing function named `vectorAdd` to execute +on a device such as follows: + +``` c++ +void vectorAdd(const float *A, const float *B, float *C, int n); +``` + +using SYCLcompat, the user can call it as follows: + +``` c++ +syclcompat::launch(blocksPerGrid, threadsPerBlock, d_A, d_B, d_C, n); +``` + +which would be equivalent to the following call using a `sycl::nd_range`: + +``` c++ +auto range = sycl::nd_range<3>{blocksPerGrid * threadsPerBlock, + threadsPerBlock}; +syclcompat::launch(range, d_A, d_B, d_C, n); +``` + +For dynamic local memory allocation, `launch` injects a pointer to a +local `char *` accessor of `mem_size` as the last argument of the kernel +function. For example, the previous function named `vectorAdd` can be modified +with the following signature, which adds a `char *` pointer to access local +memory inside the kernel: + +``` c++ +void vectorAdd(const float *A, const float *B, float *C, int n, + char *local_mem); +``` + +Then, `vectorAdd` can be launched like this: + +``` c++ +syclcompat::launch(blocksPerGrid, threadsPerBlock, mem_size, d_A, + d_B, d_C, n); +``` + +or this: + +``` c++ +auto range = sycl::nd_range<3>{globalSize, localSize}; +syclcompat::launch(range, mem_size, d_A, d_B, d_C, n); +``` + +This `launch` interface allows users to define an internal memory pool, or +scratchpad, that can then be reinterpreted as the datatype required by the user +within the kernel function. + +### Utilities + +SYCLcompat introduces a set of utility functions designed to streamline the +usage of the library and its `launch` mechanism. + +The first utility function is `syclcompat::wg_barrier()`, which provides a +concise work-group barrier. `syclcompat::wg_barrier()` uses the +_SYCL_INTEL_free_function_queries_ extension to provide this functionality. 
+ +The second utility function, `syclcompat::compute_nd_range`, ensures that the +provided global size and work group sizes are appropriate for a given +dimensionality, and that global size is rounded up to a multiple of the work +group size in each dimension. + +```c++ +namespace syclcompat { + +void wg_barrier(); + +template +sycl::nd_range compute_nd_range(sycl::range global_size_in, + sycl::range work_group_size); +sycl::nd_range<1> compute_nd_range(int global_size_in, int work_group_size); + +} // syclcompat +``` + +### Queues + +The design for this library assumes _in-order_ queues +(`sycl::property::queue::in_order()`). + +Many of the APIs accept an optional `queue` parameter, and this can be an +out-of-order queue, either created manually or retrieved via a call to +`syclcompat::create_queue()`, specifying `false` for the `in_order` parameter. + +```c++ +namespace syclcompat { + +sycl::queue create_queue(bool print_on_async_exceptions = false, + bool in_order = true); + +} // syclcompat +``` + +However, SYCLcompat does not implement any mechanisms to deal with this case. +The rationale for this is that a user wanting the full power of SYCL's +dependency management shouldn't be using this library. As such, support for +out-of-order queues is very limited. The only way to safely use an out-of-order +queue at present is to explicitly `q.wait()` or `e.wait()` where `e` is the +`sycl::event` returned through a `syclcompat::async` API. + +To facilitate machine translation from other heterogeneous programming models to +SYCL, SYCLcompat provides the following pointer aliases for `sycl::event` and +`sycl::queue`, and the function `destroy_event` which destroys an `event_ptr` +allocated on the heap. 
+ +``` c++ +namespace syclcompat { + +using event_ptr = sycl::event *; + +using queue_ptr = sycl::queue *; + +static void destroy_event(event_ptr event); + +} // syclcompat +``` + +### Memory Allocation + +This library provides interfaces to allocate memory to be accessed within kernel +functions and on the host. The `syclcompat::malloc` function allocates device +USM memory, the `syclcompat::malloc_host` function allocates host USM memory, +and the `syclcompat::malloc_shared` function allocates shared USM memory. + +In each case we provide a template and non-templated interface for allocating +memory, taking the number of elements or number of bytes respectively. + +The interface includes both synchronous and asynchronous `malloc`, `memcpy`, +`memset`, `fill`, and `free` operations. + +There is a helper class `pointer_attributes` to query allocation type for memory +pointers using SYCLcompat, through `sycl::usm::alloc` and +`sycl::get_pointer_device`. + +``` c++ +namespace syclcompat { + +// Expects number of elements +template +T *malloc(size_t count, sycl::queue q = get_default_queue()); +template +T *malloc_host(size_t count, sycl::queue q = get_default_queue()); +template +T *malloc_shared(size_t count, sycl::queue q = get_default_queue()); + +// Expects size of the memory in bytes +void *malloc(size_t num_bytes, sycl::queue q = get_default_queue()); +void *malloc_host(size_t num_bytes, sycl::queue q = get_default_queue()); +void *malloc_shared(size_t num_bytes, sycl::queue q = get_default_queue()); + +// 2D, 3D memory allocation wrappers +void *malloc(size_t &pitch, size_t x, size_t y, + sycl::queue q = get_default_queue()) +pitched_data malloc(sycl::range<3> size, sycl::queue q = get_default_queue()); + +// Blocking memcpy +void memcpy(void *to_ptr, const void *from_ptr, size_t size, + sycl::queue q = get_default_queue()); +void memcpy(T *to_ptr, const T *from_ptr, size_t count, + sycl::queue q = get_default_queue()); +void memcpy(void *to_ptr, size_t 
to_pitch, const void *from_ptr, + size_t from_pitch, size_t x, size_t y, + sycl::queue q = get_default_queue()); // 2D matrix +void memcpy(pitched_data to, sycl::id<3> to_pos, + pitched_data from, sycl::id<3> from_pos, + sycl::range<3> size, + sycl::queue q = get_default_queue()); // 3D matrix + +// Non-blocking memcpy +sycl::event memcpy_async(void *to_ptr, const void *from_ptr, size_t size, + sycl::queue q = get_default_queue()); +template +sycl::event memcpy_async(T *to_ptr, const T *from_ptr, size_t count, + sycl::queue q = get_default_queue()); +sycl::event memcpy_async(void *to_ptr, size_t to_pitch, + const void *from_ptr, size_t from_pitch, + size_t x, size_t y, + sycl::queue q = get_default_queue()); // 2D matrix +sycl::event memcpy_async(pitched_data to, sycl::id<3> to_pos, + pitched_data from, sycl::id<3> from_pos, + sycl::range<3> size, + sycl::queue q = get_default_queue()); // 3D matrix + +// Fill +template +void fill(void *dev_ptr, const T &pattern, size_t count, + sycl::queue q = get_default_queue()); +template +sycl::event fill_async(void *dev_ptr, const T &pattern, + size_t count, sycl::queue q = get_default_queue()); + +// Memset +void memset(void *dev_ptr, int value, size_t size, + sycl::queue q = get_default_queue()); +void memset(void *ptr, size_t pitch, int val, size_t x, size_t y, + sycl::queue q = get_default_queue()); // 2D matrix +void memset(pitched_data pitch, int val, sycl::range<3> size, + sycl::queue q = get_default_queue()); // 3D matrix +sycl::event memset_async(void *dev_ptr, int value, size_t size, + sycl::queue q = get_default_queue()); +sycl::event memset_async(void *ptr, size_t pitch, int val, + size_t x, size_t y, + sycl::queue q = get_default_queue()); // 2D matrix +sycl::event memset_async(pitched_data pitch, int val, + sycl::range<3> size, + sycl::queue q = get_default_queue()); // 3D matrix + +void free(void *ptr, sycl::queue q = get_default_queue()); +sycl::event free_async(const std::vector &pointers, + const std::vector 
&events, + sycl::queue q = get_default_queue()); + +// Queries pointer allocation type +class pointer_attributes { +public: + void init(const void *ptr, sycl::queue q = get_default_queue()); + sycl::usm::alloc get_memory_type(); + const void *get_device_pointer(); + const void *get_host_pointer(); + bool is_memory_shared(); + unsigned int get_device_id(); +}; + +} // syclcompat +``` + +Finally, the class `pitched_data`, which manages memory allocation for 3D +spaces, padded to avoid uncoalesced memory accesses. + +```c++ +namespace syclcompat { + +class pitched_data { +public: + pitched_data(); + pitched_data(void *data, size_t pitch, size_t x, size_t y); + + void *get_data_ptr(); + size_t get_pitch(); + size_t get_x(); + size_t get_y(); + + void set_data_ptr(void *data); + void set_pitch(size_t pitch); + void set_x(size_t x); + void set_y(size_t y); +}; + +} // syclcompat +``` + +There are various helper classes and aliases defined within SYCLcompat to +encapsulate and define memory operations and objects. These classes and aliases +are primarily designed to assist with machine translation from other +heterogeneous programming models. + +The wrapper class `device_memory` provides a unified representation for device +memory in various regions. The class provides methods to allocate memory for the +object (`init()`) and access the underlying memory in various ways (`get_ptr()`, +`get_access()`, `operator[]`). Aliases for global and USM shared specializations +are provided. + +The `memory_traits` class is provided as a traits helper for `device_memory`. +The `accessor` class template provides a 2D or 3D `sycl::accessor`-like wrapper +around raw pointers. 
+ +```c++ +namespace syclcompat { + +enum class memory_region { + global = 0, // device global memory + constant, // device read-only memory + local, // device local memory + usm_shared, // memory which can be accessed by host and device +}; + +using byte_t = uint8_t; + +enum class target { device, local }; + +template class memory_traits { +public: + static constexpr sycl::access::address_space asp = + (Memory == memory_region::local) + ? sycl::access::address_space::local_space + : sycl::access::address_space::global_space; + static constexpr target target = + (Memory == memory_region::local) + ? target::local + : target::device; + static constexpr sycl::access_mode mode = + (Memory == memory_region::constant) + ? sycl::access_mode::read + : sycl::access_mode::read_write; + static constexpr size_t type_size = sizeof(T); + using element_t = + typename std::conditional_t; + using value_t = typename std::remove_cv_t; + template + using accessor_t = typename std::conditional_t< + target == target::local, + sycl::local_accessor, + sycl::accessor>; + using pointer_t = T *; +}; + +template class device_memory { +public: + using accessor_t = + typename memory_traits::template accessor_t; + using value_t = typename memory_traits::value_t; + using syclcompat_accessor_t = + syclcompat::accessor; + + device_memory(); + + device_memory(const sycl::range &in_range, + std::initializer_list &&init_list); + + template + device_memory( + const typename std::enable_if>::type &in_range, + std::initializer_list> &&init_list); + + device_memory(const sycl::range &range_in); + + // Variadic constructor taking 1, 2 or 3 integers to be interpreted as a + // sycl::range. + template + device_memory(Args... Arguments); + + ~device_memory(); + + // Allocate memory with default queue, and init memory if has initial value. + void init(); + // Allocate memory with specified queue, and init memory if has initial + // value. 
+ void init(sycl::queue q); + + // The variable is assigned to a device pointer. + void assign(value_t *src, size_t size); + + // Get memory pointer of the memory object, which is virtual pointer when + // usm is not used, and device pointer when usm is used. + value_t *get_ptr(); + // Get memory pointer of the memory object, which is virtual pointer when + // usm is not used, and device pointer when usm is used. + value_t *get_ptr(sycl::queue q); + + // Get the device memory object size in bytes. + size_t get_size(); + + template + typename std::enable_if::type &operator[](size_t index); + + // Get accessor with dimension info for the device memory object + // when usm is used and dimension is greater than 1. + template + typename std::enable_if::type + get_access(sycl::handler &cgh); +}; + + +template +class device_memory : public device_memory { +public: + using base = device_memory; + using value_t = typename base::value_t; + using accessor_t = + typename memory_traits::template accessor_t<0>; + device_memory(const value_t &val); + device_memory(); +}; + +template +using global_memory = device_memory; +template +using constant_memory = detail::device_memory; +template +using shared_memory = device_memory; + + +template class accessor; + +template class accessor { +public: + using memory_t = memory_traits; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template accessor_t<3>; + + accessor(pointer_t data, const sycl::range<3> &in_range); + template + accessor(typename std::enable_if::type &acc); + accessor(const accessor_t &acc, const sycl::range<3> &in_range); + + accessor operator[](size_t index) const; + + pointer_t get_ptr() const; + +}; + +template class accessor { +public: + using memory_t = memory_traits; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template 
accessor_t<2>; + + accessor(pointer_t data, const sycl::range<2> &in_range); + template + accessor(typename std::enable_if::type &acc); + accessor(const accessor_t &acc, const sycl::range<2> &in_range); + + pointer_t operator[](size_t index); + + pointer_t get_ptr() const; +}; + +} // syclcompat +``` + +### Device Information + +`sycl::device` properties are encapsulated using the `device_info` helper class. +The class is meant to be constructed and used through the extended device +implemented in SYCLcompat. + +This is the synopsis of `device_info`: + +```c++ +class device_info { +public: + const char *get_name(); + char *get_name(); + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() const; + + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() const; + int get_major_version() const; + int get_minor_version() const; + int get_integrated() const; + int get_max_clock_frequency() const; + int get_max_compute_units() const; + int get_max_work_group_size() const; + int get_max_sub_group_size() const; + int get_max_work_items_per_compute_unit() const; + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size() const; + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size(); + size_t get_global_mem_size() const; + size_t get_local_mem_size() const; + +void set_name(const char *name); + void set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes); + void set_major_version(int major); + void set_minor_version(int minor); + void set_integrated(int integrated); + void set_max_clock_frequency(int frequency); + void set_max_compute_units(int max_compute_units); + void set_global_mem_size(size_t global_mem_size); + void set_local_mem_size(size_t local_mem_size); + void set_max_work_group_size(int max_work_group_size); + void set_max_sub_group_size(int max_sub_group_size); + void + set_max_work_items_per_compute_unit(int 
max_work_items_per_compute_unit); + void set_max_nd_range_size(int max_nd_range_size[]); +}; +``` + +### Device Management + +Multiple SYCL functionalities are exposed through utility functions to manage +the current `sycl::device`, `sycl::queue`, and `sycl::context`, exposed as +follows: + +```c++ +namespace syclcompat { + +// Util function to create a new queue for the current device +sycl::queue create_queue(bool print_on_async_exceptions = false, + bool in_order = true); + +// Util function to get the default queue of current device in +// device manager. +sycl::queue get_default_queue(); + +// Util function to wait for the queued kernels. +void wait(sycl::queue q = get_default_queue()); + +// Util function to wait for the queued kernels and throw unhandled errors. +void wait_and_throw(sycl::queue q = get_default_queue()); + +// Util function to get the id of current device in +// device manager. +unsigned int get_current_device_id(); + +// Util function to get the current device. +device_ext &get_current_device(); + +// Util function to get a device by id. +device_ext &get_device(unsigned int id); + +// Util function to get the context of the default queue of current +// device in device manager. +sycl::context get_default_context(); + +// Util function to get a CPU device. +device_ext &cpu_device(); + +// Util function to select a device by its id +unsigned int select_device(unsigned int id); + +} // syclcompat +``` + +The exposed functionalities include creation and destruction of queues, through +`syclcompat::create_queue` and `syclcompat::destroy_queue`, and providing the +ability to wait for submitted kernels using `syclcompat::wait` or +`syclcompat::wait_and_throw`. Any async errors will be output to `stderr` if +`print_on_async_exceptions`. Synchronous exceptions have to be managed by users +independently of what is set in this parameter. + +Devices are managed through a helper class, `device_ext`. 
The `device_ext` class +associates a vector of `sycl::queues` with its `sycl::device`. The `device_ext` +destructor waits on a set of `sycl::event` which can be added to via +`add_event`. This is used, for example, to implement `syclcompat::free_async` to +schedule release of memory after a kernel or `mempcy`. SYCL device properties +can be queried through `device_ext` as well. + +The class is exposed as follows: + +```c++ +namespace syclcompat { + +class device_ext : public sycl::device { + device_ext(); + device_ext(const sycl::device &base); + ~device_ext(); + + bool is_native_host_atomic_supported(); + int get_major_version(); + int get_minor_version(); + int get_max_compute_units(); + int get_max_clock_frequency(); + int get_integrated(); + void get_device_info(device_info &out); + + device_info get_device_info(); + void reset(); + + sycl::queue *default_queue(); + void queues_wait_and_throw(); + sycl::queue *create_queue(bool print_on_async_exceptions = false, + bool in_order = true); + void destroy_queue(sycl::queue *&queue); + void set_saved_queue(sycl::queue *q); + sycl::queue *get_saved_queue(); + sycl::context get_context(); +}; + +} // syclcompat +``` + +#### Multiple devices + +SYCLcompat allows you to manage multiple devices through +`syclcompat::select_device` and `syclcompat::create_queue`. The library uses the +default SYCL device (i.e. the device returned by `sycl::default_selector_v`) as +the default device, and exposes all other devices available on the system +through the `syclcompat::select_device(unsigned int id)` member function. + +The interface uses the `syclcompat::device_ext::get_current_device_id()` to get +the current CPU thread, and returns the associated device stored internally as a +map with that thread. The map is constructed using calls to +`syclcompat::select_device(unsigned int id)`. Any thread which hasn't used this +member function to select a device will be given the default device. 
Note that +this implies multiple threads on a single device by default. + +Be aware that targeting multiple devices may lead to unintended behavior caused +by developers, as SYCLcompat does not implement a mechanism to warn when the +wrong queue is used as an argument in any of the member functions of the +`syclcompat` namespace. + +#### Atomic Operations + +SYCLcompat provides an interface for common atomic operations (`add`, `sub`, +`and`, `or`, `xor`, `min`, `max`, `exchange`, `compare_exchange`). While SYCL +exposes atomic operations through member functions of `sycl::atomic_ref`, this +library provides access via functions taking a standard pointer argument. +Template arguments control the `sycl::memory_scope`, `sycl::memory_order` and +`sycl::access::address_space` of these atomic operations. SYCLcompat also +exposes overloads for these atomic functions which take a runtime memoryScope +argument. Every atomic operation is implemented via an API function taking a raw +pointer as the target. Additional overloads for +`syclcompat::compare_exchange_strong` are provided which take a +`sycl::multi_ptr` instead of a raw pointer. Addition and subtraction make use of +`arith_t` to differentiate between numeric and pointer arithmetic. 
+ +The available operations are exposed as follows: + +``` c++ +namespace syclcompat { + +template struct arith { + using type = std::conditional_t, std::ptrdiff_t, T>; +}; +template using arith_t = typename arith::type; + +template +T atomic_fetch_add(T *addr, arith_t operand); +template +T atomic_fetch_add(T *addr, arith_t operand, + sycl::memory_order memoryOrder); + +template +T atomic_fetch_sub(T *addr, arith_t operand); +template +T atomic_fetch_sub(T *addr, arith_t operand, + sycl::memory_order memoryOrder); + +template +T atomic_fetch_and(T *addr, T operand); +template +T atomic_fetch_and(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_or(T *addr, T operand); +template +T atomic_fetch_or(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_xor(T *addr, T operand); +template +T atomic_fetch_xor(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_min(T *addr, T operand); +template +T atomic_fetch_min(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_fetch_max(T *addr, T operand); +template +T atomic_fetch_max(T *addr, T operand, sycl::memory_order memoryOrder); + +template +unsigned int atomic_fetch_compare_inc(unsigned int *addr, + unsigned int operand); +template +unsigned int atomic_fetch_compare_inc(unsigned int *addr, + unsigned int operand, + sycl::memory_order memoryOrder); + +template +T atomic_exchange(T *addr, T operand); +template +T atomic_exchange(T *addr, T operand, sycl::memory_order memoryOrder); + +template +T atomic_compare_exchange_strong( + sycl::multi_ptr addr, + T expected, T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed); +template +T atomic_compare_exchange_strong( + T *addr, T expected, T desired, + sycl::memory_order success = sycl::memory_order::relaxed, + sycl::memory_order fail = sycl::memory_order::relaxed); + +} // namespace syclcompat 
+``` + +### Compatibility Utilities + +This library provides a number of small compatibility utilities which exist to +facilitate machine translation of code from other programming models to SYCL. +These functions are part of the public API, but they are not expected to be +useful to developers writing their own code. + +Functionality is provided to represent a pair of integers as a `double`. +`cast_ints_to_double(int, int)` returns a `double` containing the given integers +in the high & low 32-bits respectively. `cast_double_to_int` casts the high or +low 32-bits back into an integer. + +`syclcompat::fast_length` provides a wrapper to SYCL's +`fast_length(sycl::vec)` that accepts arguments for a C++ array and a +length. + +`vectorized_max` and `vectorized_min` are binary operations returning the +max/min of two arguments, where each argument is treated as a `sycl::vec` type. +`vectorized_isgreater` performs elementwise `isgreater`, treating each argument +as a vector of elements, and returning `0` for vector components for which +`isgreater` is false, and `-1` when true. + +`reverse_bits` reverses the bits of a 32-bit unsigned integer, `ffs` returns the +position of the first least significant set bit in an integer. +`byte_level_permute` returns a byte-permutation of two input unsigned integers, +with bytes selected according to a third unsigned integer argument. + +There is also an `experimental::logical_group` class which allows +`sycl::sub_group`s to be further subdivided into 'logical' groups to perform +sub-group level operations. This class provides methods to get the local & group +id and range. The functions `select_from_sub_group`, `shift_sub_group_left`, +`shift_sub_group_right` and `permute_sub_group_by_xor` provide equivalent +functionality to `sycl::select_from_group`, `sycl::shift_group_left`, +`sycl::shift_group_right` and `sycl::permute_group_by_xor`, respectively. 
+However, they provide an optional argument to represent the `logical_group` size +(default 32). + +The functions `cmul`,`cdiv`,`cabs`, and `conj` define complex math operations +which accept `sycl::vec` arguments representing complex values. + +```c++ +namespace syclcompat { + +inline int cast_double_to_int(double d, bool use_high32 = true); + +inline double cast_ints_to_double(int high32, int low32); + +inline float fast_length(const float *a, int len); + +template inline T vectorized_max(T a, T b); + +template inline T vectorized_min(T a, T b); + +template inline T vectorized_isgreater(T a, T b); + +template <> +inline unsigned vectorized_isgreater(unsigned a, + unsigned b); + +template inline T reverse_bits(T a); + +inline unsigned int byte_level_permute(unsigned int a, unsigned int b, + unsigned int s); + +template inline int ffs(T a); + +template +T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id, + int logical_sub_group_size = 32); + +template +T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta, + int logical_sub_group_size = 32); + +template +T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta, + int logical_sub_group_size = 32); + +template +T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, + int logical_sub_group_size = 32); + +template +sycl::vec cmul(sycl::vec x, sycl::vec y); + +template +sycl::vec cdiv(sycl::vec x, sycl::vec y); + +template T cabs(sycl::vec x); + +template sycl::vec conj(sycl::vec x); + +} // namespace syclcompat +``` + +The function `experimental::nd_range_barrier` synchronizes work items from all +work groups within a SYCL kernel. This is not officially supported by the SYCL +spec, and so should be used with caution. 
+ +```c++ +namespace syclcompat { +namespace experimental { + +template +inline void nd_range_barrier( + sycl::nd_item item, + sycl::atomic_ref &counter); + +template <> +inline void nd_range_barrier( + sycl::nd_item<1> item, + sycl::atomic_ref &counter); + +class logical_group { +public: + logical_group(sycl::nd_item<3> item, sycl::group<3> parent_group, + uint32_t size); + uint32_t get_local_linear_id() const; + uint32_t get_group_linear_id() const; + uint32_t get_local_linear_range() const; + uint32_t get_group_linear_range() const; +}; + +} // namespace experimental +} // namespace syclcompat +``` + +To assist machine translation, helper aliases are provided for inlining and +alignment attributes. The class template declarations `sycl_compat_kernel_name` +and `sycl_compat_kernel_scalar` are used to assist automatic generation of +kernel names during machine translation. + +`get_sycl_language_version` returns an integer representing the version of the +SYCL spec supported by the current SYCL compiler. + +``` c++ +namespace syclcompat { + +#define __sycl_compat_align__(n) __attribute__((aligned(n))) +#define __sycl_compat_inline__ __inline__ __attribute__((always_inline)) + +#define __sycl_compat_noinline__ __attribute__((noinline)) + +template class sycl_compat_kernel_name; +template class sycl_compat_kernel_scalar; + +int get_sycl_language_version(); + +} // namespace syclcompat +``` + +#### Kernel Helper Functions + +Kernel helper functions provide a structure `kernel_function_info` to keep SYCL +kernel information, and provide a utility function `get_kernel_function_info()` +to get the kernel information. Overloads are provided to allow either returning +a `kernel_function_info` object, or to return by pointer argument. In the +current version, `kernel_function_info` describes only maximum work-group size. 
+ +``` c++ +namespace syclcompat { + +struct kernel_function_info { + int max_work_group_size = 0; +}; + +static void get_kernel_function_info(kernel_function_info *kernel_info, + const void *function); +static kernel_function_info get_kernel_function_info(const void *function); +} // namespace syclcompat +``` + +## Sample Code + +Below is a simple linear algebra sample, which computes `y = mx + b` implemented +using this library: + +``` c++ +#include +#include + +#include +#include + +/** + * Slope intercept form of a straight line equation: Y = m * X + b + */ +template +void slope_intercept(float *Y, float *X, float m, float b, size_t n) { + + // Block index + size_t bx = syclcompat::work_group_id::x(); + // Thread index + size_t tx = syclcompat::local_id::x(); + + size_t i = bx * BLOCK_SIZE + tx; + // or i = syclcompat::global_id::x(); + if (i < n) + Y[i] = m * X[i] + b; +} + +void check_memory(void *ptr, std::string msg) { + if (ptr == nullptr) { + std::cerr << "Failed to allocate memory: " << msg << std::endl; + exit(EXIT_FAILURE); + } +} + +/** + * Program main + */ +int main(int argc, char **argv) { + std::cout << "Simple Kernel example" << std::endl; + + constexpr size_t n_points = 32; + constexpr float m = 1.5f; + constexpr float b = 0.5f; + + int block_size = 32; + if (block_size > syclcompat::get_current_device() + .get_info()) + block_size = 16; + + std::cout << "block_size = " << block_size << ", n_points = " << n_points + << std::endl; + + // Allocate host memory for vectors X and Y + size_t mem_size = n_points * sizeof(float); + float *h_X = (float *)syclcompat::malloc_host(mem_size); + float *h_Y = (float *)syclcompat::malloc_host(mem_size); + check_memory(h_X, "h_X allocation failed."); + check_memory(h_Y, "h_Y allocation failed."); + + // Alternative templated allocation for the expected output + float *h_expected = syclcompat::malloc_host(n_points); + check_memory(h_expected, "Not enough for h_expected."); + + // Initialize host memory & expected 
output + for (size_t i = 0; i < n_points; i++) { + h_X[i] = i + 1; + h_expected[i] = m * h_X[i] + b; + } + + // Allocate device memory + float *d_X = (float *)syclcompat::malloc(mem_size); + float *d_Y = (float *)syclcompat::malloc(mem_size); + check_memory(d_X, "d_X allocation failed."); + check_memory(d_Y, "d_Y allocation failed."); + + // copy host memory to device + syclcompat::memcpy(d_X, h_X, mem_size); + + size_t threads = block_size; + size_t grid = n_points / block_size; + + std::cout << "Computing result using SYCL Kernel... "; + if (block_size == 16) { + syclcompat::launch>(grid, threads, d_Y, d_X, m, b, + n_points); + } else { + syclcompat::launch>(grid, threads, d_Y, d_X, m, b, + n_points); + } + syclcompat::wait(); + std::cout << "DONE" << std::endl; + + // Async copy result from device to host + syclcompat::memcpy_async(h_Y, d_Y, mem_size).wait(); + + // Check output + for (size_t i = 0; i < n_points; i++) { + assert(h_Y[i] - h_expected[i] < 1e6); + } + + // Clean up memory + syclcompat::free(h_X); + syclcompat::free(h_Y); + syclcompat::free(h_expected); + syclcompat::free(d_X); + syclcompat::free(d_Y); + + return 0; +} +``` + +## Maintainers + +To report problems with this library, please open a new issue with the [COMPAT] +tag at: + + + +## Contributors + +Alberto Cabrera, Codeplay \ +Gordon Brown, Codeplay \ +Joe Todd, Codeplay \ +Pietro Ghiglio, Codeplay \ +Ruyman Reyes, Codeplay/Intel + +## Contributions + +This library is licensed under the Apache 2.0 license. If you have an idea for a +new sample, different build system integration or even a fix for something that +is broken, please get in contact. From f7b00b752506b87414fd026b3010703e85e0d733 Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Wed, 9 Aug 2023 02:56:19 -0700 Subject: [PATCH 11/24] [SYCL] Fix sycl::vec unary ops (#10722) The recent sycl::vec changes (https://github.com/intel/llvm/pull/9492) broke they unary operations. 
This PR fixes them and adds some testing to avoid that in the future. --- sycl/include/sycl/types.hpp | 140 +++++++++++++++++++++----------- sycl/test/basic_tests/types.cpp | 63 ++++++++++++++ 2 files changed, 154 insertions(+), 49 deletions(-) diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp index d3f63637e3028..37ff4cf3d6438 100644 --- a/sycl/include/sycl/types.hpp +++ b/sycl/include/sycl/types.hpp @@ -583,13 +583,17 @@ template class vec { // vector extension. This is for MSVC compatibility, which has a max alignment // of 64 for direct params. If we drop MSVC, we can have alignment the same as // size and use vector extensions for all sizes. - static constexpr bool IsUsingArray = + static constexpr bool IsUsingArrayOnDevice = (IsHostHalf || IsSizeGreaterThanMaxAlign); #if defined(__SYCL_DEVICE_ONLY__) - static constexpr bool NativeVec = NumElements > 1 && !IsUsingArray; + static constexpr bool NativeVec = NumElements > 1 && !IsUsingArrayOnDevice; + static constexpr bool IsUsingArrayOnHost = + false; // we are not compiling for host. #else static constexpr bool NativeVec = false; + static constexpr bool IsUsingArrayOnHost = + true; // host always uses std::array. 
#endif static constexpr int getNumElements() { return NumElements; } @@ -770,6 +774,15 @@ template class vec { return *this; } + template + using EnableIfUsingArray = + typename std::enable_if_t; + + template + using EnableIfNotUsingArray = + typename std::enable_if_t; + #ifdef __SYCL_DEVICE_ONLY__ template using EnableIfNotHostHalf = typename std::enable_if_t; @@ -778,13 +791,15 @@ template class vec { using EnableIfHostHalf = typename std::enable_if_t; template - using EnableIfUsingArray = typename std::enable_if_t; + using EnableIfUsingArrayOnDevice = + typename std::enable_if_t; template - using EnableIfNotUsingArray = typename std::enable_if_t; + using EnableIfNotUsingArrayOnDevice = + typename std::enable_if_t; template - explicit constexpr vec(const EnableIfNotUsingArray &arg) + explicit constexpr vec(const EnableIfNotUsingArrayOnDevice &arg) : m_Data{DataType(vec_data::get(arg))} {} template @@ -792,13 +807,13 @@ template class vec { std::is_fundamental_v> || std::is_same_v, half>, vec &> - operator=(const EnableIfNotUsingArray &Rhs) { + operator=(const EnableIfNotUsingArrayOnDevice &Rhs) { m_Data = (DataType)vec_data::get(Rhs); return *this; } template - explicit constexpr vec(const EnableIfUsingArray &arg) + explicit constexpr vec(const EnableIfUsingArrayOnDevice &arg) : vec{detail::RepeatValue( static_cast>(arg)), std::make_index_sequence()} {} @@ -808,7 +823,7 @@ template class vec { std::is_fundamental_v> || std::is_same_v, half>, vec &> - operator=(const EnableIfUsingArray &Rhs) { + operator=(const EnableIfUsingArrayOnDevice &Rhs) { for (int i = 0; i < NumElements; ++i) { setValue(i, Rhs); } @@ -844,22 +859,22 @@ template class vec { std::is_convertible_v && NumElements == IdxNum, DataT>; template constexpr vec(const EnableIfMultipleElems<2, Ty> Arg0, - const EnableIfNotUsingArray Arg1) + const EnableIfNotUsingArrayOnDevice Arg1) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1)} {} template constexpr vec(const EnableIfMultipleElems<3, Ty> Arg0, - 
const EnableIfNotUsingArray Arg1, const DataT Arg2) + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), vec_data::get(Arg2)} {} template constexpr vec(const EnableIfMultipleElems<4, Ty> Arg0, - const EnableIfNotUsingArray Arg1, const DataT Arg2, + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, const Ty Arg3) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), vec_data::get(Arg2), vec_data::get(Arg3)} {} template constexpr vec(const EnableIfMultipleElems<8, Ty> Arg0, - const EnableIfNotUsingArray Arg1, const DataT Arg2, + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, const DataT Arg3, const DataT Arg4, const DataT Arg5, const DataT Arg6, const DataT Arg7) : m_Data{vec_data::get(Arg0), vec_data::get(Arg1), @@ -868,7 +883,7 @@ template class vec { vec_data::get(Arg6), vec_data::get(Arg7)} {} template constexpr vec(const EnableIfMultipleElems<16, Ty> Arg0, - const EnableIfNotUsingArray Arg1, const DataT Arg2, + const EnableIfNotUsingArrayOnDevice Arg1, const DataT Arg2, const DataT Arg3, const DataT Arg4, const DataT Arg5, const DataT Arg6, const DataT Arg7, const DataT Arg8, const DataT Arg9, const DataT ArgA, const DataT ArgB, @@ -908,7 +923,7 @@ template class vec { std::is_same::value && !std::is_same::value>> constexpr vec(vector_t openclVector) { - if constexpr (!IsUsingArray) { + if constexpr (!IsUsingArrayOnDevice) { m_Data = openclVector; } else { m_Data = bit_cast(openclVector); @@ -916,7 +931,7 @@ template class vec { } operator vector_t() const { - if constexpr (!IsUsingArray) { + if constexpr (!IsUsingArrayOnDevice) { return m_Data; } else { auto ptr = bit_cast((&m_Data)->data()); @@ -1077,7 +1092,7 @@ template class vec { #ifdef __SYCL_DEVICE_ONLY__ #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ template \ - vec operator BINOP(const EnableIfNotUsingArray &Rhs) const { \ + vec operator BINOP(const EnableIfNotUsingArrayOnDevice &Rhs) const { \ vec Ret; \ Ret.m_Data = 
m_Data BINOP Rhs.m_Data; \ if constexpr (std::is_same::value && CONVERT) { \ @@ -1086,7 +1101,7 @@ template class vec { return Ret; \ } \ template \ - vec operator BINOP(const EnableIfUsingArray &Rhs) const { \ + vec operator BINOP(const EnableIfUsingArrayOnDevice &Rhs) const { \ vec Ret; \ for (size_t I = 0; I < NumElements; ++I) { \ Ret.setValue(I, (getValue(I) BINOP Rhs.getValue(I))); \ @@ -1240,67 +1255,94 @@ template class vec { __SYCL_UOP(--, -=) #undef __SYCL_UOP - // Available only when: dataT != cl_float && dataT != cl_double - // && dataT != cl_half + // operator~() available only when: dataT != float && dataT != double + // && dataT != half template - typename std::enable_if_t>, vec> + typename std::enable_if_t> && + (!IsUsingArrayOnDevice && !IsUsingArrayOnHost), + vec> operator~() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ vec Ret{(typename vec::DataType) ~m_Data}; if constexpr (std::is_same::value) { Ret.ConvertToDataT(); } return Ret; -#else + } + template + typename std::enable_if_t> && + (IsUsingArrayOnDevice || IsUsingArrayOnHost), + vec> + operator~() const { vec Ret{}; for (size_t I = 0; I < NumElements; ++I) { Ret.setValue(I, ~getValue(I)); } return Ret; -#endif } - vec operator!() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ - return vec{ - (typename vec::DataType) !m_Data}; -#else - vec Ret{}; + // operator! + template + EnableIfNotUsingArray> operator!() const { + return vec{(typename vec::DataType) !m_Data}; + } + + // std::byte neither supports ! unary op or casting, so special handling is + // needed. And, worse, Windows has a conflict with 'byte'. 
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + template + typename std::enable_if_t::value && + (IsUsingArrayOnDevice || IsUsingArrayOnHost), + vec> + operator!() const { + vec Ret{}; for (size_t I = 0; I < NumElements; ++I) { - Ret.setValue(I, !vec_data::get(getValue(I))); + Ret.setValue(I, std::byte{!vec_data::get(getValue(I))}); } return Ret; -#endif } - vec operator+() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ - return vec{+m_Data}; + template + typename std::enable_if_t::value && + (IsUsingArrayOnDevice || IsUsingArrayOnHost), + vec> + operator!() const { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) + Ret.setValue(I, !vec_data::get(getValue(I))); + return Ret; + } #else + template + EnableIfUsingArray> operator!() const { vec Ret{}; for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, vec_data::get(+vec_data::get(getValue(I)))); + Ret.setValue(I, !vec_data::get(getValue(I))); return Ret; + } #endif + + // operator + + template EnableIfNotUsingArray operator+() const { + return vec{+m_Data}; } - vec operator-() const { -// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined -// by SYCL device compiler only. -#ifdef __SYCL_DEVICE_ONLY__ + template EnableIfUsingArray operator+() const { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) + Ret.setValue(I, vec_data::get(+vec_data::get(getValue(I)))); + return Ret; + } + + // operator - + template EnableIfNotUsingArray operator-() const { return vec{-m_Data}; -#else + } + + template EnableIfUsingArray operator-() const { vec Ret{}; for (size_t I = 0; I < NumElements; ++I) Ret.setValue(I, vec_data::get(-vec_data::get(getValue(I)))); return Ret; -#endif } // OP is: &&, || @@ -1316,7 +1358,7 @@ template class vec { template