From 3c3ddc5cee5ec3e62fa45c4429861644cbd03188 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 4 Jan 2024 06:37:44 -0500
Subject: [PATCH 1/4] Made tests portably pass on cuda/hip backends.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/test-e2e/USM/P2P/p2p_access.cpp  | 16 +++------------
 sycl/test-e2e/USM/P2P/p2p_atomics.cpp | 28 +++++++++------------------
 sycl/test-e2e/USM/P2P/p2p_copy.cpp    | 16 +++------------
 3 files changed, 15 insertions(+), 45 deletions(-)
diff --git a/sycl/test-e2e/USM/P2P/p2p_access.cpp b/sycl/test-e2e/USM/P2P/p2p_access.cpp
index cf8121db1a930..859b469f73a5f 100644
--- a/sycl/test-e2e/USM/P2P/p2p_access.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_access.cpp
@@ -1,6 +1,5 @@
-// REQUIRES: cuda
-// RUN: %{build} -o %t.out
-// RUN: %if cuda %{ %{run} %t.out %}
+// RUN:  %{build} -o %t.out
+// RUN:  %{run} %t.out
 
 #include <cassert>
 #include <sycl/sycl.hpp>
@@ -9,17 +8,8 @@ using namespace sycl;
 
 int main() {
 
-  // Note that this code will largely be removed: it is temporary due to the
-  // temporary lack of multiple devices per sycl context in the Nvidia backend.
-  // A portable implementation, using a single gpu platform, should be possible
-  // once the Nvidia context issues are resolved.
-  ////////////////////////////////////////////////////////////////////////
-  std::vector<sycl::device> Devs;
-  for (const auto &plt : sycl::platform::get_platforms()) {
+  auto Devs = platform(gpu_selector_v).get_devices(info::device_type::gpu);
 
-    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
-      Devs.push_back(plt.get_devices()[0]);
-  }
   if (Devs.size() < 2) {
     std::cout << "Cannot test P2P capabilities, at least two devices are "
                  "required, exiting."
diff --git a/sycl/test-e2e/USM/P2P/p2p_atomics.cpp b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
index 9f6c254dbf915..59fd189fbd7d6 100644
--- a/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
@@ -1,6 +1,5 @@
-// REQUIRES: cuda
-// RUN: %if any-device-is-cuda %{ %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_61 -o %t.out %}
-// RUN: %if cuda %{ %{run} %t.out %}
+// RUN:  %{build} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_61 %} -o %t.out
+// RUN:  %{run} %t.out
 
 #include <cassert>
 #include <numeric>
@@ -14,17 +13,8 @@ constexpr size_t N = 512;
 
 int main() {
 
-  // Note that this code will largely be removed: it is temporary due to the
-  // temporary lack of multiple devices per sycl context in the Nvidia backend.
-  // A portable implementation, using a single gpu platform, should be possible
-  // once the Nvidia context issues are resolved.
-  ////////////////////////////////////////////////////////////////////////
-  std::vector<sycl::device> Devs;
-  for (const auto &plt : sycl::platform::get_platforms()) {
+  auto Devs = platform(gpu_selector_v).get_devices(info::device_type::gpu);
 
-    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
-      Devs.push_back(plt.get_devices()[0]);
-  }
   if (Devs.size() < 2) {
     std::cout << "Cannot test P2P capabilities, at least two devices are "
                  "required, exiting."
@@ -47,18 +37,18 @@ int main() {
   // Enables Devs[1] to access Devs[0] memory.
   Devs[1].ext_oneapi_enable_peer_access(Devs[0]);
 
-  std::vector<double> input(N);
+  std::vector<int> input(N);
   std::iota(input.begin(), input.end(), 0);
 
-  double h_sum = 0.;
+  int h_sum = 0.;
   for (const auto &value : input) {
     h_sum += value;
   }
 
-  double *d_sum = malloc_shared<double>(1, Queues[0]);
-  double *d_in = malloc_device<double>(N, Queues[0]);
+  int *d_sum = malloc_shared<int>(1, Queues[0]);
+  int *d_in = malloc_device<int>(N, Queues[0]);
 
-  Queues[0].memcpy(d_in, &input[0], N * sizeof(double));
+  Queues[0].memcpy(d_in, &input[0], N * sizeof(int));
   Queues[0].wait();
 
   range global_range{N};
@@ -66,7 +56,7 @@ int main() {
   *d_sum = 0.;
   Queues[1].submit([&](handler &h) {
     h.parallel_for<class peer_atomic>(global_range, [=](id<1> i) {
-      sycl::atomic_ref<double, sycl::memory_order::relaxed,
+      sycl::atomic_ref<int, sycl::memory_order::relaxed,
                        sycl::memory_scope::system,
                        access::address_space::global_space>(*d_sum) += d_in[i];
     });
diff --git a/sycl/test-e2e/USM/P2P/p2p_copy.cpp b/sycl/test-e2e/USM/P2P/p2p_copy.cpp
index f88f0d21af821..4cd4bf7bb8cfd 100644
--- a/sycl/test-e2e/USM/P2P/p2p_copy.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_copy.cpp
@@ -1,6 +1,5 @@
-// REQUIRES: cuda
-// RUN: %{build} -o %t.out
-// RUN: %if cuda %{ %{run} %t.out %}
+// RUN:  %{build} -o %t.out
+// RUN:  %{run} %t.out
 
 #include <cassert>
 #include <numeric>
@@ -14,17 +13,8 @@ constexpr int N = 100;
 
 int main() {
 
-  // Note that this code will largely be removed: it is temporary due to the
-  // temporary lack of multiple devices per sycl context in the Nvidia backend.
-  // A portable implementation, using a single gpu platform, should be possible
-  // once the Nvidia context issues are resolved.
-  ////////////////////////////////////////////////////////////////////////
-  std::vector<sycl::device> Devs;
-  for (const auto &plt : sycl::platform::get_platforms()) {
+  auto Devs = platform(gpu_selector_v).get_devices(info::device_type::gpu);
 
-    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
-      Devs.push_back(plt.get_devices()[0]);
-  }
   if (Devs.size() < 2) {
     std::cout << "Cannot test P2P capabilities, at least two devices are "
                  "required, exiting."

From 9c8433382de0f078942ac0586a10352e0c6552d0 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Thu, 8 Feb 2024 09:56:57 -0500
Subject: [PATCH 2/4] Set temp CMAKE for testing.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/plugins/hip/CMakeLists.txt             | 2 +-
 sycl/plugins/unified_runtime/CMakeLists.txt | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt
index 7bb8638d9aa13..6d455e460bcfa 100644
--- a/sycl/plugins/hip/CMakeLists.txt
+++ b/sycl/plugins/hip/CMakeLists.txt
@@ -5,7 +5,7 @@ set(SYCL_BUILD_PI_HIP_PLATFORM "AMD" CACHE STRING "PI HIP platform, AMD or NVIDI
 message(STATUS "Including the PI API HIP backend for ${SYCL_BUILD_PI_HIP_PLATFORM}.")
 
 # Set default ROCm installation directory
-set(SYCL_BUILD_PI_HIP_ROCM_DIR "/opt/rocm" CACHE STRING "ROCm installation dir")
+set(SYCL_BUILD_PI_HIP_ROCM_DIR "/opt/rocm-5.5.1" CACHE STRING "ROCm installation dir")
 
 # Set HIP include and lib dirs
 set(SYCL_BUILD_PI_HIP_INCLUDE_DIR "" CACHE STRING "Override HIP include dir path (set to \"\" for default behavior)")
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index c12757976b0eb..81298f0b92e91 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -56,14 +56,14 @@ endif()
 if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
+  set(UNIFIED_RUNTIME_REPO "https://github.com/JackAKirk/unified-runtime.git")
   # commit 749d8e51ea8e56726a9fb57949d7a3f81b47b15c
   # Merge: 810a5774 34831f4b
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
   # Date:   Wed Jan 3 13:27:16 2024 +0000
   #     Merge pull request #1198 from al42and/aa-rocm6
   #     [HIP] Fix build with ROCm 6.0.0
-  set(UNIFIED_RUNTIME_TAG 749d8e51ea8e56726a9fb57949d7a3f81b47b15c)
+  set(UNIFIED_RUNTIME_TAG 0dc0d83)
 
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")

From d9f675a47996ae1dfca37f3e2bf3fd939318e778 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 7 Jun 2024 08:04:09 -0700
Subject: [PATCH 3/4] Revert "Set temp CMAKE for testing."

This reverts commit 9c8433382de0f078942ac0586a10352e0c6552d0.
---
 sycl/plugins/hip/CMakeLists.txt             | 2 +-
 sycl/plugins/unified_runtime/CMakeLists.txt | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt
index 6d455e460bcfa..7bb8638d9aa13 100644
--- a/sycl/plugins/hip/CMakeLists.txt
+++ b/sycl/plugins/hip/CMakeLists.txt
@@ -5,7 +5,7 @@ set(SYCL_BUILD_PI_HIP_PLATFORM "AMD" CACHE STRING "PI HIP platform, AMD or NVIDI
 message(STATUS "Including the PI API HIP backend for ${SYCL_BUILD_PI_HIP_PLATFORM}.")
 
 # Set default ROCm installation directory
-set(SYCL_BUILD_PI_HIP_ROCM_DIR "/opt/rocm-5.5.1" CACHE STRING "ROCm installation dir")
+set(SYCL_BUILD_PI_HIP_ROCM_DIR "/opt/rocm" CACHE STRING "ROCm installation dir")
 
 # Set HIP include and lib dirs
 set(SYCL_BUILD_PI_HIP_INCLUDE_DIR "" CACHE STRING "Override HIP include dir path (set to \"\" for default behavior)")
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 81298f0b92e91..c12757976b0eb 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -56,14 +56,14 @@ endif()
 if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/JackAKirk/unified-runtime.git")
+  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
   # commit 749d8e51ea8e56726a9fb57949d7a3f81b47b15c
   # Merge: 810a5774 34831f4b
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
   # Date:   Wed Jan 3 13:27:16 2024 +0000
   #     Merge pull request #1198 from al42and/aa-rocm6
   #     [HIP] Fix build with ROCm 6.0.0
-  set(UNIFIED_RUNTIME_TAG 0dc0d83)
+  set(UNIFIED_RUNTIME_TAG 749d8e51ea8e56726a9fb57949d7a3f81b47b15c)
 
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")

From ee00147e4b1aec79e894557be29f37be3b60b34b Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 7 Jun 2024 08:06:34 -0700
Subject: [PATCH 4/4] Mark requires cuda || hip || level_zero

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 sycl/test-e2e/USM/P2P/p2p_access.cpp  | 1 +
 sycl/test-e2e/USM/P2P/p2p_atomics.cpp | 1 +
 sycl/test-e2e/USM/P2P/p2p_copy.cpp    | 1 +
 3 files changed, 3 insertions(+)

diff --git a/sycl/test-e2e/USM/P2P/p2p_access.cpp b/sycl/test-e2e/USM/P2P/p2p_access.cpp
index 859b469f73a5f..3b282557089a3 100644
--- a/sycl/test-e2e/USM/P2P/p2p_access.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_access.cpp
@@ -1,3 +1,4 @@
+// REQUIRES: cuda || hip || level_zero
 // RUN:  %{build} -o %t.out
 // RUN:  %{run} %t.out
 
diff --git a/sycl/test-e2e/USM/P2P/p2p_atomics.cpp b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
index 59fd189fbd7d6..ac51f11169f27 100644
--- a/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
@@ -1,3 +1,4 @@
+// REQUIRES: cuda || hip || level_zero
 // RUN:  %{build} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_61 %} -o %t.out
 // RUN:  %{run} %t.out
 
diff --git a/sycl/test-e2e/USM/P2P/p2p_copy.cpp b/sycl/test-e2e/USM/P2P/p2p_copy.cpp
index 4cd4bf7bb8cfd..05a2f6ee8794d 100644
--- a/sycl/test-e2e/USM/P2P/p2p_copy.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_copy.cpp
@@ -1,3 +1,4 @@
+// REQUIRES: cuda || hip || level_zero
 // RUN:  %{build} -o %t.out
 // RUN:  %{run} %t.out