-
Notifications
You must be signed in to change notification settings - Fork 803
Description
Describe the bug
When a pattern like below is used, the second queue does not become dependent on the first one:
sycl::event eb1 = q1.ext_oneapi_submit_barrier();
q2.ext_oneapi_submit_barrier({ eb1 });
It looks like the eb1
event is ignored when submitting the barrier to q2
because of this part:
llvm/sycl/source/detail/scheduler/commands.cpp
Lines 280 to 282 in 8128c0c
if (!Event.isInterop() && !Event.isEnqueued()) { | |
if (!Event.getCommand() || !Event.getCommand()->producesPiEvent()) | |
continue; |
Commenting out (carefully) this continue
solves the problem.
The lack of synchronization also seen in NVIDIA Nsight Systems (as well as the lack of cuStreamWaitEvent
calls needed to implement it).
To reproduce
#include <iostream>
#include <sycl/sycl.hpp>
namespace oneapi_exp = sycl::ext::oneapi::experimental;
int main()
{
sycl::queue q1(sycl::property::queue::in_order{});
sycl::queue q2(sycl::property::queue::in_order{});
constexpr size_t N = 1024;
int* data = sycl::malloc_device<int>(N, q1);
int* dataHost = sycl::malloc_host<int>(N, q1);
if (!data || !dataHost) {
std::cerr << "Failed to allocate USM memory." << std::endl;
return 1;
}
std::cout << "Running on device: " << q1.get_device().get_info<sycl::info::device::name>() << std::endl;
std::vector<int> dataRef(N);
for (int idx = 0; idx < N; idx++) {
int temp = idx;
int iter = 1;
for (int i = 0; i < 1000; ++i) {
temp = temp * (iter + 1) - temp + idx;
}
dataRef[idx] = temp;
}
for (int iter = 0; iter < 2; iter++) {
std::cout << "Submitting kernel 1 (initialization) to q1." << std::endl;
sycl::event ek1 = q1.submit([&](sycl::handler& h) {
h.parallel_for(sycl::range<1>{ N }, [=](sycl::id<1> idx) {
for (int j = 0; j < 10000; j++) {
int temp = idx.get(0);
for (int i = 0; i < 1000; ++i) {
temp = temp * (iter + 1) - temp + idx.get(0);
}
data[idx] = temp;
}
});
});
std::cout << "Submitting submit_barrier to q2, dependent on q1." << std::endl;
sycl::event eb1 = q1.ext_oneapi_submit_barrier();
q2.ext_oneapi_submit_barrier({ eb1 }); // Using ek1 here also fixes the problem
std::cout << "Submitting copy to q2." << std::endl;
q2.submit(
[&](sycl::handler& h) { h.memcpy(dataHost, data, N * sizeof(int)); });
q2.wait_and_throw();
if (iter == 1) {
std::cout << "Verification" << std::endl;
for (int i = 0; i < N; i++) {
if (dataHost[i] != dataRef[i]) {
std::cerr << "Error in element " << i << std::endl;
return -1;
}
}
std::cout << "Verification PASSED" << std::endl;
}
}
sycl::free(data, q1);
return 0;
}
Output on CUDA and OpenCL (incorrect):
$ clang++ -fsycl barriers.cpp -O2 && ONEAPI_DEVICE_SELECTOR=opencl:0 ./a.out
Running on device: Intel(R) Arc(TM) A770 Graphics
Submitting kernel 1 (initialization) to q1.
Submitting submit_barrier to q2, dependent on q1.
Submitting copy to q2.
Submitting kernel 1 (initialization) to q1.
Submitting submit_barrier to q2, dependent on q1.
Submitting copy to q2.
Verification
Error in element 1
$ clang++ -fsycl -fsycl-targets=nvidia_gpu_sm_86 barriers.cpp -O2 && ONEAPI_DEVICE_SELECTOR=cuda:0 ./a.out
fatbinary warning : option 'image' has been deprecated
fatbinary warning : option 'image' has been deprecated
Running on device: NVIDIA GeForce RTX 3060
Submitting kernel 1 (initialization) to q1.
Submitting submit_barrier to q2, dependent on q1.
Submitting copy to q2.
Submitting kernel 1 (initialization) to q1.
Submitting submit_barrier to q2, dependent on q1.
Submitting copy to q2.
Verification
Error in element 1
Expected output:
Running on device: Intel(R) Arc(TM) A770 Graphics
Submitting kernel 1 (initialization) to q1.
Submitting submit_barrier to q2, dependent on q1.
Submitting copy to q2.
Submitting kernel 1 (initialization) to q1.
Submitting submit_barrier to q2, dependent on q1.
Submitting copy to q2.
Verification
Verification PASSED
Environment
- OS: Ubuntu 24.04
- Target device and vendor: Intel A770 GPU, NVIDIA RTX3060 GPU
- DPC++ version: 8128c0c
- Dependencies version:
$ sycl-ls
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA GeForce RTX 3060 8.6 [CUDA 12.9]
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon RX 6400 gfx1034 [HIP 70051.83]
[level_zero:gpu][level_zero:0] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) Arc(TM) A770 Graphics 12.55.8 [1.6.33276.160000]
[level_zero:gpu][level_zero:1] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) UHD Graphics 770 12.2.0 [1.6.33276.160000]
[opencl:gpu][opencl:0] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [25.13.33276.16]
[opencl:gpu][opencl:1] Intel(R) OpenCL Graphics, Intel(R) UHD Graphics 770 OpenCL 3.0 NEO [25.13.33276.16]
Additional context
Interestingly, the results are correct when running via L0 on the same GPU, although it looks like the synchronization is still missing there if we look with sycl-trace
:
Submitting submit_barrier to q2, dependent on q1.
[UR] urEnqueueEventsWait(
.hQueue = 0x34d6e900, .numEventsInWaitList = 0, .phEventWaitList = nullptr, .phEvent = 0x7ffccc6324e0 (0x34d741f0))
---> UR_RESULT_SUCCESS
Submitting copy to q2.
Contrasting to the same part of code when using the ek1
event from the kernel launch:
Submitting submit_barrier to q2, dependent on q1.
[UR] urEnqueueEventsWait(
.hQueue = 0x291c7900, .numEventsInWaitList = 0, .phEventWaitList = nullptr, .phEvent = 0x7ffe6da91a20 (0x29216180))
---> UR_RESULT_SUCCESS
[UR] urEnqueueEventsWaitWithBarrierExt(
.hQueue = 0x291c8390, .pProperties = 0x7ffe6da90c30 ((struct ur_exp_enqueue_ext_properties_t){.stype = UR_STRUCTURE_TYPE_EXP_ENQUEUE_EXT_PROPERTIES, .pNext = nullptr, .flags = 0}), .numEventsInWaitList = 1, .phEventWaitList = 0x291cabf0 {0x2920c930}, .phEvent = 0x7ffe6da90af8 (0x291cd1f0))
---> UR_RESULT_SUCCESS
[UR] urEventRelease(
.hEvent = 0x291cd1f0)
---> UR_RESULT_SUCCESS
Submitting copy to q2.
So, the correct results with L0 seem to be just a lucky coincidence; the bug is present all the same.