From d7fbed5281b7f67b59f0757bae066f6680eb77af Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 20 Oct 2025 13:20:59 -0700 Subject: [PATCH 1/8] Add graph test for handlerless enqueue recording in function call --- .../handlerless_enqueue_functions.cpp | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp diff --git a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp new file mode 100644 index 0000000000000..1f387b93032e5 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp @@ -0,0 +1,114 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Test recording of handlerless SYCL queue APIs (memset, memcpy, +// parallel_for/nd_launch, prefetch, single_task) inside a single function call +// while graph recording is active. All operations are USM-based and occur via +// eventless queue free functions or eventful queue shortcuts to bypass handler +// path. Recording is performed over a non-inlined function call. + +#include "../graph_common.hpp" +#include +#include +#include + +// Records a sequence of handlerless operations to exercise kernel argument +// capture in a function call: +// 1. memset A to a byte pattern +// 2. fill D with FillValue +// 3. copy D -> E +// 4. memcpy A -> B +// 5. prefetch B +// 6. kernel: C[i] = B[i] * 2 (DoubleKernel) +// 7. single_task: C[0] = 999 +// noinline is important as we have caught functional issues only when the recording +// function is not inlined. +__attribute__((noinline)) void +record_handlerless_ops(sycl::queue &Q, uint32_t *A, uint32_t *B, uint32_t *C, + uint32_t *D, uint32_t *E, size_t N, + unsigned char Pattern, uint32_t FillValue, + bool UseFreeFunctions) { + size_t WorkGroupSize = 16; + sycl::nd_range<1> KernelRange{sycl::range<1>{N}, + sycl::range<1>{WorkGroupSize}}; + auto DoubleKernelLambda = [=](sycl::nd_item<1> item) { + const size_t i = item.get_global_linear_id(); + C[i] = B[i] * 2; + }; + auto SingleTaskKernel = [=]() { C[0] = 999; }; + if (UseFreeFunctions) { + exp_ext::memset(Q, A, Pattern, N * sizeof(uint32_t)); + exp_ext::fill(Q, D, FillValue, N); + exp_ext::copy(Q, D, E, N); + exp_ext::memcpy(Q, B, A, N * sizeof(uint32_t)); + exp_ext::prefetch(Q, B, N * sizeof(uint32_t)); + exp_ext::nd_launch(Q, KernelRange, DoubleKernelLambda); + exp_ext::single_task(Q, SingleTaskKernel); + } else { + Q.memset(A, Pattern, N * sizeof(uint32_t)); + Q.fill(D, FillValue, N); + Q.copy(D, E, N); + Q.memcpy(B, A, N * sizeof(uint32_t)); + Q.prefetch(B, N * sizeof(uint32_t)); + Q.parallel_for(KernelRange, DoubleKernelLambda); + Q.single_task(SingleTaskKernel); + } +} + +int main() { + sycl::queue Q{sycl::property_list{sycl::property::queue::in_order{}}}; + const size_t N = 64; + const unsigned char Pattern = 42; + const uint32_t FillValue = 7; + + uint32_t *A = sycl::malloc_shared(N, Q); + uint32_t *B = sycl::malloc_shared(N, Q); + uint32_t *C = sycl::malloc_shared(N, Q); + + uint32_t *D = sycl::malloc_shared(N, Q); + uint32_t *E = sycl::malloc_shared(N, Q); + + for (uint32_t i = 0; i <= 1; ++i) { + Q.memset(A, 0, N * sizeof(uint32_t)); + Q.memset(B, 0, N * sizeof(uint32_t)); + Q.memset(C, 0, N * sizeof(uint32_t)); + Q.memset(D, 0, N * sizeof(uint32_t)); + Q.memset(E, 0, N * sizeof(uint32_t)); + Q.wait_and_throw(); + + exp_ext::command_graph Graph{Q.get_context(), Q.get_device()}; + // Begin recording, invoke function that issues handlerless ops, end + // recording. + Graph.begin_recording(Q); + record_handlerless_ops(Q, A, B, C, D, E, N, Pattern, FillValue, + /*UseFreeFunctions=*/static_cast(i)); + Graph.end_recording(); + + auto Exec = Graph.finalize(); + Q.ext_oneapi_graph(Exec); + Q.wait_and_throw(); + + // Validate results + // C[0] overridden by single_task + assert(check_value(0, static_cast(999), C[0], "C")); + uint32_t DoublePatternUint = 0; + std::memset(&DoublePatternUint, Pattern, sizeof(uint32_t)); + uint32_t DoublePatternUintDoubled = DoublePatternUint * 2; + for (size_t i = 1; i < N; ++i) { + assert(check_value(i, DoublePatternUintDoubled, C[i], "C")); + } + + // Validate fill & copy results + for (size_t i = 0; i < N; ++i) { + assert(check_value(i, FillValue, E[i], "E")); + } + } + + sycl::free(A, Q); + sycl::free(B, Q); + sycl::free(C, Q); + sycl::free(D, Q); + sycl::free(E, Q); + + return 0; +} From f9794f4b864a18da85b3b9b4408131adfca27609 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 21 Oct 2025 08:42:57 -0700 Subject: [PATCH 2/8] Add out-of-order queue test and general cleanup --- .../handlerless_enqueue_functions.cpp | 94 +++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp index 1f387b93032e5..997f489951b2e 100644 --- a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp @@ -1,33 +1,24 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Test recording of handlerless SYCL queue APIs (memset, memcpy, +// Test recording of handlerless SYCL queue APIs (memset, memcpy, memadvise, // parallel_for/nd_launch, prefetch, single_task) inside a single function call // while graph recording is active. All operations are USM-based and occur via // eventless queue free functions or eventful queue shortcuts to bypass handler // path. Recording is performed over a non-inlined function call. #include "../graph_common.hpp" +#include #include #include #include -// Records a sequence of handlerless operations to exercise kernel argument -// capture in a function call: -// 1. memset A to a byte pattern -// 2. fill D with FillValue -// 3. copy D -> E -// 4. memcpy A -> B -// 5. prefetch B -// 6. kernel: C[i] = B[i] * 2 (DoubleKernel) -// 7. single_task: C[0] = 999 -// noinline is important as we have caught functional issues only when the recording -// function is not inlined. +// noinline is important as we have previously caught functional issues with +// kernel argument capture only when the function being recorded is not inlined. __attribute__((noinline)) void -record_handlerless_ops(sycl::queue &Q, uint32_t *A, uint32_t *B, uint32_t *C, - uint32_t *D, uint32_t *E, size_t N, - unsigned char Pattern, uint32_t FillValue, - bool UseFreeFunctions) { +recordHandlerLessOps(sycl::queue &Q, uint32_t *A, uint32_t *B, uint32_t *C, + uint32_t *D, uint32_t *E, size_t N, unsigned char Pattern, + uint32_t FillValue, bool InOrderQueue) { size_t WorkGroupSize = 16; sycl::nd_range<1> KernelRange{sycl::range<1>{N}, sycl::range<1>{WorkGroupSize}}; @@ -36,39 +27,52 @@ record_handlerless_ops(sycl::queue &Q, uint32_t *A, uint32_t *B, uint32_t *C, C[i] = B[i] * 2; }; auto SingleTaskKernel = [=]() { C[0] = 999; }; - if (UseFreeFunctions) { + // Test eventless free functions with in-order queue and eventful shortcuts + // with out-of-order queue. + if (InOrderQueue) { exp_ext::memset(Q, A, Pattern, N * sizeof(uint32_t)); exp_ext::fill(Q, D, FillValue, N); + exp_ext::mem_advise(Q, D, N * sizeof(uint32_t), 0); exp_ext::copy(Q, D, E, N); exp_ext::memcpy(Q, B, A, N * sizeof(uint32_t)); exp_ext::prefetch(Q, B, N * sizeof(uint32_t)); exp_ext::nd_launch(Q, KernelRange, DoubleKernelLambda); exp_ext::single_task(Q, SingleTaskKernel); } else { - Q.memset(A, Pattern, N * sizeof(uint32_t)); - Q.fill(D, FillValue, N); - Q.copy(D, E, N); - Q.memcpy(B, A, N * sizeof(uint32_t)); - Q.prefetch(B, N * sizeof(uint32_t)); - Q.parallel_for(KernelRange, DoubleKernelLambda); - Q.single_task(SingleTaskKernel); + auto e1 = Q.memset(A, Pattern, N * sizeof(uint32_t)); + auto e2 = Q.fill(D, FillValue, N); + auto e3 = Q.mem_advise(D, N * sizeof(uint32_t), 0, e2); + Q.copy(D, E, N, e3); + auto e4 = Q.memcpy(B, A, N * sizeof(uint32_t), e1); + auto e5 = Q.prefetch(B, N * sizeof(uint32_t), e4); + auto e6 = Q.parallel_for(KernelRange, e5, DoubleKernelLambda); + Q.single_task(e6, SingleTaskKernel); } } int main() { - sycl::queue Q{sycl::property_list{sycl::property::queue::in_order{}}}; const size_t N = 64; const unsigned char Pattern = 42; const uint32_t FillValue = 7; + auto getQueue = [](bool InOrder) { + if (InOrder) { + return sycl::queue{ + sycl::property_list{sycl::property::queue::in_order{}}}; + } else { + return sycl::queue{}; + } + }; - uint32_t *A = sycl::malloc_shared(N, Q); - uint32_t *B = sycl::malloc_shared(N, Q); - uint32_t *C = sycl::malloc_shared(N, Q); + for (uint32_t i = 0; i <= 1; ++i) { + const bool InOrderQueue = static_cast(i); + sycl::queue Q = getQueue(InOrderQueue); + uint32_t *A = sycl::malloc_shared(N, Q); + uint32_t *B = sycl::malloc_shared(N, Q); + uint32_t *C = sycl::malloc_shared(N, Q); - uint32_t *D = sycl::malloc_shared(N, Q); - uint32_t *E = sycl::malloc_shared(N, Q); + uint32_t *D = sycl::malloc_shared(N, Q); + uint32_t *E = sycl::malloc_shared(N, Q); - for (uint32_t i = 0; i <= 1; ++i) { Q.memset(A, 0, N * sizeof(uint32_t)); Q.memset(B, 0, N * sizeof(uint32_t)); Q.memset(C, 0, N * sizeof(uint32_t)); @@ -77,19 +81,20 @@ int main() { Q.wait_and_throw(); exp_ext::command_graph Graph{Q.get_context(), Q.get_device()}; - // Begin recording, invoke function that issues handlerless ops, end - // recording. Graph.begin_recording(Q); - record_handlerless_ops(Q, A, B, C, D, E, N, Pattern, FillValue, - /*UseFreeFunctions=*/static_cast(i)); + recordHandlerLessOps(Q, A, B, C, D, E, N, Pattern, FillValue, InOrderQueue); Graph.end_recording(); auto Exec = Graph.finalize(); Q.ext_oneapi_graph(Exec); Q.wait_and_throw(); - // Validate results - // C[0] overridden by single_task + // Validate copy from D -> E + for (size_t i = 0; i < N; ++i) { + assert(check_value(i, FillValue, E[i], "E")); + } + + // Validate final values in C assert(check_value(0, static_cast(999), C[0], "C")); uint32_t DoublePatternUint = 0; std::memset(&DoublePatternUint, Pattern, sizeof(uint32_t)); @@ -98,17 +103,12 @@ int main() { assert(check_value(i, DoublePatternUintDoubled, C[i], "C")); } - // Validate fill & copy results - for (size_t i = 0; i < N; ++i) { - assert(check_value(i, FillValue, E[i], "E")); - } + sycl::free(A, Q); + sycl::free(B, Q); + sycl::free(C, Q); + sycl::free(D, Q); + sycl::free(E, Q); } - sycl::free(A, Q); - sycl::free(B, Q); - sycl::free(C, Q); - sycl::free(D, Q); - sycl::free(E, Q); - return 0; } From 61e8133dc4e9b3a6fa681769ecd0c8afb3348ffa Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 21 Oct 2025 08:47:35 -0700 Subject: [PATCH 3/8] Add leak check to test header --- .../Graph/RecordReplay/handlerless_enqueue_functions.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp index 997f489951b2e..6e7672ad48186 100644 --- a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp @@ -1,5 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // Test recording of handlerless SYCL queue APIs (memset, memcpy, memadvise, // parallel_for/nd_launch, prefetch, single_task) inside a single function call From cddcf89784afa106ca89e70b24bdcc1611d5469d Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 21 Oct 2025 11:49:46 -0700 Subject: [PATCH 4/8] Remove sycl.hpp include in e2e test --- .../Graph/RecordReplay/handlerless_enqueue_functions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp index 6e7672ad48186..988c879dbdf68 100644 --- a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp @@ -12,8 +12,8 @@ #include "../graph_common.hpp" #include #include -#include -#include +#include +#include // noinline is important as we have previously caught functional issues with // kernel argument capture only when the function being recorded is not inlined. From 8997a0ffc5a9210c5553e3d6effd201e1f71598e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 21 Oct 2025 17:21:23 -0700 Subject: [PATCH 5/8] Remove prefetch / memadvise to support ocl backend testing --- .../handlerless_enqueue_functions.cpp | 38 +++++++++++-------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp index 988c879dbdf68..498e7ac9cdf15 100644 --- a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp @@ -3,8 +3,8 @@ // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG // RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// Test recording of handlerless SYCL queue APIs (memset, memcpy, memadvise, -// parallel_for/nd_launch, prefetch, single_task) inside a single function call +// Test recording of several handlerless SYCL queue APIs (memset, memcpy, +// parallel_for/nd_launch, single_task) inside a single function call // while graph recording is active. All operations are USM-based and occur via // eventless queue free functions or eventful queue shortcuts to bypass handler // path. Recording is performed over a non-inlined function call. @@ -12,6 +12,7 @@ #include "../graph_common.hpp" #include #include +#include #include #include @@ -34,20 +35,16 @@ recordHandlerLessOps(sycl::queue &Q, uint32_t *A, uint32_t *B, uint32_t *C, if (InOrderQueue) { exp_ext::memset(Q, A, Pattern, N * sizeof(uint32_t)); exp_ext::fill(Q, D, FillValue, N); - exp_ext::mem_advise(Q, D, N * sizeof(uint32_t), 0); exp_ext::copy(Q, D, E, N); exp_ext::memcpy(Q, B, A, N * sizeof(uint32_t)); - exp_ext::prefetch(Q, B, N * sizeof(uint32_t)); exp_ext::nd_launch(Q, KernelRange, DoubleKernelLambda); exp_ext::single_task(Q, SingleTaskKernel); } else { auto e1 = Q.memset(A, Pattern, N * sizeof(uint32_t)); auto e2 = Q.fill(D, FillValue, N); - auto e3 = Q.mem_advise(D, N * sizeof(uint32_t), 0, e2); - Q.copy(D, E, N, e3); + Q.copy(D, E, N, e2); auto e4 = Q.memcpy(B, A, N * sizeof(uint32_t), e1); - auto e5 = Q.prefetch(B, N * sizeof(uint32_t), e4); - auto e6 = Q.parallel_for(KernelRange, e5, DoubleKernelLambda); + auto e6 = Q.parallel_for(KernelRange, e4, DoubleKernelLambda); Q.single_task(e6, SingleTaskKernel); } } @@ -68,12 +65,16 @@ int main() { for (uint32_t i = 0; i <= 1; ++i) { const bool InOrderQueue = static_cast(i); sycl::queue Q = getQueue(InOrderQueue); - uint32_t *A = sycl::malloc_shared(N, Q); - uint32_t *B = sycl::malloc_shared(N, Q); - uint32_t *C = sycl::malloc_shared(N, Q); + uint32_t *A = sycl::malloc_device(N, Q); + uint32_t *B = sycl::malloc_device(N, Q); + uint32_t *C = sycl::malloc_device(N, Q); - uint32_t *D = sycl::malloc_shared(N, Q); - uint32_t *E = sycl::malloc_shared(N, Q); + uint32_t *D = sycl::malloc_device(N, Q); + uint32_t *E = sycl::malloc_device(N, Q); + + // Host memory for verification + std::vector C_host(N); + std::vector E_host(N); Q.memset(A, 0, N * sizeof(uint32_t)); Q.memset(B, 0, N * sizeof(uint32_t)); @@ -91,18 +92,23 @@ int main() { Q.ext_oneapi_graph(Exec); Q.wait_and_throw(); + // Copy device memory to host for verification + Q.memcpy(E_host.data(), E, N * sizeof(uint32_t)); + Q.memcpy(C_host.data(), C, N * sizeof(uint32_t)); + Q.wait_and_throw(); + // Validate copy from D -> E for (size_t i = 0; i < N; ++i) { - assert(check_value(i, FillValue, E[i], "E")); + assert(check_value(i, FillValue, E_host[i], "E")); } // Validate final values in C - assert(check_value(0, static_cast(999), C[0], "C")); + assert(check_value(0, static_cast(999), C_host[0], "C")); uint32_t DoublePatternUint = 0; std::memset(&DoublePatternUint, Pattern, sizeof(uint32_t)); uint32_t DoublePatternUintDoubled = DoublePatternUint * 2; for (size_t i = 1; i < N; ++i) { - assert(check_value(i, DoublePatternUintDoubled, C[i], "C")); + assert(check_value(i, DoublePatternUintDoubled, C_host[i], "C")); } sycl::free(A, Q); From f6cd93ab3f36aacebcb1559f715a47306d48046b Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 21 Oct 2025 17:54:24 -0700 Subject: [PATCH 6/8] clang-format header order --- .../Graph/RecordReplay/handlerless_enqueue_functions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp index 498e7ac9cdf15..fce9fcf17c662 100644 --- a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp @@ -12,9 +12,9 @@ #include "../graph_common.hpp" #include #include -#include #include #include +#include // noinline is important as we have previously caught functional issues with // kernel argument capture only when the function being recorded is not inlined. From 4a5a10fc86aa082285ed635169c9ee0e818c737b Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 22 Oct 2025 12:17:11 -0700 Subject: [PATCH 7/8] Remove unneeded includes --- .../Graph/RecordReplay/handlerless_enqueue_functions.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp index fce9fcf17c662..5c65e71a93da3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp @@ -10,11 +10,8 @@ // path. Recording is performed over a non-inlined function call. #include "../graph_common.hpp" -#include -#include #include #include -#include // noinline is important as we have previously caught functional issues with // kernel argument capture only when the function being recorded is not inlined. @@ -43,9 +40,9 @@ recordHandlerLessOps(sycl::queue &Q, uint32_t *A, uint32_t *B, uint32_t *C, auto e1 = Q.memset(A, Pattern, N * sizeof(uint32_t)); auto e2 = Q.fill(D, FillValue, N); Q.copy(D, E, N, e2); - auto e4 = Q.memcpy(B, A, N * sizeof(uint32_t), e1); - auto e6 = Q.parallel_for(KernelRange, e4, DoubleKernelLambda); - Q.single_task(e6, SingleTaskKernel); + auto e3 = Q.memcpy(B, A, N * sizeof(uint32_t), e1); + auto e4 = Q.parallel_for(KernelRange, e3, DoubleKernelLambda); + Q.single_task(e4, SingleTaskKernel); } } From ce1551befb54451a7877caad093ef960973f88de Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 23 Oct 2025 11:36:35 -0700 Subject: [PATCH 8/8] Reorder device submissions for clarity --- .../handlerless_enqueue_functions.cpp | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp index 5c65e71a93da3..f6cdee4e00a96 100644 --- a/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/handlerless_enqueue_functions.cpp @@ -31,18 +31,20 @@ recordHandlerLessOps(sycl::queue &Q, uint32_t *A, uint32_t *B, uint32_t *C, // with out-of-order queue. if (InOrderQueue) { exp_ext::memset(Q, A, Pattern, N * sizeof(uint32_t)); - exp_ext::fill(Q, D, FillValue, N); - exp_ext::copy(Q, D, E, N); exp_ext::memcpy(Q, B, A, N * sizeof(uint32_t)); exp_ext::nd_launch(Q, KernelRange, DoubleKernelLambda); exp_ext::single_task(Q, SingleTaskKernel); + + exp_ext::fill(Q, D, FillValue, N); + exp_ext::copy(Q, D, E, N); } else { auto e1 = Q.memset(A, Pattern, N * sizeof(uint32_t)); - auto e2 = Q.fill(D, FillValue, N); - Q.copy(D, E, N, e2); - auto e3 = Q.memcpy(B, A, N * sizeof(uint32_t), e1); - auto e4 = Q.parallel_for(KernelRange, e3, DoubleKernelLambda); - Q.single_task(e4, SingleTaskKernel); + auto e2 = Q.memcpy(B, A, N * sizeof(uint32_t), e1); + auto e3 = Q.parallel_for(KernelRange, e2, DoubleKernelLambda); + Q.single_task(e3, SingleTaskKernel); + + auto e4 = Q.fill(D, FillValue, N); + Q.copy(D, E, N, e4); } } @@ -90,15 +92,10 @@ int main() { Q.wait_and_throw(); // Copy device memory to host for verification - Q.memcpy(E_host.data(), E, N * sizeof(uint32_t)); Q.memcpy(C_host.data(), C, N * sizeof(uint32_t)); + Q.memcpy(E_host.data(), E, N * sizeof(uint32_t)); Q.wait_and_throw(); - // Validate copy from D -> E - for (size_t i = 0; i < N; ++i) { - assert(check_value(i, FillValue, E_host[i], "E")); - } - // Validate final values in C assert(check_value(0, static_cast(999), C_host[0], "C")); uint32_t DoublePatternUint = 0; @@ -108,6 +105,11 @@ int main() { assert(check_value(i, DoublePatternUintDoubled, C_host[i], "C")); } + // Validate copy from D -> E + for (size_t i = 0; i < N; ++i) { + assert(check_value(i, FillValue, E_host[i], "E")); + } + sycl::free(A, Q); sycl::free(B, Q); sycl::free(C, Q);