From cf54274c2c1391f480728c57391b617ad170cdb2 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Mon, 27 Oct 2025 09:55:34 -0700 Subject: [PATCH 1/5] Fix a bug in hierarchical parallelism implementation --- llvm/lib/SYCLLowerIR/LowerWGScope.cpp | 25 ++++++++++++++++++++- sycl/test-e2e/HierPar/hier_par_indirect.cpp | 24 ++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp index aa55c6cbe4650..01d276ddaf662 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp @@ -214,6 +214,22 @@ static bool hasCallToAFuncWithWGMetadata(Function &F) { return false; } +// Recursively searches for a call to a function with parallel_for_work_item +// metadata inside F. +static bool hasCallToAFuncWithPFWIMetadata(Function &F) { + for (auto &BB : F) + for (auto &I : BB) { + if (isCallToAFuncMarkedWithMD(&I, PFWI_MD)) + return true; + const CallInst *Call = dyn_cast(&I); + Function *F = dyn_cast_or_null(Call ? Call->getCalledFunction() + : nullptr); + if (F && hasCallToAFuncWithPFWIMetadata(*F)) + return true; + } + return false; +} + // Checks if this is a call to parallel_for_work_item. static bool isPFWICall(const Instruction *I) { return isCallToAFuncMarkedWithMD(I, PFWI_MD); @@ -835,7 +851,14 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F, } continue; } - if (!mayHaveSideEffects(I)) + // In addition to an instruction not having side effects, we end the range + // if the instruction is a call that contains, possibly several layers + // down the stack, a call to a parallel_for_work_item. Such calls should + // not be subject to lowering since they must be executed by every work + // item. + const CallInst *Call = dyn_cast(I); + if (!mayHaveSideEffects(I) || + (Call && hasCallToAFuncWithPFWIMetadata(*Call->getCalledFunction()))) continue; LLVM_DEBUG(llvm::dbgs() << "+++ Side effects: " << *I << "\n"); if (!First) diff --git a/sycl/test-e2e/HierPar/hier_par_indirect.cpp b/sycl/test-e2e/HierPar/hier_par_indirect.cpp index b0a1787368f97..2917cd0fe5610 100644 --- a/sycl/test-e2e/HierPar/hier_par_indirect.cpp +++ b/sycl/test-e2e/HierPar/hier_par_indirect.cpp @@ -19,12 +19,36 @@ void __attribute__((noinline)) foo(sycl::group<1> work_group) { work_group.parallel_for_work_item([&](sycl::h_item<1> index) {}); } +void __attribute__((noinline)) bar(sycl::group<1> work_group) { + work_group.parallel_for_work_item([&](sycl::h_item<1> index) {}); +} + int main(int argc, char **argv) { sycl::queue q; + + // Try a single indirect call, two indirect calls and an indirect call + // accompanied by multiple parallel_for_work_item calls in the same work_group + // scope. q.submit([&](sycl::handler &cgh) { cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) { foo(wGroup); })); }).wait(); + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128}, + ([=](sycl::group<1> wGroup) { + foo(wGroup); + bar(wGroup); + })); + }).wait(); + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for_work_group( + sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) { + wGroup.parallel_for_work_item([&](sycl::h_item<1> index) {}); + foo(wGroup); + wGroup.parallel_for_work_item([&](sycl::h_item<1> index) {}); + })); + }).wait(); + std::cout << "test passed" << std::endl; return 0; } From 852e3cfaab93be037119fbd8ca3b1612b245d510 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Mon, 27 Oct 2025 10:03:27 -0700 Subject: [PATCH 2/5] Add more tests --- sycl/test-e2e/HierPar/hier_par_indirect.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test-e2e/HierPar/hier_par_indirect.cpp b/sycl/test-e2e/HierPar/hier_par_indirect.cpp index 2917cd0fe5610..073013a48b63b 100644 --- a/sycl/test-e2e/HierPar/hier_par_indirect.cpp +++ b/sycl/test-e2e/HierPar/hier_par_indirect.cpp @@ -20,7 +20,7 @@ void __attribute__((noinline)) foo(sycl::group<1> work_group) { } void __attribute__((noinline)) bar(sycl::group<1> work_group) { - work_group.parallel_for_work_item([&](sycl::h_item<1> index) {}); + foo(work_group); } int main(int argc, char **argv) { @@ -34,11 +34,11 @@ int main(int argc, char **argv) { ([=](sycl::group<1> wGroup) { foo(wGroup); })); }).wait(); q.submit([&](sycl::handler &cgh) { - cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128}, - ([=](sycl::group<1> wGroup) { - foo(wGroup); - bar(wGroup); - })); + cgh.parallel_for_work_group( + sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) { + foo(wGroup); // 1-layer indirect call + bar(wGroup); // 2-layer indirect call since bar calls foo + })); }).wait(); q.submit([&](sycl::handler &cgh) { cgh.parallel_for_work_group( From 80e8bb80a29c382c2ebfc0a3eba9e67534ca2312 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Tue, 28 Oct 2025 06:26:15 -0700 Subject: [PATCH 3/5] Add IR test --- llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll index c9b8388078b2b..0643791e0b692 100644 --- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -4,7 +4,9 @@ ; Check that allocas which correspond to PFWI lambda object and a local copy of the PFWG lambda object ; are properly handled by LowerWGScope pass. Check that WG-shared local "shadow" variables are created ; and before each PFWI invocation leader WI stores its private copy of the variable into the shadow, -; then all WIs load the shadow value into their private copies ("materialize" the private copy). +; then all WIs load the shadow value into their private copies ("materialize" the private copy). +; Also check that an indirect call to a function marked with parallel_for_work_item is treated +; the same as a direct call. %struct.bar = type { i8 } %struct.zot = type { %struct.widget, %struct.widget, %struct.widget, %struct.foo } @@ -54,6 +56,7 @@ define internal spir_func void @wibble(ptr addrspace(4) %arg, ptr byval(%struct. ; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] ; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[ARG1]] to ptr addrspace(4) ; CHECK-NEXT: call spir_func void @bar(ptr addrspace(4) [[TMP9]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]]) +; CHECK-NEXT: call spir_func void @foo(ptr addrspace(4) [[TMP9]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]]) ; CHECK-NEXT: ret void ; bb: @@ -62,6 +65,7 @@ bb: store ptr addrspace(4) %arg, ptr %0, align 8 %2 = addrspacecast ptr %arg1 to ptr addrspace(4) call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) + call spir_func void @foo(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) ret void } @@ -70,4 +74,15 @@ bb: ret void } +define internal spir_func void @foo(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_group_scope !0 { +bb: + call spir_func void @baz(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) + ret void +} + +define internal spir_func void @baz(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_item_scope !0 !parallel_for_work_item !0 { +bb: + ret void +} + !0 = !{} From 444bbb313bb58f93e47ad87515e299e9e10f6ed1 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Tue, 28 Oct 2025 13:41:37 -0700 Subject: [PATCH 4/5] Enhance test --- llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 61 +++++++++++++++++++++----- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll index 0643791e0b692..5b0c3ad99b781 100644 --- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -4,8 +4,8 @@ ; Check that allocas which correspond to PFWI lambda object and a local copy of the PFWG lambda object ; are properly handled by LowerWGScope pass. Check that WG-shared local "shadow" variables are created ; and before each PFWI invocation leader WI stores its private copy of the variable into the shadow, -; then all WIs load the shadow value into their private copies ("materialize" the private copy). -; Also check that an indirect call to a function marked with parallel_for_work_item is treated +; then all WIs load the shadow value into their private copies ("materialize" the private copy). +; Also check that an indirect call to a function marked with parallel_for_work_item is treated ; the same as a direct call. %struct.bar = type { i8 } @@ -69,20 +69,59 @@ bb: ret void } -define internal spir_func void @bar(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_item_scope !0 !parallel_for_work_item !0 { -bb: - ret void -} - define internal spir_func void @foo(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_group_scope !0 { +; CHECK: bb: +; CHECK-NEXT: [[TMP0:%.*]] = alloca ptr addrspace(4), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]] +; CHECK: leader: +; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 8 @ArgShadow.4, ptr align 1 [[ARG1:%.*]], i64 1, i1 false) +; CHECK-NEXT: br label [[MERGE]] +; CHECK: merge: +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 [[ARG1]], ptr addrspace(3) align 8 @ArgShadow.4, i64 1, i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]] +; CHECK: wg_leader: +; CHECK-NEXT: store ptr addrspace(4) [[ARG:%.*]], ptr [[TMP0]], align 8 +; CHECK-NEXT: br label [[WG_CF]] +; CHECK: wg_cf: +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]] +; CHECK: TestMat: +; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 8 @WGCopy.3, ptr align 1 [[TMP1]], i64 1, i1 false) +; CHECK-NEXT: [[MAT_LD:%.*]] = load ptr addrspace(4), ptr [[TMP0]], align 8 +; CHECK-NEXT: store ptr addrspace(4) [[MAT_LD]], ptr addrspace(3) @WGCopy.2, align 8 +; CHECK-NEXT: br label [[LEADERMAT]] +; CHECK: LeaderMat: +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[MAT_LD1:%.*]] = load ptr addrspace(4), ptr addrspace(3) @WGCopy.2, align 8 +; CHECK-NEXT: store ptr addrspace(4) [[MAT_LD1]], ptr [[TMP0]], align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 [[TMP1]], ptr addrspace(3) align 8 @WGCopy.3, i64 1, i1 false) +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]] +; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARG1]] to ptr addrspace(4) +; CHECK-NEXT: call spir_func void @bar(ptr addrspace(4) [[TMP5]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]]) +; CHECK-NEXT: ret void +; bb: - call spir_func void @baz(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) - ret void + %0 = alloca ptr addrspace(4), align 8 + %1 = alloca %struct.foo.0, align 1 + store ptr addrspace(4) %arg, ptr %0, align 8 + %2 = addrspacecast ptr %arg1 to ptr addrspace(4) + call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) + ret void } -define internal spir_func void @baz(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_item_scope !0 !parallel_for_work_item !0 { +define internal spir_func void @bar(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_item_scope !0 !parallel_for_work_item !0 { bb: - ret void + ret void } !0 = !{} From fb54be432d6f64f1c22f4d15bbf9b0ecb78400d5 Mon Sep 17 00:00:00 2001 From: "Bushi, Lorenc" Date: Tue, 28 Oct 2025 13:49:55 -0700 Subject: [PATCH 5/5] Do some renaming --- llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll index 5b0c3ad99b781..94f9c4b0f3d50 100644 --- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll +++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll @@ -111,11 +111,11 @@ define internal spir_func void @foo(ptr addrspace(4) %arg, ptr byval(%struct.foo ; CHECK-NEXT: ret void ; bb: - %0 = alloca ptr addrspace(4), align 8 - %1 = alloca %struct.foo.0, align 1 - store ptr addrspace(4) %arg, ptr %0, align 8 - %2 = addrspacecast ptr %arg1 to ptr addrspace(4) - call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1) + %1 = alloca ptr addrspace(4), align 8 + %2 = alloca %struct.foo.0, align 1 + store ptr addrspace(4) %arg, ptr %1, align 8 + %3 = addrspacecast ptr %arg1 to ptr addrspace(4) + call spir_func void @bar(ptr addrspace(4) %3, ptr byval(%struct.foo.0) align 1 %2) ret void }