diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
index aa55c6cbe4650..01d276ddaf662 100644
--- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
@@ -214,6 +214,22 @@ static bool hasCallToAFuncWithWGMetadata(Function &F) {
   return false;
 }
 
+// Recursively searches for a call to a function with parallel_for_work_item
+// metadata inside F.
+static bool hasCallToAFuncWithPFWIMetadata(Function &F) {
+  for (auto &BB : F)
+    for (auto &I : BB) {
+      if (isCallToAFuncMarkedWithMD(&I, PFWI_MD))
+        return true;
+      const CallInst *Call = dyn_cast<CallInst>(&I);
+      Function *F = dyn_cast_or_null<Function>(Call ? Call->getCalledFunction()
+                                                    : nullptr);
+      if (F && hasCallToAFuncWithPFWIMetadata(*F))
+        return true;
+    }
+  return false;
+}
+
 // Checks if this is a call to parallel_for_work_item.
 static bool isPFWICall(const Instruction *I) {
   return isCallToAFuncMarkedWithMD(I, PFWI_MD);
@@ -835,7 +851,14 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
       }
       continue;
     }
-    if (!mayHaveSideEffects(I))
+    // In addition to ending the range at an instruction that has no side
+    // effects, we also end it if the instruction is a call that contains,
+    // possibly several layers down the call stack, a call to
+    // parallel_for_work_item. Such calls must not be subject to lowering,
+    // since they have to be executed by every work item.
+    const CallInst *Call = dyn_cast<CallInst>(I);
+    if (!mayHaveSideEffects(I) ||
+        (Call && hasCallToAFuncWithPFWIMetadata(*Call->getCalledFunction())))
       continue;
     LLVM_DEBUG(llvm::dbgs() << "+++ Side effects: " << *I << "\n");
     if (!First)
diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
index c9b8388078b2b..94f9c4b0f3d50 100644
--- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
+++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
@@ -5,6 +5,8 @@
 ; are properly handled by LowerWGScope pass. Check that WG-shared local "shadow" variables are created
 ; and before each PFWI invocation leader WI stores its private copy of the variable into the shadow,
 ; then all WIs load the shadow value into their private copies ("materialize" the private copy).
+; Also check that an indirect call to a function marked with parallel_for_work_item is treated
+; the same as a direct call.
 
 %struct.bar = type { i8 }
 %struct.zot = type { %struct.widget, %struct.widget, %struct.widget, %struct.foo }
@@ -54,6 +56,7 @@ define internal spir_func void @wibble(ptr addrspace(4) %arg, ptr byval(%struct.
 ; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
 ; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[ARG1]] to ptr addrspace(4)
 ; CHECK-NEXT: call spir_func void @bar(ptr addrspace(4) [[TMP9]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]])
+; CHECK-NEXT: call spir_func void @foo(ptr addrspace(4) [[TMP9]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]])
 ; CHECK-NEXT: ret void
 ;
 bb:
@@ -62,6 +65,57 @@ bb:
   store ptr addrspace(4) %arg, ptr %0, align 8
   %2 = addrspacecast ptr %arg1 to ptr addrspace(4)
   call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1)
+  call spir_func void @foo(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1)
+  ret void
+}
+
+define internal spir_func void @foo(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_group_scope !0 {
+; CHECK: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = alloca ptr addrspace(4), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4
+; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
+; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]]
+; CHECK: leader:
+; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 8 @ArgShadow.4, ptr align 1 [[ARG1:%.*]], i64 1, i1 false)
+; CHECK-NEXT: br label [[MERGE]]
+; CHECK: merge:
+; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
+; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 [[ARG1]], ptr addrspace(3) align 8 @ArgShadow.4, i64 1, i1 false)
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4
+; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
+; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
+; CHECK: wg_leader:
+; CHECK-NEXT: store ptr addrspace(4) [[ARG:%.*]], ptr [[TMP0]], align 8
+; CHECK-NEXT: br label [[WG_CF]]
+; CHECK: wg_cf:
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4
+; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
+; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]]
+; CHECK: TestMat:
+; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 8 @WGCopy.3, ptr align 1 [[TMP1]], i64 1, i1 false)
+; CHECK-NEXT: [[MAT_LD:%.*]] = load ptr addrspace(4), ptr [[TMP0]], align 8
+; CHECK-NEXT: store ptr addrspace(4) [[MAT_LD]], ptr addrspace(3) @WGCopy.2, align 8
+; CHECK-NEXT: br label [[LEADERMAT]]
+; CHECK: LeaderMat:
+; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
+; CHECK-NEXT: [[MAT_LD1:%.*]] = load ptr addrspace(4), ptr addrspace(3) @WGCopy.2, align 8
+; CHECK-NEXT: store ptr addrspace(4) [[MAT_LD1]], ptr [[TMP0]], align 8
+; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 [[TMP1]], ptr addrspace(3) align 8 @WGCopy.3, i64 1, i1 false)
+; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
+; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARG1]] to ptr addrspace(4)
+; CHECK-NEXT: call spir_func void @bar(ptr addrspace(4) [[TMP5]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+bb:
+  %1 = alloca ptr addrspace(4), align 8
+  %2 = alloca %struct.foo.0, align 1
+  store ptr addrspace(4) %arg, ptr %1, align 8
+  %3 = addrspacecast ptr %arg1 to ptr addrspace(4)
+  call spir_func void @bar(ptr addrspace(4) %3, ptr byval(%struct.foo.0) align 1 %2)
   ret void
 }
diff --git a/sycl/test-e2e/HierPar/hier_par_indirect.cpp b/sycl/test-e2e/HierPar/hier_par_indirect.cpp
index b0a1787368f97..073013a48b63b 100644
--- a/sycl/test-e2e/HierPar/hier_par_indirect.cpp
+++ b/sycl/test-e2e/HierPar/hier_par_indirect.cpp
@@ -19,12 +19,36 @@ void __attribute__((noinline)) foo(sycl::group<1> work_group) {
   work_group.parallel_for_work_item([&](sycl::h_item<1> index) {});
 }
 
+void __attribute__((noinline)) bar(sycl::group<1> work_group) {
+  foo(work_group);
+}
+
 int main(int argc, char **argv) {
   sycl::queue q;
+
+  // Try a single indirect call, two indirect calls of different depths, and an
+  // indirect call accompanied by multiple parallel_for_work_item calls in the
+  // same work-group scope.
   q.submit([&](sycl::handler &cgh) {
     cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128},
                                 ([=](sycl::group<1> wGroup) { foo(wGroup); }));
   }).wait();
+  q.submit([&](sycl::handler &cgh) {
+     cgh.parallel_for_work_group(
+         sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) {
+           foo(wGroup); // 1-layer indirect call
+           bar(wGroup); // 2-layer indirect call since bar calls foo
+         }));
+   }).wait();
+  q.submit([&](sycl::handler &cgh) {
+     cgh.parallel_for_work_group(
+         sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) {
+           wGroup.parallel_for_work_item([&](sycl::h_item<1> index) {});
+           foo(wGroup);
+           wGroup.parallel_for_work_item([&](sycl::h_item<1> index) {});
+         }));
+   }).wait();
+
   std::cout << "test passed" << std::endl;
   return 0;
 }
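
Editor's note, for illustration only (not part of the patch): the sketch below models the transitive search that hasCallToAFuncWithPFWIMetadata performs, but in plain C++ over a toy call graph rather than over LLVM IR. Starting from a function, it walks the direct callees and reports whether any path eventually reaches a function marked as parallel_for_work_item. The names ToyFunction, CallGraph and callsPFWITransitively are invented for this example; the Visited set is also an addition of the sketch (so recursive call graphs terminate) and has no counterpart in the patch.

// Toy model of the transitive PFWI search -- plain C++, no LLVM dependencies.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct ToyFunction {
  bool MarkedPFWI;                  // stands in for the parallel_for_work_item metadata
  std::vector<std::string> Callees; // direct calls made by this function
};

using CallGraph = std::map<std::string, ToyFunction>;

// Returns true if the named function calls, possibly through several layers of
// direct calls, a function marked as parallel_for_work_item.
static bool callsPFWITransitively(const CallGraph &CG, const std::string &Name,
                                  std::set<std::string> &Visited) {
  if (!Visited.insert(Name).second)
    return false; // already explored this function
  auto It = CG.find(Name);
  if (It == CG.end())
    return false;
  for (const std::string &Callee : It->second.Callees) {
    auto CalleeIt = CG.find(Callee);
    if (CalleeIt != CG.end() && CalleeIt->second.MarkedPFWI)
      return true; // direct call to a PFWI-marked function
    if (callsPFWITransitively(CG, Callee, Visited))
      return true; // PFWI reached further down the call stack
  }
  return false;
}

int main() {
  // Mirrors the e2e test: foo() invokes parallel_for_work_item, bar() calls foo().
  CallGraph CG;
  CG["pfwi"] = {true, {}};
  CG["foo"] = {false, {"pfwi"}};
  CG["bar"] = {false, {"foo"}};
  CG["baz"] = {false, {}}; // never reaches parallel_for_work_item

  for (const char *Name : {"foo", "bar", "baz"}) {
    std::set<std::string> Visited;
    std::cout << Name << " reaches PFWI: "
              << (callsPFWITransitively(CG, Name, Visited) ? "yes" : "no") << "\n";
  }
  return 0;
}

In these terms, the pass change amounts to: a call whose callee "reaches PFWI" is excluded from the leader-only lowering range, since parallel_for_work_item must be executed by every work item, not just the work-group leader.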