diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp index 7325da9f258bf..13957fbccafab 100644 --- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp +++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp @@ -60,16 +60,52 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() { return new GlobalOffsetLegacy(); } -// Recursive helper function to collect Loads from GEPs in a BFS fashion. -static void getLoads(Instruction *P, SmallVectorImpl &Traversed, - SmallVectorImpl &Loads) { - Traversed.push_back(P); - if (auto *L = dyn_cast(P)) // Base case for recursion - Loads.push_back(L); - else { - assert(isa(*P)); - for (Value *V : P->users()) - getLoads(cast(V), Traversed, Loads); +// Helper function to collect all GEPs, PHIs and Loads in post-order. +static void collectGlobalOffsetUses(Function *ImplicitOffsetIntrinsic, + SmallVectorImpl &LoadPtrUses, + SmallVectorImpl &Loads) { + SmallVector WorkList; + SmallPtrSet Visited; + + // Find load instructions. + for (auto *U : ImplicitOffsetIntrinsic->users()) { + for (auto *U2 : cast(U)->users()) { + auto *I = cast(U2); + WorkList.push_back(I); + Visited.insert(I); + } + } + while (!WorkList.empty()) { + Instruction *I = WorkList.pop_back_val(); + if (isa(I) || isa(I)) { + for (User *U : I->users()) + if (Visited.insert(U).second) + WorkList.push_back(cast(U)); + } + if (isa(I)) + Loads.push_back(I); + } + + // For each load, find its defs by post-order walking operand use. + Visited.clear(); + for (auto *LI : Loads) { + Use *OpUse0 = &LI->getOperandUse(0); + auto PostOrderTraveral = [&](auto &Self, Use &U) -> void { + auto *I = cast(U.get()); + Visited.insert(I); + for (auto &Op : I->operands()) { + auto *OpI = dyn_cast(Op.get()); + if (!OpI || isa(OpI)) + continue; + if (!Visited.contains(OpI)) + Self(Self, Op); + } + if (!isa(I)) + LoadPtrUses.push_back(I); + }; + Visited.insert(LI); + if (!Visited.contains(OpUse0->get())) + PostOrderTraveral(PostOrderTraveral, *OpUse0); } } @@ -199,32 +235,26 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) { // Add implicit parameters to all direct and indirect users of the offset addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr, KCache); } - SmallVector Worklist; - SmallVector Loads; + SmallVector Loads; SmallVector PtrUses; - // Collect all GEPs and Loads from the intrinsic's CallInsts - for (Value *V : ImplicitOffsetIntrinsic->users()) { - Worklist.push_back(cast(V)); - for (Value *V2 : V->users()) - getLoads(cast(V2), PtrUses, Loads); - } + collectGlobalOffsetUses(ImplicitOffsetIntrinsic, PtrUses, Loads); // Replace each use of a collected Load with a Constant 0 - for (LoadInst *L : Loads) + for (Instruction *L : Loads) { L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0)); + L->eraseFromParent(); + } // Remove all collected Loads and GEPs from the kernel. - // PtrUses is returned by `getLoads` in topological order. + // PtrUses is returned by `collectGlobalOffsetUses` in topological order. // Walk it backwards so we don't violate users. for (auto *I : reverse(PtrUses)) I->eraseFromParent(); // Remove all collected CallInsts from the kernel. - for (CallInst *CI : Worklist) { - auto *I = cast(CI); - I->eraseFromParent(); - } + for (auto *U : make_early_inc_range(ImplicitOffsetIntrinsic->users())) + cast(U)->eraseFromParent(); // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete // it. diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-phi.ll b/llvm/test/CodeGen/AMDGPU/global-offset-phi.ll new file mode 100644 index 0000000000000..645fdc30ce396 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global-offset-phi.ll @@ -0,0 +1,96 @@ +; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s + +; Check that phi is correctly handled in load's defs collection. + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +define i64 @test_phi(i32 %x) { +; CHECK-LABEL: define i64 @test_phi( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: switch i32 [[X]], label %[[B5:.*]] [ +; CHECK-NEXT: i32 0, label %[[B1:.*]] +; CHECK-NEXT: i32 1, label %[[B2:.*]] +; CHECK-NEXT: i32 2, label %[[B3:.*]] +; CHECK-NEXT: ] +; CHECK: [[B1]]: +; CHECK-NEXT: br label %[[B4:.*]] +; CHECK: [[B2]]: +; CHECK-NEXT: br label %[[B4]] +; CHECK: [[B3]]: +; CHECK-NEXT: br label %[[B4]] +; CHECK: [[B4]]: +; CHECK-NEXT: [[EXT1:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: [[EXT2:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i64 [[EXT1]], [[EXT2]] +; CHECK-NEXT: ret i64 [[RES]] +; CHECK: [[B5]]: +; CHECK-NEXT: unreachable +; +entry: + switch i32 %x, label %b5 [ + i32 0, label %b1 + i32 1, label %b2 + i32 2, label %b3 + ] + +b1: ; preds = %entry + %offset0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() + br label %b4 + +b2: ; preds = %entry + %offset1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() + %gep1 = getelementptr inbounds nuw i8, ptr addrspace(5) %offset1, i32 4 + br label %b4 + +b3: ; preds = %entry + %offset2 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() + %gep2 = getelementptr inbounds nuw i8, ptr addrspace(5) %offset2, i32 8 + br label %b4 + +b4: ; preds = %b3, %b2, %b1 + %p = phi ptr addrspace(5) [ %offset0, %b1 ], [ %gep1, %b2 ], [ %gep2, %b3 ] + %load1 = load i32, ptr addrspace(5) %p, align 4 + %load2 = load i32, ptr addrspace(5) %p, align 4 + %ext1 = zext i32 %load1 to i64 + %ext2 = zext i32 %load2 to i64 + %res = add nuw nsw i64 %ext1, %ext2 + ret i64 %res + +b5: ; preds = %entry + unreachable +} + +; CHECK-LABEL: define i64 @test_phi_with_offset( +; CHECK-SAME: i32 [[X:%.*]], ptr addrspace(5) [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: switch i32 [[X]], label %[[B5:.*]] [ +; CHECK-NEXT: i32 0, label %[[B1:.*]] +; CHECK-NEXT: i32 1, label %[[B2:.*]] +; CHECK-NEXT: i32 2, label %[[B3:.*]] +; CHECK-NEXT: ] +; CHECK: [[B1]]: +; CHECK-NEXT: br label %[[B4:.*]] +; CHECK: [[B2]]: +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[PTR]], i32 4 +; CHECK-NEXT: br label %[[B4]] +; CHECK: [[B3]]: +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[PTR]], i32 8 +; CHECK-NEXT: br label %[[B4]] +; CHECK: [[B4]]: +; CHECK-NEXT: [[P:%.*]] = phi ptr addrspace(5) [ [[PTR]], %[[B1]] ], [ [[GEP1]], %[[B2]] ], [ [[GEP2]], %[[B3]] ] +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(5) [[P]], align 4 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(5) [[P]], align 4 +; CHECK-NEXT: [[EXT1:%.*]] = zext i32 [[LOAD1]] to i64 +; CHECK-NEXT: [[EXT2:%.*]] = zext i32 [[LOAD2]] to i64 +; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i64 [[EXT1]], [[EXT2]] +; CHECK-NEXT: ret i64 [[RES]] +; CHECK: [[B5]]: +; CHECK-NEXT: unreachable + +declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"sycl-device", i32 1} diff --git a/llvm/test/CodeGen/NVPTX/global-offset-two-loads.ll b/llvm/test/CodeGen/NVPTX/global-offset-two-loads.ll new file mode 100644 index 0000000000000..d022b76406cd4 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-offset-two-loads.ll @@ -0,0 +1,34 @@ +; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s + +target datalayout = "e-p6:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +declare ptr @llvm.nvvm.implicit.offset() + +define i32 @test_two_loads() { +; CHECK-LABEL: define i32 @test_two_loads() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[RES:%.*]] = add i32 0, 0 +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %offset = tail call ptr @llvm.nvvm.implicit.offset() + %gep = getelementptr inbounds nuw i8, ptr %offset, i64 4 + %load1 = load i32, ptr %gep, align 4 + %load2 = load i32, ptr %offset, align 4 + %res = add i32 %load1, %load2 + ret i32 %res +} + +; CHECK-LABEL: define i32 @test_two_loads_with_offset( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 4 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[LOAD1]], [[LOAD2]] +; CHECK-NEXT: ret i32 [[RES]] + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"sycl-device", i32 1}