Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 54 additions & 24 deletions llvm/lib/SYCLLowerIR/GlobalOffset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,52 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
return new GlobalOffsetLegacy();
}

// Recursive helper function to collect Loads reachable through GEPs, visiting
// users depth-first.
static void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
SmallVectorImpl<LoadInst *> &Loads) {
Traversed.push_back(P);
if (auto *L = dyn_cast<LoadInst>(P)) // Base case for recursion
Loads.push_back(L);
else {
assert(isa<GetElementPtrInst>(*P));
for (Value *V : P->users())
getLoads(cast<Instruction>(V), Traversed, Loads);
// Gathers the loads that read through the implicit offset pointer, together
// with the instructions that compute the pointers those loads use.
//
// \param ImplicitOffsetIntrinsic  The intrinsic declaration. Every user of it
//                                 is cast to CallInst.
// \param LoadPtrUses  [out] Non-call instructions found by walking the
//                     operands of each load's pointer. They are appended in
//                     post-order: an instruction is pushed only after the
//                     operands it was reached through.
// \param Loads        [out] Load instructions reached from the intrinsic
//                     calls through chains of PHI and GEP instructions.
static void collectGlobalOffsetUses(Function *ImplicitOffsetIntrinsic,
                                    SmallVectorImpl<Instruction *> &LoadPtrUses,
                                    SmallVectorImpl<Instruction *> &Loads) {
  SmallVector<Instruction *, 4> Pending;
  SmallPtrSet<Value *, 4> Seen;

  // Seed the forward walk with the direct users of every intrinsic call.
  for (User *IntrinsicUser : ImplicitOffsetIntrinsic->users()) {
    auto *Call = cast<CallInst>(IntrinsicUser);
    for (User *CallUser : Call->users()) {
      auto *Inst = cast<Instruction>(CallUser);
      Pending.push_back(Inst);
      Seen.insert(Inst);
    }
  }

  // Forward walk: record loads, and keep following users only through PHI
  // and GEP instructions. Any other instruction kind ends the chain.
  while (!Pending.empty()) {
    Instruction *Cur = Pending.pop_back_val();
    if (isa<LoadInst>(Cur)) {
      Loads.push_back(Cur);
      continue;
    }
    if (!isa<PHINode>(Cur) && !isa<GetElementPtrInst>(Cur))
      continue;
    for (User *Next : Cur->users()) {
      if (!Seen.insert(Next).second)
        continue;
      Pending.push_back(cast<Instruction>(Next));
    }
  }

  // Backward walk: starting at each load's pointer operand, visit instruction
  // operands recursively and emit each instruction after its operands.
  // CallInsts are never descended into and never emitted. The set is reused
  // from scratch so PHI cycles and shared defs are visited only once.
  Seen.clear();
  auto VisitDefs = [&](auto &Recurse, Instruction *Inst) -> void {
    Seen.insert(Inst);
    for (Use &Operand : Inst->operands()) {
      auto *Def = dyn_cast<Instruction>(Operand.get());
      if (Def && !isa<CallInst>(Def) && !Seen.contains(Def))
        Recurse(Recurse, Def);
    }
    if (!isa<CallInst>(Inst))
      LoadPtrUses.push_back(Inst);
  };

  for (Instruction *Load : Loads) {
    Seen.insert(Load);
    Value *Ptr = Load->getOperand(0);
    if (!Seen.contains(Ptr))
      VisitDefs(VisitDefs, cast<Instruction>(Ptr));
  }
}

Expand Down Expand Up @@ -199,32 +235,26 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
// Add implicit parameters to all direct and indirect users of the offset
addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr, KCache);
}
SmallVector<CallInst *, 4> Worklist;
SmallVector<LoadInst *, 4> Loads;
SmallVector<Instruction *, 4> Loads;
SmallVector<Instruction *, 4> PtrUses;

// Collect all GEPs and Loads from the intrinsic's CallInsts
for (Value *V : ImplicitOffsetIntrinsic->users()) {
Worklist.push_back(cast<CallInst>(V));
for (Value *V2 : V->users())
getLoads(cast<Instruction>(V2), PtrUses, Loads);
}
collectGlobalOffsetUses(ImplicitOffsetIntrinsic, PtrUses, Loads);

// Replace each use of a collected Load with a Constant 0
for (LoadInst *L : Loads)
for (Instruction *L : Loads) {
L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
L->eraseFromParent();
}

// Remove all collected Loads and GEPs from the kernel.
// PtrUses is returned by `getLoads` in topological order.
// PtrUses is returned by `collectGlobalOffsetUses` in topological order.
// Walk it backwards so we don't violate users.
for (auto *I : reverse(PtrUses))
I->eraseFromParent();

// Remove all collected CallInsts from the kernel.
for (CallInst *CI : Worklist) {
auto *I = cast<Instruction>(CI);
I->eraseFromParent();
}
for (auto *U : make_early_inc_range(ImplicitOffsetIntrinsic->users()))
cast<Instruction>(U)->eraseFromParent();

// Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
// it.
Expand Down
96 changes: 96 additions & 0 deletions llvm/test/CodeGen/AMDGPU/global-offset-phi.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s

; Check that phi nodes are handled correctly when collecting the defs of a
; load's pointer operand.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:
Would it make sense to add a test where the result of the load is used? What would happen to those uses?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Would it make sense to add a test where the result of the load is used? What would happen to those uses?

Done. I updated the test to have detailed checks, thanks.


target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"

; Each predecessor of %b4 produces the pointer differently: %b1 passes the raw
; intrinsic call result, %b2 and %b3 pass a GEP at byte offset 4 and 8. A phi
; merges the three pointers and two loads read through it.
; The CHECK lines expect the calls, GEPs, phi and loads to be gone, with each
; load result replaced by the constant 0 in the zext operands.
define i64 @test_phi(i32 %x) {
; CHECK-LABEL: define i64 @test_phi(
; CHECK-SAME: i32 [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: switch i32 [[X]], label %[[B5:.*]] [
; CHECK-NEXT: i32 0, label %[[B1:.*]]
; CHECK-NEXT: i32 1, label %[[B2:.*]]
; CHECK-NEXT: i32 2, label %[[B3:.*]]
; CHECK-NEXT: ]
; CHECK: [[B1]]:
; CHECK-NEXT: br label %[[B4:.*]]
; CHECK: [[B2]]:
; CHECK-NEXT: br label %[[B4]]
; CHECK: [[B3]]:
; CHECK-NEXT: br label %[[B4]]
; CHECK: [[B4]]:
; CHECK-NEXT: [[EXT1:%.*]] = zext i32 0 to i64
; CHECK-NEXT: [[EXT2:%.*]] = zext i32 0 to i64
; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i64 [[EXT1]], [[EXT2]]
; CHECK-NEXT: ret i64 [[RES]]
; CHECK: [[B5]]:
; CHECK-NEXT: unreachable
;
entry:
switch i32 %x, label %b5 [
i32 0, label %b1
i32 1, label %b2
i32 2, label %b3
]

; Incoming value is the intrinsic call itself.
b1: ; preds = %entry
%offset0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
br label %b4

; Incoming value is a GEP 4 bytes past the intrinsic result.
b2: ; preds = %entry
%offset1 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
%gep1 = getelementptr inbounds nuw i8, ptr addrspace(5) %offset1, i32 4
br label %b4

; Incoming value is a GEP 8 bytes past the intrinsic result.
b3: ; preds = %entry
%offset2 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
%gep2 = getelementptr inbounds nuw i8, ptr addrspace(5) %offset2, i32 8
br label %b4

; Both loads share the same phi as their pointer operand, and their results
; are used by the zext/add chain that is returned.
b4: ; preds = %b3, %b2, %b1
%p = phi ptr addrspace(5) [ %offset0, %b1 ], [ %gep1, %b2 ], [ %gep2, %b3 ]
%load1 = load i32, ptr addrspace(5) %p, align 4
%load2 = load i32, ptr addrspace(5) %p, align 4
%ext1 = zext i32 %load1 to i64
%ext2 = zext i32 %load2 to i64
%res = add nuw nsw i64 %ext1, %ext2
ret i64 %res

b5: ; preds = %entry
unreachable
}

; CHECK-LABEL: define i64 @test_phi_with_offset(
; CHECK-SAME: i32 [[X:%.*]], ptr addrspace(5) [[PTR:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: switch i32 [[X]], label %[[B5:.*]] [
; CHECK-NEXT: i32 0, label %[[B1:.*]]
; CHECK-NEXT: i32 1, label %[[B2:.*]]
; CHECK-NEXT: i32 2, label %[[B3:.*]]
; CHECK-NEXT: ]
; CHECK: [[B1]]:
; CHECK-NEXT: br label %[[B4:.*]]
; CHECK: [[B2]]:
; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[PTR]], i32 4
; CHECK-NEXT: br label %[[B4]]
; CHECK: [[B3]]:
; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[PTR]], i32 8
; CHECK-NEXT: br label %[[B4]]
; CHECK: [[B4]]:
; CHECK-NEXT: [[P:%.*]] = phi ptr addrspace(5) [ [[PTR]], %[[B1]] ], [ [[GEP1]], %[[B2]] ], [ [[GEP2]], %[[B3]] ]
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(5) [[P]], align 4
; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(5) [[P]], align 4
; CHECK-NEXT: [[EXT1:%.*]] = zext i32 [[LOAD1]] to i64
; CHECK-NEXT: [[EXT2:%.*]] = zext i32 [[LOAD2]] to i64
; CHECK-NEXT: [[RES:%.*]] = add nuw nsw i64 [[EXT1]], [[EXT2]]
; CHECK-NEXT: ret i64 [[RES]]
; CHECK: [[B5]]:
; CHECK-NEXT: unreachable

declare ptr addrspace(5) @llvm.amdgcn.implicit.offset()

!llvm.module.flags = !{!0}

!0 = !{i32 1, !"sycl-device", i32 1}
34 changes: 34 additions & 0 deletions llvm/test/CodeGen/NVPTX/global-offset-two-loads.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
; RUN: opt -bugpoint-enable-legacy-pm -globaloffset %s -S -o - | FileCheck %s

target datalayout = "e-p6:32:32-i64:64-i128:128-i256:256-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

declare ptr @llvm.nvvm.implicit.offset()

; Two loads share one intrinsic call: %load1 reads through a GEP at byte
; offset 4 and %load2 reads the call result directly.
; The CHECK lines expect the call, the GEP and both loads to be removed,
; leaving an add whose operands are both the constant 0.
define i32 @test_two_loads() {
; CHECK-LABEL: define i32 @test_two_loads() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[RES:%.*]] = add i32 0, 0
; CHECK-NEXT: ret i32 [[RES]]
;
entry:
%offset = tail call ptr @llvm.nvvm.implicit.offset()
%gep = getelementptr inbounds nuw i8, ptr %offset, i64 4
%load1 = load i32, ptr %gep, align 4
%load2 = load i32, ptr %offset, align 4
%res = add i32 %load1, %load2
ret i32 %res
}

; CHECK-LABEL: define i32 @test_two_loads_with_offset(
; CHECK-SAME: ptr [[PTR:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 4
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[GEP]], align 4
; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[PTR]], align 4
; CHECK-NEXT: [[RES:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
; CHECK-NEXT: ret i32 [[RES]]

!llvm.module.flags = !{!0}

!0 = !{i32 1, !"sycl-device", i32 1}