-
Notifications
You must be signed in to change notification settings - Fork 309
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement scalarization of some vector operations that operate on vectors containing only one actual value. This happens often in code generated for XE targets. On CPU targets it is rare, but it happens as well.
- Loading branch information
1 parent
cf2cd06
commit 2eb90ed
Showing
8 changed files
with
255 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
/* | ||
Copyright (c) 2024, Intel Corporation | ||
SPDX-License-Identifier: BSD-3-Clause | ||
*/ | ||
|
||
#include "ScalarizePass.h" | ||
#include "builtins-decl.h" | ||
|
||
#include "llvm/IR/IRBuilder.h" | ||
#include "llvm/Transforms/Utils/Local.h" | ||
|
||
namespace ispc { | ||
|
||
using namespace llvm::PatternMatch; | ||
|
||
// Tries to replace "broadcast of lane 0 of a vector binary operation" with a
// scalar binary operation followed by the same broadcast.
//
// Matched shape (the two operands of the binary operator may come in either order):
//   %v = insertelement <N x T> undef, T %x, 0
//   %r = binop <N x T> %v, <T C, T poison/undef, ..., T poison/undef>
//   %s = shufflevector <N x T> %r, <N x T> undef/poison, <zero mask>     ; == inst
//
// Emitted replacement, inserted right before `inst`:
//   %1 = binop T %x, C                  ; operand order and IR flags of %r are kept
//   %2 = insertelement <N x T> undef, T %1, i64 0
//   %3 = shufflevector <N x T> %2, <N x T> undef, <same zero mask>
//
// @param inst  Instruction to inspect; it is not modified or erased here.
// @return      The new broadcast value, or nullptr if `inst` does not match.
//              The caller is expected to redirect the uses of `inst` and to
//              clean up the instructions that became dead.
static llvm::Value *lMatchAndScalarize(llvm::Instruction *inst) {
    // The root must be a zero-mask shuffle of a binary operator. m_Undef() also
    // accepts poison because llvm::PoisonValue derives from llvm::UndefValue,
    // so a separate m_Poison() attempt is not needed.
    llvm::BinaryOperator *BO = nullptr;
    if (!match(inst, m_Shuffle(m_BinOp(BO), m_Undef(), m_ZeroMask()))) {
        return nullptr;
    }

    // One operand must be "insertelement undef, Scalar, 0"; the other one (Vec)
    // is the candidate constant vector. Remember on which side the scalar was
    // found so that the operand order of non-commutative operations is kept.
    llvm::Value *Scalar = nullptr, *Vec = nullptr;
    const bool isScalarFirst = match(BO, m_BinOp(m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt()), m_Value(Vec)));
    if (!isScalarFirst && !match(BO, m_BinOp(m_Value(Vec), m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))) {
        return nullptr;
    }

    llvm::ConstantVector *CV = llvm::dyn_cast<llvm::ConstantVector>(Vec);
    if (CV == nullptr) {
        return nullptr;
    }

    // Every element except the first one has to be undef or poison
    // (isa<UndefValue> is true for both).
    const unsigned N = CV->getType()->getNumElements();
    for (unsigned i = 1; i < N; ++i) {
        if (!llvm::isa<llvm::UndefValue>(CV->getAggregateElement(i))) {
            return nullptr;
        }
    }

    // "0u" selects the getAggregateElement(unsigned) overload; a plain 0 would
    // be ambiguous with the Constant * overload.
    llvm::Value *ConstElt = CV->getAggregateElement(0u);
    llvm::Value *Op1 = isScalarFirst ? Scalar : ConstElt;
    llvm::Value *Op2 = isScalarFirst ? ConstElt : Scalar;
    Assert(Op1 && Op2);

    llvm::IRBuilder<> Builder(inst->getParent()->getParent()->getContext());
    Builder.SetInsertPoint(inst);

    llvm::Value *ScalarBinOp = Builder.CreateBinOp(BO->getOpcode(), Op1, Op2);
    // Keep nsw/nuw/exact/fast-math flags of the vector operation: lane 0 of the
    // vector operation and the scalar operation have the same semantics. The
    // builder may constant-fold the operation, in which case there is no
    // instruction to put flags on.
    if (llvm::Instruction *ScalarInst = llvm::dyn_cast<llvm::Instruction>(ScalarBinOp)) {
        ScalarInst->copyIRFlags(BO);
    }

    // Re-create the broadcast with the original (all-zero) shuffle mask, so the
    // result type is the same as the type of the replaced shuffle.
    llvm::Value *UV = llvm::UndefValue::get(Vec->getType());
    llvm::Value *newVec = Builder.CreateInsertElement(UV, ScalarBinOp, static_cast<uint64_t>(0));
    llvm::Value *OutZeroMask = llvm::cast<llvm::ShuffleVectorInst>(inst)->getShuffleMaskForBitcode();
    return Builder.CreateShuffleVector(newVec, UV, OutZeroMask);
}
|
||
// Applies lMatchAndScalarize() to every instruction of the basic block and
// substitutes each match with the value it returns.
// Returns true if at least one instruction was replaced.
bool ScalarizePass::matchAndReplace(llvm::BasicBlock &bb) {
    DEBUG_START_BB("ScalarizePass");

    bool modifiedAny = false;

    // The instruction list changes while we walk it, so step the iterator past
    // the current instruction before that instruction is processed.
    llvm::BasicBlock::iterator iter = bb.begin();
    const llvm::BasicBlock::iterator e = bb.end();
    while (iter != e) {
        llvm::Instruction *inst = &*iter;
        ++iter;

        llvm::Value *newValue = lMatchAndScalarize(inst);
        if (newValue == nullptr) {
            continue;
        }

        inst->replaceAllUsesWith(newValue);
        llvm::RecursivelyDeleteTriviallyDeadInstructions(inst);
        modifiedAny = true;
    }

    DEBUG_END_BB("ScalarizePass");

    return modifiedAny;
}
|
||
// New pass manager entry point: processes every basic block of F.
// Reports all analyses as preserved when nothing was changed; otherwise only
// the CFG analyses set is reported as preserved.
llvm::PreservedAnalyses ScalarizePass::run(llvm::Function &F, llvm::FunctionAnalysisManager &FAM) {
    llvm::TimeTraceScope FuncScope("ScalarizePass::run", F.getName());

    bool modifiedAny = false;
    for (llvm::BasicBlock &BB : F) {
        // Every block is visited, regardless of the result for earlier blocks.
        if (matchAndReplace(BB)) {
            modifiedAny = true;
        }
    }

    if (modifiedAny) {
        llvm::PreservedAnalyses PA;
        PA.preserveSet<llvm::CFGAnalyses>();
        return PA;
    }

    // No changes, all analyses are preserved.
    return llvm::PreservedAnalyses::all();
}
|
||
} // namespace ispc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
/* | ||
Copyright (c) 2024, Intel Corporation | ||
SPDX-License-Identifier: BSD-3-Clause | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include "ISPCPass.h" | ||
|
||
namespace ispc { | ||
|
||
// ScalarizePass change some vector operations to scalar ones. | ||
// There are patterns with arithmetic operations with vectors that fully | ||
// consist of undef/poison values except the first element. In this case, | ||
// it can be replaced with a scalar operation with the following broadcast. | ||
// This can help to reduce the number of vector operations and reduce the | ||
// amount of used vector registers. It is especially useful for XE targets | ||
// where such pattern happen often. Although, it can be useful for CPU targets | ||
// as well. This currently covering binary operations with such values. | ||
// More details can be found in the test/lit-tests/scalarize.ll file. | ||
|
||
struct ScalarizePass : public llvm::PassInfoMixin<ScalarizePass> { | ||
|
||
llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &FAM); | ||
|
||
private: | ||
bool matchAndReplace(llvm::BasicBlock &BB); | ||
}; | ||
|
||
} // namespace ispc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
; RUN: %{ispc-opt} --passes=scalarize %s -o - | FileCheck %s | ||
|
||
; Constant with a poison tail as the second operand: the add is expected to be done on scalars and then broadcast. | ||
; CHECK-LABEL: @add | ||
; CHECK-NEXT: %1 = add i32 %a, 1 | ||
; CHECK-NEXT: %2 = insertelement <4 x i32> undef, i32 %1, i64 0 | ||
; CHECK-NEXT: %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer | ||
; CHECK-NEXT: ret <4 x i32> %3 | ||
define <4 x i32> @add(i32 %a) { | ||
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0 | ||
%sum = add <4 x i32> %op1, <i32 1, i32 poison, i32 poison, i32 poison> | ||
%shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer | ||
ret <4 x i32> %shuffled | ||
} | ||
|
||
; Constant as the first operand of a non-commutative operation: the operand order must be kept (1 - %a). | ||
; CHECK-LABEL: @sub | ||
; CHECK-NEXT: %1 = sub i32 1, %a | ||
; CHECK-NEXT: %2 = insertelement <4 x i32> undef, i32 %1, i64 0 | ||
; CHECK-NEXT: %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer | ||
; CHECK-NEXT: ret <4 x i32> %3 | ||
define <4 x i32> @sub(i32 %a) { | ||
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0 | ||
%binop = sub <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, %op1 | ||
%shuffled = shufflevector <4 x i32> %binop, <4 x i32> poison, <4 x i32> zeroinitializer | ||
ret <4 x i32> %shuffled | ||
} | ||
|
||
; Same as @add, but the tail of the constant is undef instead of poison. | ||
; CHECK-LABEL: @undef | ||
; CHECK-NEXT: %1 = add i32 %a, 1 | ||
; CHECK-NEXT: %2 = insertelement <4 x i32> undef, i32 %1, i64 0 | ||
; CHECK-NEXT: %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer | ||
; CHECK-NEXT: ret <4 x i32> %3 | ||
define <4 x i32> @undef(i32 %a) { | ||
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0 | ||
%sum = add <4 x i32> %op1, <i32 1, i32 undef, i32 undef, i32 undef> | ||
%shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer | ||
ret <4 x i32> %shuffled | ||
} | ||
|
||
; Floating-point operation where the broadcast result (<2 x float>) is narrower than the operated vector (<4 x float>); the second shuffle operand is undef. | ||
; CHECK-LABEL: @fmul | ||
; CHECK-NEXT: %1 = fmul float 1.000000e+00, %a | ||
; CHECK-NEXT: %2 = insertelement <4 x float> undef, float %1, i64 0 | ||
; CHECK-NEXT: %3 = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> zeroinitializer | ||
; CHECK-NEXT: ret <2 x float> %3 | ||
define <2 x float> @fmul(float %a, float %b) { | ||
%op1 = insertelement <4 x float> undef, float %a, i32 0 | ||
%binop = fmul <4 x float> <float 1.0, float poison, float poison, float poison>, %op1 | ||
%shuffled = shufflevector <4 x float> %binop, <4 x float> undef, <2 x i32> zeroinitializer | ||
ret <2 x float> %shuffled | ||
} | ||
|
||
; Not supported: icmp is a comparison, not a binary operator, so the code must stay unchanged. | ||
; CHECK-LABEL: @cmp | ||
; CHECK-NEXT: %op1 = insertelement <4 x i32> undef, i32 %a, i32 0 | ||
define <4 x i1> @cmp(i32 %a) { | ||
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0 | ||
%binop = icmp eq <4 x i32> %op1, <i32 1, i32 undef, i32 undef, i32 undef> | ||
%shuffled = shufflevector <4 x i1> %binop, <4 x i1> undef, <4 x i32> zeroinitializer | ||
ret <4 x i1> %shuffled | ||
} | ||
|
||
; Not supported: the non-poison value of the constant is in a position other than the first one. | ||
; CHECK-LABEL: @nonzeroelement | ||
; CHECK-NEXT: %op1 = insertelement <4 x i32> undef, i32 %a, i32 0 | ||
define <4 x i32> @nonzeroelement(i32 %a) { | ||
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0 | ||
%sum = add <4 x i32> %op1, <i32 poison, i32 poison, i32 poison, i32 1> | ||
%shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer | ||
ret <4 x i32> %shuffled | ||
} | ||
|
||
; Not supported: the second operand is not a constant vector (both operands are insertelement results). | ||
; CHECK-LABEL: @nonconst | ||
; CHECK-NEXT: %op1 = insertelement <4 x i32> undef, i32 %a, i32 0 | ||
; CHECK-NEXT: %op2 = insertelement <4 x i32> undef, i32 %b, i32 0 | ||
define <4 x i32> @nonconst(i32 %a, i32 %b) { | ||
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0 | ||
%op2 = insertelement <4 x i32> undef, i32 %b, i32 0 | ||
%sum = add <4 x i32> %op1, %op2 | ||
%shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer | ||
ret <4 x i32> %shuffled | ||
} | ||
|
||
; Not supported: the vector operand comes from a phi node rather than directly
; from an insertelement, so the vector add and the broadcast must stay unchanged.
; CHECK-LABEL: @phi
; CHECK: %binop = add <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, %op1
; CHECK-NEXT: %shuffled = shufflevector <4 x i32> %binop, <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> %shuffled
define <4 x i32> @phi(i32 %a, i32 %b, i32 %c) {
  %cmp = icmp slt i32 %c, 0
  br i1 %cmp, label %block_a, label %block_b

block_a:
  %incoming_a = insertelement <4 x i32> undef, i32 %a, i32 0
  br label %exit

block_b:
  %incoming_b = insertelement <4 x i32> undef, i32 %b, i32 0
  br label %exit

exit:
  %op1 = phi <4 x i32> [ %incoming_a, %block_a ], [ %incoming_b, %block_b ]
  %binop = add <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, %op1
  %shuffled = shufflevector <4 x i32> %binop, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %shuffled
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
// RUN: %{ispc} %s --target=xehpg-x16 --emit-llvm-text -o - | FileCheck %s | ||
|
||
// REQUIRES: XE_ENABLED | ||
|
||
// CHECK-NOT: [[V:%.*]] = add <16 x i64> %base.i, <i64 {{.*}}, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison> | ||
|
||
// Stores a constant vec4f through a uniform pointer at a varying index. | ||
// The CHECK-NOT above requires that no <16 x i64> add with a constant whose only defined lane is the first one is left in the output for this code. | ||
struct vec4f { float x, y, z, w; }; | ||
struct FB { vec4f *buf; }; | ||
unmasked void foo(FB *uniform p, const int index) { | ||
const vec4f color = { 0, 1, 2, 3, }; | ||
p->buf[index] = color; | ||
} |