Skip to content

Commit

Permalink
ScalarizePass: implement
Browse files Browse the repository at this point in the history
Implement scalarization of some vector operations that operate on vectors
containing only one actual value. This happens often in code generated
for XE. On CPU it is rare, but it happens as well.
  • Loading branch information
nurmukhametov committed May 14, 2024
1 parent cf2cd06 commit 2eb90ed
Show file tree
Hide file tree
Showing 8 changed files with 255 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,8 @@ list(APPEND OPT_SOURCES
src/opt/ReplacePseudoMemoryOps.h
src/opt/ReplaceStdlibShiftPass.cpp
src/opt/ReplaceStdlibShiftPass.h
src/opt/ScalarizePass.cpp
src/opt/ScalarizePass.h
src/opt/XeGatherCoalescePass.cpp
src/opt/XeGatherCoalescePass.h
src/opt/XeReplaceLLVMIntrinsics.cpp
Expand Down
1 change: 1 addition & 0 deletions src/opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,7 @@ void ispc::Optimize(llvm::Module *module, int optLevel) {
#endif

optPM.addFunctionPass(PeepholePass());
optPM.addFunctionPass(ScalarizePass());
optPM.addFunctionPass(llvm::ADCEPass());
optPM.commitFunctionToModulePassManager();
optPM.addModulePass(llvm::ModuleInlinerWrapperPass());
Expand Down
1 change: 1 addition & 0 deletions src/opt/ISPCPassRegistry.def
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ FUNCTION_PASS("is-compile-time-constant", IsCompileTimeConstantPass())
FUNCTION_PASS("peephole", PeepholePass())
FUNCTION_PASS("replace-pseudo-memory-ops", ReplacePseudoMemoryOpsPass())
FUNCTION_PASS("replace-stdlib-shift", ReplaceStdlibShiftPass())
FUNCTION_PASS("scalarize", ScalarizePass())
#ifdef ISPC_XE_ENABLED
FUNCTION_PASS("check-ir-for-xe-target", CheckIRForXeTarget())
FUNCTION_PASS("mangle-opencl-builtins", MangleOpenCLBuiltins())
Expand Down
1 change: 1 addition & 0 deletions src/opt/ISPCPasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@
#include "PeepholePass.h"
#include "ReplacePseudoMemoryOps.h"
#include "ReplaceStdlibShiftPass.h"
#include "ScalarizePass.h"
#include "XeGatherCoalescePass.h"
#include "XeReplaceLLVMIntrinsics.h"
105 changes: 105 additions & 0 deletions src/opt/ScalarizePass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
Copyright (c) 2024, Intel Corporation
SPDX-License-Identifier: BSD-3-Clause
*/

#include "ScalarizePass.h"
#include "builtins-decl.h"

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/Local.h"

namespace ispc {

using namespace llvm::PatternMatch;

// Attempts to turn a broadcast of a "lane-0 only" vector binary operation
// into a scalar operation followed by a broadcast.
//
// The recognized shape is (the two operands of the binary operation may come
// in either order; operand order is preserved in the scalar operation):
//
//   %v   = insertelement <N x T> undef, T %s, 0
//   %bo  = <binop> <N x T> %v, <T C, poison/undef, ..., poison/undef>
//   %res = shufflevector <N x T> %bo, <N x T> poison/undef, <zero mask>
//
// and it is rewritten as:
//
//   %sbo = <binop> T %s, C
//   %nv  = insertelement <N x T> undef, T %sbo, 0
//   %res = shufflevector <N x T> %nv, <N x T> undef, <same mask>
//
// inst: the instruction to examine (a candidate shufflevector).
// Returns the replacement value, built immediately before inst, or nullptr
// when the pattern does not match. inst itself is neither modified nor erased
// here; the caller is responsible for replacing its uses.
static llvm::Value *lMatchAndScalarize(llvm::Instruction *inst) {
    llvm::BinaryOperator *BO = nullptr;
    if (!match(inst, m_Shuffle(m_BinOp(BO), m_Poison(), m_ZeroMask())) &&
        !match(inst, m_Shuffle(m_BinOp(BO), m_Undef(), m_ZeroMask()))) {
        return nullptr;
    }
    llvm::ShuffleVectorInst *SVI = llvm::cast<llvm::ShuffleVectorInst>(inst);

    // Find out which operand is the "scalar inserted into lane 0" and which
    // one is the candidate constant vector. Each operand is matched on its
    // own so that a partially successful match of one operand can never leak
    // a stale binding into the handling of the other one.
    llvm::Value *LHS = BO->getOperand(0);
    llvm::Value *RHS = BO->getOperand(1);
    llvm::Value *Scalar = nullptr;
    llvm::Value *Vec = nullptr;
    bool scalarIsLHS = false;
    if (match(LHS, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt()))) {
        scalarIsLHS = true;
        Vec = RHS;
    } else if (match(RHS, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt()))) {
        Vec = LHS;
    } else {
        return nullptr;
    }

    // The other operand must be a constant vector whose lanes 1..N-1 are all
    // poison/undef, i.e. only lane 0 carries a meaningful value.
    llvm::ConstantVector *CV = llvm::dyn_cast<llvm::ConstantVector>(Vec);
    if (CV == nullptr) {
        return nullptr;
    }
    const unsigned N = CV->getType()->getNumElements();
    for (unsigned i = 1; i < N; ++i) {
        llvm::Constant *E = CV->getAggregateElement(i);
        if (!llvm::isa<llvm::PoisonValue>(E) && !llvm::isa<llvm::UndefValue>(E)) {
            return nullptr;
        }
    }

    const unsigned FirstLane = 0;
    llvm::Value *Lane0Const = CV->getAggregateElement(FirstLane);
    Assert(Scalar && Lane0Const);

    // Keep the original operand order: the operation may be non-commutative.
    llvm::Value *Op1 = scalarIsLHS ? Scalar : Lane0Const;
    llvm::Value *Op2 = scalarIsLHS ? Lane0Const : Scalar;

    llvm::IRBuilder<> Builder(inst);
    llvm::Value *ScalarBinOp = Builder.CreateBinOp(BO->getOpcode(), Op1, Op2);
    // Carry nsw/nuw/exact/fast-math flags over to the scalar operation so the
    // rewrite does not lose information later passes rely on. The builder may
    // constant-fold the operation, in which case there is nothing to annotate.
    if (llvm::BinaryOperator *ScalarBO = llvm::dyn_cast<llvm::BinaryOperator>(ScalarBinOp)) {
        ScalarBO->copyIRFlags(BO);
    }

    llvm::Value *UV = llvm::UndefValue::get(Vec->getType());
    llvm::Value *newVec = Builder.CreateInsertElement(UV, ScalarBinOp, static_cast<uint64_t>(0));
    // Reuse the original mask so the result keeps the same number of lanes.
    return Builder.CreateShuffleVector(newVec, UV, SVI->getShuffleMask());
}

// Walks every instruction of the basic block and replaces each one that
// lMatchAndScalarize() can rewrite with its scalarized equivalent.
// Returns true if at least one instruction was replaced.
//
// NOTE(review): the DEBUG_START_BB/DEBUG_END_BB macros are defined outside this
// file and may refer to locals by name, so `bb` and `modifiedAny` keep their
// original names.
bool ScalarizePass::matchAndReplace(llvm::BasicBlock &bb) {
    DEBUG_START_BB("ScalarizePass");

    bool modifiedAny = false;

    llvm::BasicBlock::iterator cursor = bb.begin();
    const llvm::BasicBlock::iterator blockEnd = bb.end();
    while (cursor != blockEnd) {
        // Step past the current instruction before touching it: a successful
        // rewrite erases it from the block.
        llvm::Instruction *candidate = &*cursor;
        ++cursor;

        llvm::Value *replacement = lMatchAndScalarize(candidate);
        if (replacement == nullptr) {
            continue;
        }

        candidate->replaceAllUsesWith(replacement);
        llvm::RecursivelyDeleteTriviallyDeadInstructions(candidate);
        modifiedAny = true;
    }

    DEBUG_END_BB("ScalarizePass");

    return modifiedAny;
}

// Pass entry point: applies matchAndReplace() to every basic block of F.
// Reports all analyses as preserved when nothing changed; otherwise only the
// CFG analyses set is reported as preserved.
llvm::PreservedAnalyses ScalarizePass::run(llvm::Function &F, llvm::FunctionAnalysisManager &FAM) {
    llvm::TimeTraceScope FuncScope("ScalarizePass::run", F.getName());

    bool changed = false;
    for (auto blockIt = F.begin(), blockEnd = F.end(); blockIt != blockEnd; ++blockIt) {
        // Every block is visited regardless of earlier results.
        if (matchAndReplace(*blockIt)) {
            changed = true;
        }
    }

    if (changed) {
        llvm::PreservedAnalyses preserved;
        preserved.preserveSet<llvm::CFGAnalyses>();
        return preserved;
    }

    // No changes, all analyses are preserved.
    return llvm::PreservedAnalyses::all();
}

} // namespace ispc
31 changes: 31 additions & 0 deletions src/opt/ScalarizePass.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
Copyright (c) 2024, Intel Corporation
SPDX-License-Identifier: BSD-3-Clause
*/

#pragma once

#include "ISPCPass.h"

namespace ispc {

// ScalarizePass changes some vector operations into scalar ones.
// Some arithmetic patterns operate on vectors that consist entirely of
// undef/poison values except for the first element. Such an operation can be
// replaced with a scalar operation followed by a broadcast.
// This helps to reduce the number of vector operations and the number of
// vector registers in use. It is especially useful for XE targets, where such
// patterns happen often, although it can be useful for CPU targets as well.
// Currently only binary operations on such values are covered.
// More details can be found in the tests/lit-tests/scalarize.ll file.

struct ScalarizePass : public llvm::PassInfoMixin<ScalarizePass> {

// Pass entry point: processes every basic block of F and returns the set of
// analyses that remain valid afterwards.
llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &FAM);

private:
// Rewrites all matching instructions in BB; returns true if BB was modified.
bool matchAndReplace(llvm::BasicBlock &BB);
};

} // namespace ispc
102 changes: 102 additions & 0 deletions tests/lit-tests/scalarize.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
; RUN: %{ispc-opt} --passes=scalarize %s -o - | FileCheck %s

; Basic case: the scalar is inserted into lane 0 of the left operand and the
; right operand is a constant whose tail lanes are poison. The vector add is
; expected to become a scalar add followed by insertelement + broadcast.
; CHECK-LABEL: @add
; CHECK-NEXT: %1 = add i32 %a, 1
; CHECK-NEXT: %2 = insertelement <4 x i32> undef, i32 %1, i64 0
; CHECK-NEXT: %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> %3
define <4 x i32> @add(i32 %a) {
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0
%sum = add <4 x i32> %op1, <i32 1, i32 poison, i32 poison, i32 poison>
%shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %shuffled
}

; Non-commutative case with the constant as the left operand: the operand
; order has to be kept, so the expected scalar operation is `sub i32 1, %a`.
; CHECK-LABEL: @sub
; CHECK-NEXT: %1 = sub i32 1, %a
; CHECK-NEXT: %2 = insertelement <4 x i32> undef, i32 %1, i64 0
; CHECK-NEXT: %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> %3
define <4 x i32> @sub(i32 %a) {
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0
%binop = sub <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, %op1
%shuffled = shufflevector <4 x i32> %binop, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %shuffled
}

; Same as the basic add case, but the tail lanes of the constant operand are
; undef instead of poison; the rewrite is expected to happen as well.
; CHECK-LABEL: @undef
; CHECK-NEXT: %1 = add i32 %a, 1
; CHECK-NEXT: %2 = insertelement <4 x i32> undef, i32 %1, i64 0
; CHECK-NEXT: %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: ret <4 x i32> %3
define <4 x i32> @undef(i32 %a) {
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0
%sum = add <4 x i32> %op1, <i32 1, i32 undef, i32 undef, i32 undef>
%shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %shuffled
}

; Floating-point case with the constant as the left operand, an undef second
; shuffle operand, and a broadcast result (2 lanes) that is narrower than the
; source vector (4 lanes). The original shuffle mask is expected to be reused.
; CHECK-LABEL: @fmul
; CHECK-NEXT: %1 = fmul float 1.000000e+00, %a
; CHECK-NEXT: %2 = insertelement <4 x float> undef, float %1, i64 0
; CHECK-NEXT: %3 = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: ret <2 x float> %3
define <2 x float> @fmul(float %a, float %b) {
%op1 = insertelement <4 x float> undef, float %a, i32 0
%binop = fmul <4 x float> <float 1.0, float poison, float poison, float poison>, %op1
%shuffled = shufflevector <4 x float> %binop, <4 x float> undef, <2 x i32> zeroinitializer
ret <2 x float> %shuffled
}

; Comparisons are not supported: only binary operators are scalarized, so the
; function body is expected to start with the original insertelement.
; CHECK-LABEL: @cmp
; CHECK-NEXT: %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
define <4 x i1> @cmp(i32 %a) {
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0
%binop = icmp eq <4 x i32> %op1, <i32 1, i32 undef, i32 undef, i32 undef>
%shuffled = shufflevector <4 x i1> %binop, <4 x i1> undef, <4 x i32> zeroinitializer
ret <4 x i1> %shuffled
}

; Not supported: the only non-poison value of the constant operand must be in
; the first lane. Here it is in the last lane, so nothing is rewritten.
; CHECK-LABEL: @nonzeroelement
; CHECK-NEXT: %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
define <4 x i32> @nonzeroelement(i32 %a) {
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0
%sum = add <4 x i32> %op1, <i32 poison, i32 poison, i32 poison, i32 1>
%shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %shuffled
}

; Not supported: the second operand is not a constant vector (both operands
; are results of insertelement), so both insertelement instructions remain.
; CHECK-LABEL: @nonconst
; CHECK-NEXT: %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
; CHECK-NEXT: %op2 = insertelement <4 x i32> undef, i32 %b, i32 0
define <4 x i32> @nonconst(i32 %a, i32 %b) {
%op1 = insertelement <4 x i32> undef, i32 %a, i32 0
%op2 = insertelement <4 x i32> undef, i32 %b, i32 0
%sum = add <4 x i32> %op1, %op2
%shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %shuffled
}

; Not supported: the lane-0 insertelement reaches the binary operation only
; through a phi node, so the vector add and its broadcast must stay untouched.
; CHECK-LABEL: @phi
; CHECK: %op1 = phi <4 x i32>
; CHECK-NEXT: %binop = add <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, %op1
; CHECK-NEXT: %shuffled = shufflevector <4 x i32> %binop
define <4 x i32> @phi(i32 %a, i32 %b, i32 %c) {
%cmp = icmp slt i32 %c, 0
br i1 %cmp, label %block_a, label %block_b

block_a:
%incoming_a = insertelement <4 x i32> undef, i32 %a, i32 0
br label %exit

block_b:
%incoming_b = insertelement <4 x i32> undef, i32 %b, i32 0
br label %exit

exit:
%op1 = phi <4 x i32> [ %incoming_a, %block_a ], [ %incoming_b, %block_b ]
%binop = add <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, %op1
%shuffled = shufflevector <4 x i32> %binop, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %shuffled
}

12 changes: 12 additions & 0 deletions tests/lit-tests/undef-vectors-arith.ispc
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %{ispc} %s --target=xehpg-x16 --emit-llvm-text -o - | FileCheck %s

// REQUIRES: XE_ENABLED

// CHECK-NOT: [[V:%.*]] = add <16 x i64> %base.i, <i64 {{.*}}, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison>

// Test input: stores a constant vec4f into p->buf[index]. The FileCheck
// directive at the top of this file requires that the emitted IR contains no
// 16-wide i64 add of %base.i with a constant whose lanes 1..15 are poison.
// NOTE(review): presumably that add comes from the address computation for
// the store and is what ScalarizePass removes - confirm against the emitted IR.
struct vec4f { float x, y, z, w; };
struct FB { vec4f *buf; };
unmasked void foo(FB *uniform p, const int index) {
const vec4f color = { 0, 1, 2, 3, };
p->buf[index] = color;
}

0 comments on commit 2eb90ed

Please sign in to comment.