ScalarizePass: implement

Implement scalarization of some vector operations whose operands are vectors containing only one actual value. This happens often in code generated for Xe targets. On CPU it is rare, but it happens as well.
ispc · May 14, 2024 · 2eb90ed · 2eb90ed
1 parent cf2cd06
commit 2eb90ed
Show file tree

Hide file tree

Showing 8 changed files with 255 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -507,6 +507,8 @@ list(APPEND OPT_SOURCES
     src/opt/ReplacePseudoMemoryOps.h
     src/opt/ReplaceStdlibShiftPass.cpp
     src/opt/ReplaceStdlibShiftPass.h
+    src/opt/ScalarizePass.cpp
+    src/opt/ScalarizePass.h
     src/opt/XeGatherCoalescePass.cpp
     src/opt/XeGatherCoalescePass.h
     src/opt/XeReplaceLLVMIntrinsics.cpp

diff --git a/src/opt.cpp b/src/opt.cpp
@@ -646,6 +646,7 @@ void ispc::Optimize(llvm::Module *module, int optLevel) {
 #endif
 
         optPM.addFunctionPass(PeepholePass());
+        optPM.addFunctionPass(ScalarizePass());
         optPM.addFunctionPass(llvm::ADCEPass());
         optPM.commitFunctionToModulePassManager();
         optPM.addModulePass(llvm::ModuleInlinerWrapperPass());

diff --git a/src/opt/ISPCPassRegistry.def b/src/opt/ISPCPassRegistry.def
@@ -15,6 +15,7 @@ FUNCTION_PASS("is-compile-time-constant", IsCompileTimeConstantPass())
 FUNCTION_PASS("peephole", PeepholePass())
 FUNCTION_PASS("replace-pseudo-memory-ops", ReplacePseudoMemoryOpsPass())
 FUNCTION_PASS("replace-stdlib-shift", ReplaceStdlibShiftPass())
+FUNCTION_PASS("scalarize", ScalarizePass())
 #ifdef ISPC_XE_ENABLED
 FUNCTION_PASS("check-ir-for-xe-target", CheckIRForXeTarget())
 FUNCTION_PASS("mangle-opencl-builtins", MangleOpenCLBuiltins())

diff --git a/src/opt/ISPCPasses.h b/src/opt/ISPCPasses.h
@@ -21,5 +21,6 @@
 #include "PeepholePass.h"
 #include "ReplacePseudoMemoryOps.h"
 #include "ReplaceStdlibShiftPass.h"
+#include "ScalarizePass.h"
 #include "XeGatherCoalescePass.h"
 #include "XeReplaceLLVMIntrinsics.h"
diff --git a/src/opt/ScalarizePass.cpp b/src/opt/ScalarizePass.cpp
@@ -0,0 +1,105 @@
+/*
+  Copyright (c) 2024, Intel Corporation
+
+  SPDX-License-Identifier: BSD-3-Clause
+*/
+
+#include "ScalarizePass.h"
+#include "builtins-decl.h"
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+namespace ispc {
+
+using namespace llvm::PatternMatch;
+
+// Returns true if every element of CV except the first one is undef or poison.
+static bool lIsUndefExceptFirstLane(llvm::ConstantVector *CV) {
+    const unsigned N = CV->getType()->getNumElements();
+    for (unsigned i = 1; i < N; i++) {
+        // PoisonValue derives from UndefValue, so a single check covers both.
+        if (!llvm::isa<llvm::UndefValue>(CV->getAggregateElement(i))) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Tries to scalarize the pattern
+//
+//   %v   = insertelement <N x T> undef, T %s, 0
+//   %bo  = binop <N x T> %v, <T C, T undef, ...>     ; operands in either order
+//   %res = shufflevector %bo, undef/poison, zeroinitializer
+//
+// into a scalar "binop T %s, C" whose result is inserted into lane 0 of an
+// undef vector and broadcast with the original zero mask.
+//
+// Returns the new broadcast value (created right before inst) or nullptr if
+// inst does not match. The caller is responsible for replacing the uses of
+// inst and for removing the instructions that became dead.
+static llvm::Value *lMatchAndScalarize(llvm::Instruction *inst) {
+    llvm::BinaryOperator *BO = nullptr;
+    if (!match(inst, m_Shuffle(m_BinOp(BO), m_Poison(), m_ZeroMask())) &&
+        !match(inst, m_Shuffle(m_BinOp(BO), m_Undef(), m_ZeroMask()))) {
+        return nullptr;
+    }
+
+    // One operand must be a scalar inserted into lane 0 of an undef vector and
+    // the other one a constant vector. Each operand is matched on its own, so
+    // Scalar is always (re)bound by the match that finally succeeds.
+    llvm::Value *LHS = BO->getOperand(0);
+    llvm::Value *RHS = BO->getOperand(1);
+    llvm::Value *Scalar = nullptr;
+    llvm::ConstantVector *CV = nullptr;
+    bool isScalarLHS = false;
+    if (match(LHS, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt()))) {
+        CV = llvm::dyn_cast<llvm::ConstantVector>(RHS);
+        isScalarLHS = true;
+    } else if (match(RHS, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt()))) {
+        CV = llvm::dyn_cast<llvm::ConstantVector>(LHS);
+    }
+
+    // Only lane 0 of the constant may carry an actual value.
+    if (CV == nullptr || !lIsUndefExceptFirstLane(CV)) {
+        return nullptr;
+    }
+
+    // A named unsigned index avoids the ambiguity of a literal 0 between the
+    // unsigned and the Constant* overloads of getAggregateElement.
+    const unsigned FirstLane = 0;
+    llvm::Value *ConstElt = CV->getAggregateElement(FirstLane);
+    Assert(Scalar && ConstElt);
+
+    // Keep the original operand order: the operation may be non-commutative.
+    llvm::Value *Op1 = isScalarLHS ? Scalar : ConstElt;
+    llvm::Value *Op2 = isScalarLHS ? ConstElt : Scalar;
+
+    llvm::Value *OutZeroMask = llvm::cast<llvm::ShuffleVectorInst>(inst)->getShuffleMaskForBitcode();
+    llvm::Type *VecType = CV->getType();
+    llvm::Value *UV = llvm::UndefValue::get(VecType);
+
+    llvm::IRBuilder<> Builder(inst->getParent()->getParent()->getContext());
+    Builder.SetInsertPoint(inst);
+
+    llvm::Value *ScalarBinOp = Builder.CreateBinOp(BO->getOpcode(), Op1, Op2);
+    // Lane 0 of the vector operation and the scalar operation have the same
+    // semantics, so nsw/nuw/exact/fast-math flags can be carried over instead
+    // of being silently dropped. The builder may have constant-folded the
+    // operation, in which case there is no instruction to put flags on.
+    if (llvm::Instruction *NewInst = llvm::dyn_cast<llvm::Instruction>(ScalarBinOp)) {
+        NewInst->copyIRFlags(BO);
+    }
+
+    llvm::Value *newVec = Builder.CreateInsertElement(UV, ScalarBinOp, uint64_t{0});
+    return Builder.CreateShuffleVector(newVec, UV, OutZeroMask);
+}
+
+// Runs the scalarization matcher over every instruction of the basic block.
+// Returns true if at least one instruction was replaced.
+bool ScalarizePass::matchAndReplace(llvm::BasicBlock &bb) {
+    DEBUG_START_BB("ScalarizePass");
+
+    bool changed = false;
+
+    llvm::BasicBlock::iterator cursor = bb.begin();
+    llvm::BasicBlock::iterator last = bb.end();
+    while (cursor != last) {
+        // Step past the current instruction first: it may be erased below,
+        // which would invalidate an iterator still pointing at it.
+        llvm::Instruction *current = &*cursor;
+        ++cursor;
+
+        llvm::Value *replacement = lMatchAndScalarize(current);
+        if (replacement == nullptr) {
+            continue;
+        }
+
+        current->replaceAllUsesWith(replacement);
+        // Removes the shuffle and whatever feeding instructions became dead.
+        llvm::RecursivelyDeleteTriviallyDeadInstructions(current);
+        changed = true;
+    }
+
+    DEBUG_END_BB("ScalarizePass");
+
+    return changed;
+}
+
+// New pass manager entry point: applies matchAndReplace to every basic block
+// of the function and reports which analyses remain valid.
+llvm::PreservedAnalyses ScalarizePass::run(llvm::Function &F, llvm::FunctionAnalysisManager &FAM) {
+    llvm::TimeTraceScope FuncScope("ScalarizePass::run", F.getName());
+
+    bool changed = false;
+    for (llvm::BasicBlock &block : F) {
+        if (matchAndReplace(block)) {
+            changed = true;
+        }
+    }
+
+    if (changed) {
+        // Instructions were rewritten but no blocks or edges were touched,
+        // so only the CFG-level analyses are kept.
+        llvm::PreservedAnalyses preserved;
+        preserved.preserveSet<llvm::CFGAnalyses>();
+        return preserved;
+    }
+
+    // Nothing was modified: every analysis is still valid.
+    return llvm::PreservedAnalyses::all();
+}
+
+} // namespace ispc
diff --git a/src/opt/ScalarizePass.h b/src/opt/ScalarizePass.h
@@ -0,0 +1,31 @@
+/*
+  Copyright (c) 2024, Intel Corporation
+
+  SPDX-License-Identifier: BSD-3-Clause
+*/
+
+#pragma once
+
+#include "ISPCPass.h"
+
+namespace ispc {
+
+// ScalarizePass changes some vector operations to scalar ones.
+// There are patterns where arithmetic operations are applied to vectors that
+// consist entirely of undef/poison values except for the first element. In
+// this case, the operation can be replaced with a scalar operation followed
+// by a broadcast. This helps to reduce the number of vector operations and
+// the number of vector registers in use. It is especially useful for Xe
+// targets, where such patterns happen often, although it can be useful for
+// CPU targets as well. Currently only binary operations on such values are
+// covered.
+// More details can be found in the tests/lit-tests/scalarize.ll file.
+
+struct ScalarizePass : public llvm::PassInfoMixin<ScalarizePass> {
+
+    // Pass entry point. Processes every basic block of F; returns "all
+    // analyses preserved" when nothing changed and only the CFG analyses
+    // otherwise.
+    llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &FAM);
+
+  private:
+    // Scalarizes all matching patterns in BB; returns true if BB was modified.
+    bool matchAndReplace(llvm::BasicBlock &BB);
+};
+
+} // namespace ispc
diff --git a/tests/lit-tests/scalarize.ll b/tests/lit-tests/scalarize.ll
@@ -0,0 +1,102 @@
+; RUN: %{ispc-opt} --passes=scalarize %s -o - | FileCheck %s
+
+; The scalar is the left operand and the poison-tailed constant vector is the
+; right one: expect a scalar add followed by insertelement and a broadcast.
+; CHECK-LABEL: @add
+; CHECK-NEXT:  %1 = add i32 %a, 1
+; CHECK-NEXT:  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
+; CHECK-NEXT:  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  ret <4 x i32> %3
+define <4 x i32> @add(i32 %a) {
+  %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
+  %sum = add <4 x i32> %op1, <i32 1, i32 poison, i32 poison, i32 poison>
+  %shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %shuffled
+}
+
+; The constant vector is the left operand here: the operand order has to be
+; preserved because sub is not commutative (1 - %a, not %a - 1).
+; CHECK-LABEL: @sub
+; CHECK-NEXT:  %1 = sub i32 1, %a
+; CHECK-NEXT:  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
+; CHECK-NEXT:  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  ret <4 x i32> %3
+define <4 x i32> @sub(i32 %a) {
+  %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
+  %binop = sub <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, %op1
+  %shuffled = shufflevector <4 x i32> %binop, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %shuffled
+}
+
+; Same as @add, but the unused lanes of the constant vector are undef instead
+; of poison.
+; CHECK-LABEL: @undef
+; CHECK-NEXT:  %1 = add i32 %a, 1
+; CHECK-NEXT:  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
+; CHECK-NEXT:  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  ret <4 x i32> %3
+define <4 x i32> @undef(i32 %a) {
+  %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
+  %sum = add <4 x i32> %op1, <i32 1, i32 undef, i32 undef, i32 undef>
+  %shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %shuffled
+}
+
+; Floating-point operation whose broadcast result (<2 x float>) is narrower
+; than the <4 x float> operands: the original two-element zero mask is reused.
+; CHECK-LABEL: @fmul
+; CHECK-NEXT:   %1 = fmul float 1.000000e+00, %a
+; CHECK-NEXT:   %2 = insertelement <4 x float> undef, float %1, i64 0
+; CHECK-NEXT:   %3 = shufflevector <4 x float> %2, <4 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:   ret <2 x float> %3
+define <2 x float> @fmul(float %a, float %b) {
+  %op1 = insertelement <4 x float> undef, float %a, i32 0
+  %binop = fmul <4 x float> <float 1.0, float poison, float poison, float poison>, %op1
+  %shuffled = shufflevector <4 x float> %binop, <4 x float> undef, <2 x i32> zeroinitializer
+  ret <2 x float> %shuffled
+}
+
+; Negative test: comparisons are not supported (icmp is not a binary
+; operator), so the function body must be left as is.
+; CHECK-LABEL: @cmp
+; CHECK-NEXT:  %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
+define <4 x i1> @cmp(i32 %a) {
+  %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
+  %binop = icmp eq <4 x i32> %op1, <i32 1, i32 undef, i32 undef, i32 undef>
+  %shuffled = shufflevector <4 x i1> %binop, <4 x i1> undef, <4 x i32> zeroinitializer
+  ret <4 x i1> %shuffled
+}
+
+; Negative test: a non-poison constant element in any position other than the
+; first one is not supported, so the function body must be left as is.
+; CHECK-LABEL: @nonzeroelement
+; CHECK-NEXT:  %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
+define <4 x i32> @nonzeroelement(i32 %a) {
+  %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
+  %sum = add <4 x i32> %op1, <i32 poison, i32 poison, i32 poison, i32 1>
+  %shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %shuffled
+}
+
+; Negative test: a non-constant second operand is not supported, so the
+; function body must be left as is.
+; CHECK-LABEL: @nonconst
+; CHECK-NEXT:  %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
+; CHECK-NEXT:  %op2 = insertelement <4 x i32> undef, i32 %b, i32 0
+define <4 x i32> @nonconst(i32 %a, i32 %b) {
+  %op1 = insertelement <4 x i32> undef, i32 %a, i32 0
+  %op2 = insertelement <4 x i32> undef, i32 %b, i32 0
+  %sum = add <4 x i32> %op1, %op2
+  %shuffled = shufflevector <4 x i32> %sum, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %shuffled
+}
+
+; Negative test: values reaching the binary operation through phi-nodes are
+; not supported, so the vector operation and the shuffle must be left as is.
+; CHECK-LABEL: @phi
+; CHECK:       %op1 = phi <4 x i32> [ %incoming_a, %block_a ], [ %incoming_b, %block_b ]
+; CHECK-NEXT:  %binop = add <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, %op1
+; CHECK-NEXT:  %shuffled = shufflevector <4 x i32> %binop, <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:  ret <4 x i32> %shuffled
+define <4 x i32> @phi(i32 %a, i32 %b, i32 %c) {
+  %cmp = icmp slt i32 %c, 0
+  br i1 %cmp, label %block_a, label %block_b
+
+  block_a:
+    %incoming_a = insertelement <4 x i32> undef, i32 %a, i32 0
+    br label %exit
+
+  block_b:
+    %incoming_b = insertelement <4 x i32> undef, i32 %b, i32 0
+    br label %exit
+
+  exit:
+    %op1 = phi <4 x i32> [ %incoming_a, %block_a ], [ %incoming_b, %block_b ]
+    %binop = add <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>, %op1
+    %shuffled = shufflevector <4 x i32> %binop, <4 x i32> poison, <4 x i32> zeroinitializer
+    ret <4 x i32> %shuffled
+}
+
diff --git a/tests/lit-tests/undef-vectors-arith.ispc b/tests/lit-tests/undef-vectors-arith.ispc
@@ -0,0 +1,12 @@
+// RUN: %{ispc} %s --target=xehpg-x16 --emit-llvm-text -o - | FileCheck %s
+
+// REQUIRES: XE_ENABLED
+
+// CHECK-NOT:   [[V:%.*]] = add <16 x i64> %base.i, <i64 {{.*}}, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison>
+
+// Four-component float vector and a frame buffer holding a pointer to such
+// vectors.
+struct vec4f { float x, y, z, w; };
+struct FB { vec4f *buf; };
+// Stores a constant vec4f into the buffer at a per-program-instance (varying)
+// index. The emitted IR is expected not to contain a <16 x i64> add of the
+// base with a constant vector whose lanes are all poison except the first.
+unmasked void foo(FB *uniform p, const int index) {
+  const vec4f color = { 0, 1, 2, 3, };
+  p->buf[index] = color;
+}