Skip to content

Commit

Permalink
Plumb through tensor.pack e2e execution for llvm-cpu backend. (#11875)
Browse files Browse the repository at this point in the history
All tensor.pack ops with static inner_tile_sizes are vectorized; all of these cases are covered by e2e tests.
  • Loading branch information
hanhanW authored and jpienaar committed May 1, 2023
1 parent 7f37bbf commit 60a8701
Show file tree
Hide file tree
Showing 8 changed files with 536 additions and 16 deletions.
Expand Up @@ -94,7 +94,7 @@ struct OuterParallelAsPartitionableLoops
// loops, but that needs the interface to return the static sizes of the
// loops.
SmallVector<unsigned> partitionableLoops;
auto interfaceOp = cast<OpTy>(op);
auto interfaceOp = cast<TilingInterface>(op);
for (auto [index, iteratorType] :
llvm::enumerate(interfaceOp.getLoopIteratorTypes())) {
if (iteratorType != utils::IteratorType::parallel) {
Expand Down Expand Up @@ -241,6 +241,10 @@ void registerPartitionableLoopsInterfaceModels(DialectRegistry &registry) {
IREE::LinalgExt::AttentionOp::attachInterface<
AllParallelAsPartitionableLoops<IREE::LinalgExt::AttentionOp>>(*ctx);
});
registry.addExtension(+[](MLIRContext *ctx, tensor::TensorDialect *dialect) {
tensor::PackOp::attachInterface<
OuterParallelAsPartitionableLoops<tensor::PackOp>>(*ctx);
});
}

} // namespace iree_compiler
Expand Down
29 changes: 18 additions & 11 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
Expand Up @@ -1068,10 +1068,15 @@ static SmallVector<int64_t> getLinalgExtDefaultWorkgroupTileSizes(
return workgroupTileSizes;
}

static LogicalResult setRootConfig(func::FuncOp entryPointFn,
IREE::LinalgExt::PackOp op) {
SmallVector<int64_t> tileSizes =
getLinalgExtDefaultWorkgroupTileSizes(op, defaultWorkgroupTileSize);
template <typename OpTy>
static LogicalResult setPackOpRootConfig(func::FuncOp entryPointFn, OpTy op) {
// TODO(hanchung): Retire IREE::LinalgExt::PackOp. This is for having
// consistent configurations for pack ops.
static_assert(
llvm::is_one_of<OpTy, IREE::LinalgExt::PackOp, tensor::PackOp>::value,
"applies to only pack operations");
SmallVector<int64_t> tileSizes = getLinalgExtDefaultWorkgroupTileSizes(
cast<TilingInterface>(op.getOperation()), defaultWorkgroupTileSize);

  // The default function aims to return the number of workloads per workgroup,
  // but it does not know that it is working on a packed domain. We need to take
Expand Down Expand Up @@ -1702,14 +1707,16 @@ static LogicalResult setRootConfigImpl(
return setRootConfig(entryPointFn, op, LinalgOpInfo(op),
targetMLTransInfo);
})
.Case<IREE::LinalgExt::FftOp, IREE::LinalgExt::PackOp,
IREE::LinalgExt::UnPackOp, linalg::Mmt4DOp,
linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp,
linalg::PoolingNhwcSumOp, linalg::PoolingNhwcMaxOp,
linalg::PoolingNhwcMaxUnsignedOp, linalg::PoolingNhwcMinOp,
linalg::PoolingNhwcMinUnsignedOp, linalg::PoolingNchwSumOp,
linalg::PoolingNchwMaxOp, linalg::DepthwiseConv2DNhwcHwcOp>(
.Case<IREE::LinalgExt::FftOp, IREE::LinalgExt::UnPackOp,
linalg::Mmt4DOp, linalg::Conv2DNhwcHwcfOp,
linalg::Conv2DNchwFchwOp, linalg::PoolingNhwcSumOp,
linalg::PoolingNhwcMaxOp, linalg::PoolingNhwcMaxUnsignedOp,
linalg::PoolingNhwcMinOp, linalg::PoolingNhwcMinUnsignedOp,
linalg::PoolingNchwSumOp, linalg::PoolingNchwMaxOp,
linalg::DepthwiseConv2DNhwcHwcOp>(
[&](auto op) { return setRootConfig(entryPointFn, op); })
.Case<IREE::LinalgExt::PackOp, tensor::PackOp>(
[&](auto op) { return setPackOpRootConfig(entryPointFn, op); })
.Case<linalg::ContractionOpInterface>(
[&](auto op) { return setRootConfig(entryPointFn, op); })
.Case<linalg::LinalgOp>(
Expand Down
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
Expand Up @@ -665,6 +665,8 @@ void addCPUDataTilingPipeline(OpPassManager &passManager) {
OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
IREE::LinalgExt::createLinalgExtVectorizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createVectorizePackUnPackOpsPass());
addBufferizePasses(nestedModulePM);
nestedModulePM.addNestedPass<func::FuncOp>(
createSplitFullPartialTransferPass("linalg-copy"));
Expand Down
Expand Up @@ -196,7 +196,9 @@ static bool isRootOp(Operation *op) {
}
return !isa<linalg::FillOp>(op);
}
return isa<TilingInterface>(op) ||
  // tensor::PadOp fusion is not ready. Explicitly mark it as not a root op for
  // now.
return (isa<TilingInterface>(op) && !isa<tensor::PadOp>(op)) ||
isa<LinalgExt::SetEncodingOp, LinalgExt::UnsetEncodingOp>(op);
}

Expand Down Expand Up @@ -676,7 +678,8 @@ static unsigned decideFusableLinalgOps(FunctionOpInterface funcOp,
      // Only look for Linalg and tensor.pack ops here. Avoid moving
      // `linalg.fill` ops that aren't fused with anything else into their own
      // dispatches, since it is better to convert them to splats.
if (!isa<linalg::LinalgOp>(op) || isa<linalg::FillOp>(op)) continue;
if (!isa<linalg::LinalgOp, tensor::PackOp>(op) || isa<linalg::FillOp>(op))
continue;

unsigned newGroup = numRootOps++;
setRootAttribute(context, &op, newGroup);
Expand Down
Expand Up @@ -39,7 +39,8 @@ struct RewriteInitTensorToSplat : public OpRewritePattern<tensor::EmptyOp> {
LogicalResult matchAndRewrite(tensor::EmptyOp emptyTensorOp,
PatternRewriter &rewriter) const override {
if (llvm::all_of(emptyTensorOp->getUsers(), [](Operation *user) -> bool {
return isa<linalg::LinalgOp, LinalgExt::LinalgExtOp>(user);
return isa<linalg::LinalgOp, LinalgExt::LinalgExtOp, tensor::PackOp>(
user);
})) {
return failure();
}
Expand All @@ -66,7 +67,8 @@ struct RewriteInitTensorToEmpty : public OpRewritePattern<tensor::EmptyOp> {
LogicalResult matchAndRewrite(tensor::EmptyOp emptyTensorOp,
PatternRewriter &rewriter) const override {
if (llvm::all_of(emptyTensorOp->getUsers(), [](Operation *user) -> bool {
return isa<linalg::LinalgOp, LinalgExt::LinalgExtOp>(user);
return isa<linalg::LinalgOp, LinalgExt::LinalgExtOp, tensor::PackOp>(
user);
})) {
return failure();
}
Expand Down
3 changes: 3 additions & 0 deletions tests/e2e/tensor_ops/BUILD
Expand Up @@ -38,6 +38,7 @@ iree_check_single_backend_test_suite(
# keep sorted
[
"extract_slice.mlir",
"pack.mlir",
"tensor_insert_slice.mlir",
],
include = ["*.mlir"],
Expand All @@ -59,6 +60,7 @@ iree_check_single_backend_test_suite(
],
include = ["*.mlir"],
exclude = [
"pack.mlir",
"tensor_cast.mlir",
],
),
Expand All @@ -83,6 +85,7 @@ iree_check_single_backend_test_suite(
],
include = ["*.mlir"],
exclude = [
"pack.mlir",
"tensor_cast.mlir",
],
),
Expand Down
1 change: 1 addition & 0 deletions tests/e2e/tensor_ops/CMakeLists.txt
Expand Up @@ -31,6 +31,7 @@ iree_check_single_backend_test_suite(
check_llvm-cpu_local-task
SRCS
"extract_slice.mlir"
"pack.mlir"
"tensor_insert_slice.mlir"
TARGET_BACKEND
"llvm-cpu"
Expand Down

0 comments on commit 60a8701

Please sign in to comment.