Skip to content

Commit

Permalink
Plumb through tensor.pack e2e execution for llvm-cpu backend. (#11875)
Browse files Browse the repository at this point in the history
All tensor.pack ops with static inner_tile_sizes are vectorized; all of these cases are covered by e2e tests.
  • Loading branch information
hanhanW authored and jpienaar committed May 1, 2023
1 parent 7f37bbf commit 60a8701
Show file tree
Hide file tree
Showing 8 changed files with 536 additions and 16 deletions.
Expand Up @@ -94,7 +94,7 @@ struct OuterParallelAsPartitionableLoops
// loops, but that needs the interface to return the static sizes of the
// loops.
SmallVector<unsigned> partitionableLoops;
auto interfaceOp = cast<OpTy>(op);
auto interfaceOp = cast<TilingInterface>(op);
for (auto [index, iteratorType] :
llvm::enumerate(interfaceOp.getLoopIteratorTypes())) {
if (iteratorType != utils::IteratorType::parallel) {
Expand Down Expand Up @@ -241,6 +241,10 @@ void registerPartitionableLoopsInterfaceModels(DialectRegistry &registry) {
IREE::LinalgExt::AttentionOp::attachInterface<
AllParallelAsPartitionableLoops<IREE::LinalgExt::AttentionOp>>(*ctx);
});
registry.addExtension(+[](MLIRContext *ctx, tensor::TensorDialect *dialect) {
tensor::PackOp::attachInterface<
OuterParallelAsPartitionableLoops<tensor::PackOp>>(*ctx);
});
}

} // namespace iree_compiler
Expand Down
29 changes: 18 additions & 11 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
Expand Up @@ -1068,10 +1068,15 @@ static SmallVector<int64_t> getLinalgExtDefaultWorkgroupTileSizes(
return workgroupTileSizes;
}

static LogicalResult setRootConfig(func::FuncOp entryPointFn,
IREE::LinalgExt::PackOp op) {
SmallVector<int64_t> tileSizes =
getLinalgExtDefaultWorkgroupTileSizes(op, defaultWorkgroupTileSize);
template <typename OpTy>
static LogicalResult setPackOpRootConfig(func::FuncOp entryPointFn, OpTy op) {
// TODO(hanchung): Retire IREE::LinalgExt::PackOp. This is for having
// consistent configurations for pack ops.
static_assert(
llvm::is_one_of<OpTy, IREE::LinalgExt::PackOp, tensor::PackOp>::value,
"applies to only pack operations");
SmallVector<int64_t> tileSizes = getLinalgExtDefaultWorkgroupTileSizes(
cast<TilingInterface>(op.getOperation()), defaultWorkgroupTileSize);

  // The default function aims to return the number of workloads per workgroup,
  // but it does not know that it is working on a packed domain. We need to take
Expand Down Expand Up @@ -1702,14 +1707,16 @@ static LogicalResult setRootConfigImpl(
return setRootConfig(entryPointFn, op, LinalgOpInfo(op),
targetMLTransInfo);
})
.Case<IREE::LinalgExt::FftOp, IREE::LinalgExt::PackOp,
IREE::LinalgExt::UnPackOp, linalg::Mmt4DOp,
linalg::Conv2DNhwcHwcfOp, linalg::Conv2DNchwFchwOp,
linalg::PoolingNhwcSumOp, linalg::PoolingNhwcMaxOp,
linalg::PoolingNhwcMaxUnsignedOp, linalg::PoolingNhwcMinOp,
linalg::PoolingNhwcMinUnsignedOp, linalg::PoolingNchwSumOp,
linalg::PoolingNchwMaxOp, linalg::DepthwiseConv2DNhwcHwcOp>(
.Case<IREE::LinalgExt::FftOp, IREE::LinalgExt::UnPackOp,
linalg::Mmt4DOp, linalg::Conv2DNhwcHwcfOp,
linalg::Conv2DNchwFchwOp, linalg::PoolingNhwcSumOp,
linalg::PoolingNhwcMaxOp, linalg::PoolingNhwcMaxUnsignedOp,
linalg::PoolingNhwcMinOp, linalg::PoolingNhwcMinUnsignedOp,
linalg::PoolingNchwSumOp, linalg::PoolingNchwMaxOp,
linalg::DepthwiseConv2DNhwcHwcOp>(
[&](auto op) { return setRootConfig(entryPointFn, op); })
.Case<IREE::LinalgExt::PackOp, tensor::PackOp>(
[&](auto op) { return setPackOpRootConfig(entryPointFn, op); })
.Case<linalg::ContractionOpInterface>(
[&](auto op) { return setRootConfig(entryPointFn, op); })
.Case<linalg::LinalgOp>(
Expand Down
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
Expand Up @@ -665,6 +665,8 @@ void addCPUDataTilingPipeline(OpPassManager &passManager) {
OpPassManager &nestedModulePM = passManager.nest<ModuleOp>();
nestedModulePM.addNestedPass<func::FuncOp>(
IREE::LinalgExt::createLinalgExtVectorizationPass());
nestedModulePM.addNestedPass<func::FuncOp>(
createVectorizePackUnPackOpsPass());
addBufferizePasses(nestedModulePM);
nestedModulePM.addNestedPass<func::FuncOp>(
createSplitFullPartialTransferPass("linalg-copy"));
Expand Down
Expand Up @@ -196,7 +196,9 @@ static bool isRootOp(Operation *op) {
}
return !isa<linalg::FillOp>(op);
}
return isa<TilingInterface>(op) ||
  // tensor::PadOp fusion is not ready. Explicitly mark it as not a root op for
  // now.
return (isa<TilingInterface>(op) && !isa<tensor::PadOp>(op)) ||
isa<LinalgExt::SetEncodingOp, LinalgExt::UnsetEncodingOp>(op);
}

Expand Down Expand Up @@ -676,7 +678,8 @@ static unsigned decideFusableLinalgOps(FunctionOpInterface funcOp,
      // Only look for Linalg and tensor.pack ops here. Avoid moving
      // `linalg.fill` ops that aren't fused with anything else into their own
      // dispatches, since it is better to convert them to splats.
if (!isa<linalg::LinalgOp>(op) || isa<linalg::FillOp>(op)) continue;
if (!isa<linalg::LinalgOp, tensor::PackOp>(op) || isa<linalg::FillOp>(op))
continue;

unsigned newGroup = numRootOps++;
setRootAttribute(context, &op, newGroup);
Expand Down
Expand Up @@ -39,7 +39,8 @@ struct RewriteInitTensorToSplat : public OpRewritePattern<tensor::EmptyOp> {
LogicalResult matchAndRewrite(tensor::EmptyOp emptyTensorOp,
PatternRewriter &rewriter) const override {
if (llvm::all_of(emptyTensorOp->getUsers(), [](Operation *user) -> bool {
return isa<linalg::LinalgOp, LinalgExt::LinalgExtOp>(user);
return isa<linalg::LinalgOp, LinalgExt::LinalgExtOp, tensor::PackOp>(
user);
})) {
return failure();
}
Expand All @@ -66,7 +67,8 @@ struct RewriteInitTensorToEmpty : public OpRewritePattern<tensor::EmptyOp> {
LogicalResult matchAndRewrite(tensor::EmptyOp emptyTensorOp,
PatternRewriter &rewriter) const override {
if (llvm::all_of(emptyTensorOp->getUsers(), [](Operation *user) -> bool {
return isa<linalg::LinalgOp, LinalgExt::LinalgExtOp>(user);
return isa<linalg::LinalgOp, LinalgExt::LinalgExtOp, tensor::PackOp>(
user);
})) {
return failure();
}
Expand Down
3 changes: 3 additions & 0 deletions tests/e2e/tensor_ops/BUILD
Expand Up @@ -38,6 +38,7 @@ iree_check_single_backend_test_suite(
# keep sorted
[
"extract_slice.mlir",
"pack.mlir",
"tensor_insert_slice.mlir",
],
include = ["*.mlir"],
Expand All @@ -59,6 +60,7 @@ iree_check_single_backend_test_suite(
],
include = ["*.mlir"],
exclude = [
"pack.mlir",
"tensor_cast.mlir",
],
),
Expand All @@ -83,6 +85,7 @@ iree_check_single_backend_test_suite(
],
include = ["*.mlir"],
exclude = [
"pack.mlir",
"tensor_cast.mlir",
],
),
Expand Down
1 change: 1 addition & 0 deletions tests/e2e/tensor_ops/CMakeLists.txt
Expand Up @@ -31,6 +31,7 @@ iree_check_single_backend_test_suite(
check_llvm-cpu_local-task
SRCS
"extract_slice.mlir"
"pack.mlir"
"tensor_insert_slice.mlir"
TARGET_BACKEND
"llvm-cpu"
Expand Down

0 comments on commit 60a8701

Please sign in to comment.