Skip to content

Commit

Permalink
Add an experimental pipeline for conv vector distribution. (#16789)
Browse files Browse the repository at this point in the history
It excludes the filter dimensions from the first level of tiling; promotes
the convolution images, and tiles the filter in a second level.
  • Loading branch information
hanhanW authored and antiagainst committed Mar 19, 2024
1 parent d523174 commit c5150c3
Show file tree
Hide file tree
Showing 9 changed files with 221 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def LLVMGPU_VectorDistribute
: I32EnumAttrCase<"LLVMGPUVectorDistribute", 110>;
def LLVMGPU_ImplicitGEMM
: I32EnumAttrCase<"LLVMGPUImplicitGEMM", 111>;
def LLVMGPU_ConvVectorDistribute
: I32EnumAttrCase<"LLVMGPUConvVectorDistribute", 112>;

def SPIRV_BaseLowering
: I32EnumAttrCase<"SPIRVBaseLowering", 200>;
Expand Down Expand Up @@ -92,7 +94,7 @@ def DispatchLoweringPassPipelineEnum : I32EnumAttr<
LLVMGPU_Vectorize, LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore,
LLVMGPU_TransposeSharedMem, LLVMGPU_WarpReduction, LLVMGPU_PackUnPack,
LLVMGPU_MatmulTensorCoreMmaSync, LLVMGPU_VectorDistribute,
LLVMGPU_ImplicitGEMM,
LLVMGPU_ImplicitGEMM, LLVMGPU_ConvVectorDistribute,

// SPIR-V CodeGen pipelines
SPIRV_BaseLowering, SPIRV_BaseDistribute, SPIRV_BaseVectorize,
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ iree_compiler_cc_library(
"LLVMGPUPackSharedMemoryAlloc.cpp",
"LLVMGPUPadIGemm.cpp",
"LLVMGPUPrefetching.cpp",
"LLVMGPUPromoteConvImgAndTileFilter.cpp",
"LLVMGPURewritePadInDestinationPassingStyle.cpp",
"LLVMGPUSelectLoweringStrategy.cpp",
"LLVMGPUTensorCoreVectorization.cpp",
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ iree_cc_library(
"LLVMGPUPackSharedMemoryAlloc.cpp"
"LLVMGPUPadIGemm.cpp"
"LLVMGPUPrefetching.cpp"
"LLVMGPUPromoteConvImgAndTileFilter.cpp"
"LLVMGPURewritePadInDestinationPassingStyle.cpp"
"LLVMGPUSelectLoweringStrategy.cpp"
"LLVMGPUTensorCoreVectorization.cpp"
Expand Down
11 changes: 9 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -441,8 +441,15 @@ setConvolutionVectorDistributionConfig(mlir::FunctionOpInterface entryPoint,
workgroupTileSizes[filterDim] = 1;
}

TileSizesListType tileSizes;
tileSizes.push_back(workgroupTileSizes);
SmallVector<int64_t> fstLevelTileSizes = workgroupTileSizes;
for (int64_t filterDim : convolutionDims->filterLoop) {
fstLevelTileSizes[filterDim] = 0;
}
SmallVector<int64_t> sndLevelTileSizes(fstLevelTileSizes.size(), 0);
for (int64_t filterDim : convolutionDims->filterLoop) {
sndLevelTileSizes[filterDim] = workgroupTileSizes[filterDim];
}
TileSizesListType tileSizes = {fstLevelTileSizes, sndLevelTileSizes};

// Attach the MMA schedule as an attribute to the entry point export function
// for later access in the pipeline.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ void LLVMGPULowerExecutableTargetPass::runOnOperation() {
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUVectorDistribute:
addGPUVectorDistributePassPipeline(pipeline);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUConvVectorDistribute:
addGPUConvVectorDistributePassPipeline(pipeline);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUWarpReduction:
addGPUWarpReductionPassPipeline(pipeline);
break;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#define DEBUG_TYPE "iree-llvmgpu-promote-conv-img"

namespace mlir::iree_compiler {

namespace {
struct LLVMGPUPromoteConvImgAndTileFilterPass
    : public LLVMGPUPromoteConvImgAndTileFilterBase<
          LLVMGPUPromoteConvImgAndTileFilterPass> {
  void getDependentDialects(DialectRegistry &registry) const override {}

  // Pads the single conv_2d_nhwc_hwcf op in the function (pack-padding, i.e.
  // promoting, only the image operand), then tiles the padded conv using the
  // level-1 tile sizes from its lowering_config, and finally runs tiling
  // canonicalization patterns to clean up.
  void runOnOperation() override {
    MLIRContext *ctx = &getContext();
    auto funcOp = getOperation();

    // Locate the convolution; this pass only handles exactly one
    // conv_2d_nhwc_hwcf per function. A second match interrupts the walk.
    linalg::LinalgOp conv;
    auto found = funcOp->walk([&](linalg::Conv2DNhwcHwcfOp op) {
      if (conv) {
        return WalkResult::interrupt();
      }
      conv = op;
      return WalkResult::advance();
    });
    if (found.wasInterrupted()) {
      LLVM_DEBUG(llvm::dbgs() << "skip, expect a single conv\n");
      return;
    }
    // Fix: the original fell through with a null `conv` when the function
    // contained no convolution at all, crashing on the first use below.
    if (!conv) {
      LLVM_DEBUG(llvm::dbgs() << "skip, no conv found\n");
      return;
    }

    LLVM_DEBUG(llvm::dbgs() << "candidate: " << conv << "\n");
    IRRewriter rewriter(ctx);
    // Pad the first four iteration dims; only operand 0 (the image) is
    // pack-padded, which materializes the promotion.
    // NOTE(review): paddingDims = {0, 1, 2, 3} assumes these are the dims that
    // need promotion in the conv's iteration space -- confirm against the
    // op's indexing maps.
    SmallVector<int64_t> paddingDims = {0, 1, 2, 3};
    SmallVector<bool> packPaddings = {1, 0, 0};
    SmallVector<int64_t> padToMultipleOf(paddingDims.size(), 1);
    SmallVector<Attribute> paddingValueAttributes;
    for (auto &operand : conv->getOpOperands()) {
      auto elemType = getElementTypeOrSelf(operand.get().getType());
      paddingValueAttributes.push_back(rewriter.getZeroAttr(elemType));
    }

    auto options =
        linalg::LinalgPaddingOptions()
            .setPaddingDimensions(paddingDims)
            .setPaddingValues(paddingValueAttributes)
            .setPadToMultipleOf(padToMultipleOf)
            .setPackPaddings(packPaddings)
            .setCopyBackOp(linalg::LinalgPaddingOptions::CopyBackOp::None);
    linalg::LinalgOp paddedOp;
    SmallVector<Value> replacements;
    SmallVector<tensor::PadOp> newPadOps;
    if (failed(rewriteAsPaddedOp(rewriter, conv, options, paddedOp,
                                 replacements, newPadOps))) {
      LLVM_DEBUG(llvm::dbgs() << "failed to pad op " << conv << "\n");
      return signalPassFailure();
    }
    rewriter.replaceOp(conv, replacements);

    // tile filter
    {
      // `conv` now points at the erased op; re-walk to find the padded conv.
      // Fix: clear the handle first so a walk that finds nothing cannot leave
      // a dangling op in use below.
      conv = nullptr;
      funcOp->walk([&](linalg::Conv2DNhwcHwcfOp op) { conv = op; });
      if (!conv) {
        LLVM_DEBUG(llvm::dbgs() << "skip tiling, no conv after padding\n");
        return;
      }
      FailureOr<IREE::Codegen::LoweringConfigAttr> loweringConfig =
          getLoweringConfig(conv);
      if (failed(loweringConfig)) {
        LLVM_DEBUG(llvm::dbgs()
                   << "skip tiling because there are no lowering_config\n");
        return;
      }

      IRRewriter rewriter(ctx);
      // Level-1 tile sizes carry the filter-loop tiling; level 0 zeroes the
      // filter dims (see setConvolutionVectorDistributionConfig).
      SmallVector<OpFoldResult> tileSizes = llvm::map_to_vector(
          loweringConfig->getTileSizeVals(1), [&](int64_t val) -> OpFoldResult {
            return rewriter.getIndexAttr(val);
          });
      auto options = scf::SCFTilingOptions().setTileSizes(tileSizes);
      FailureOr<scf::SCFTilingResult> tilingResult = scf::tileUsingSCF(
          rewriter, cast<TilingInterface>(conv.getOperation()), options);
      if (failed(tilingResult))
        return signalPassFailure();
      rewriter.replaceOp(conv, tilingResult->replacements);
    }

    // Canonicalize tiled ops to fold away trivial slices and dim ops.
    {
      RewritePatternSet patterns(ctx);
      linalg::populateLinalgTilingCanonicalizationPatterns(patterns);
      memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
      ctx->getOrLoadDialect<tensor::TensorDialect>()
          ->getCanonicalizationPatterns(patterns);
      if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
        return signalPassFailure();
      }
    }
  }
};
} // namespace

// Factory for the pass that promotes the conv image operand and tiles the
// filter loops.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUPromoteConvImgAndTileFilterPass() {
  auto pass = std::make_unique<LLVMGPUPromoteConvImgAndTileFilterPass>();
  return pass;
}

} // namespace mlir::iree_compiler
81 changes: 81 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,87 @@ void addGPUVectorDistributePassPipeline(OpPassManager &pm) {
nestedModulePM.addPass(createCSEPass());
}

// Experimental pipeline lowering convolutions through vector distribution:
// workgroup distribution -> reduction tiling + image promotion / filter
// tiling -> generalization and unit-dim folding -> vectorization ->
// bufferization -> SIMT distribution -> shared-memory copy lowering.
void addGPUConvVectorDistributePassPipeline(OpPassManager &pm) {
  tileAndDistributeToWorkgroup(pm);
  auto &nestedModulePM = pm.nest<ModuleOp>();
  // Swizzle workgroup order only for entry points that carry an mma_schedule
  // in their translation-info configuration.
  nestedModulePM.addNestedPass<func::FuncOp>(
      createReorderWorkgroups(clLogSwizzleTile, [](FunctionOpInterface funcOp) {
        auto entryPoint = getEntryPoint(funcOp);
        if (failed(entryPoint))
          return failure();
        IREE::Codegen::TranslationInfoAttr transInfo =
            getTranslationInfo(*entryPoint);
        if (!transInfo)
          return failure();
        DictionaryAttr config = transInfo.getConfiguration();
        // Fix: the configuration dictionary may be absent; calling
        // contains() on a null DictionaryAttr crashes.
        if (!config)
          return failure();
        if (config.contains("mma_schedule"))
          return success();
        return failure();
      }));
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());

  // Problem specific (reduction) tiling.
  nestedModulePM.addNestedPass<func::FuncOp>(
      createGPUTensorTileToSerialLoops(true));
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPUPromoteConvImgAndTileFilterPass());

  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPURewritePadInDestinationPassingStylePass());

  // Generalize all named ops so that we can fold away unit extent dims. By this
  // point, all tiling is finished so the tiling configurations on those ops can
  // be safely dropped. This additionally allows vectorization of convolution to
  // `vector.contract` as filter dimensions are expected to be tiled to 1 by
  // this point.
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLinalgGeneralizeNamedOpsPass());
  LinalgFoldUnitExtentDimsPassOptions options;
  options.useRankReducingSlices = true;
  nestedModulePM.addNestedPass<func::FuncOp>(
      mlir::createLinalgFoldUnitExtentDimsPass(options));
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());

  nestedModulePM.addNestedPass<func::FuncOp>(
      createOptimizeTensorInsertExtractSlicesPass());

  // Linalg -> Vector
  addGPUVectorizationPasses(nestedModulePM);

  // Tensor -> Memref
  addVectorBufferizePasses(nestedModulePM);
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createHoistStaticallyBoundAllocationsPass());

  // Vector SIMD -> Vector SIMT
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPUNormalizeContractMapsPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPUCastTypeToFitMMAPass());
  nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUVectorDistribute());
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());

  // Lower remaining memref copies (shared-memory staging) through linalg and
  // distribute them across the workgroup.
  nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createGPUDistributeSharedMemoryCopy());

  nestedModulePM.addNestedPass<func::FuncOp>(
      createGPUReduceSharedMemoryBankConflicts());

  nestedModulePM.addNestedPass<func::FuncOp>(
      memref::createFoldMemRefAliasOpsPass());
  nestedModulePM.addPass(createCSEPass());
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());
}

void addGPUWarpReductionPassPipeline(OpPassManager &pm) {
tileAndDistributeToWorkgroup(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
Expand Down
4 changes: 4 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ void addGPUVectorizationPassPipeline(OpPassManager &pm);

/// Lowering based on vector distribution patterns.
void addGPUVectorDistributePassPipeline(OpPassManager &pm);
void addGPUConvVectorDistributePassPipeline(OpPassManager &pm);

/// Lowering reductions to warp reductions.
void addGPUWarpReductionPassPipeline(OpPassManager &pm);
Expand Down Expand Up @@ -184,6 +185,9 @@ verifyGPUMatmulPipeline(Operation *op,
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createAMDGPUPrepareForChainedMatmulPass();

std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUPromoteConvImgAndTileFilterPass();

//----------------------------------------------------------------------------//
// Register LLVMGPU Passes
//----------------------------------------------------------------------------//
Expand Down
6 changes: 6 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ def LLVMGPUPrefetchSharedMemory :
let constructor = "mlir::iree_compiler::createLLVMGPUPrefetchSharedMemoryPass()";
}

def LLVMGPUPromoteConvImgAndTileFilter :
    InterfacePass<"iree-llvmgpu-promote-conv-img-and-tile-filter", "mlir::FunctionOpInterface"> {
  // Grammar fix on the user-visible summary text only; pass id and
  // constructor are unchanged.
  let summary = "Pass to promote the convolution image operand and tile the filter with tilingLevel=1.";
  let constructor = "mlir::iree_compiler::createLLVMGPUPromoteConvImgAndTileFilterPass()";
}

def LLVMGPUSelectLoweringStrategy :
Pass<"iree-llvmgpu-select-lowering-strategy", "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
let summary = "Select a IREE::HAL::DispatchLoweringPassPipeline for lowering the target variant";
Expand Down

0 comments on commit c5150c3

Please sign in to comment.