Skip to content

Commit

Permalink
Add an experimental pipeline for conv vector distribution. (#16789)
Browse files Browse the repository at this point in the history
It excludes the filter dimensions from the first level of tiling; promotes
the convolution images, and tiles the filter in a second level.
  • Loading branch information
hanhanW authored and antiagainst committed Mar 19, 2024
1 parent d523174 commit c5150c3
Show file tree
Hide file tree
Showing 9 changed files with 221 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ def LLVMGPU_VectorDistribute
: I32EnumAttrCase<"LLVMGPUVectorDistribute", 110>;
def LLVMGPU_ImplicitGEMM
: I32EnumAttrCase<"LLVMGPUImplicitGEMM", 111>;
def LLVMGPU_ConvVectorDistribute
: I32EnumAttrCase<"LLVMGPUConvVectorDistribute", 112>;

def SPIRV_BaseLowering
: I32EnumAttrCase<"SPIRVBaseLowering", 200>;
Expand Down Expand Up @@ -92,7 +94,7 @@ def DispatchLoweringPassPipelineEnum : I32EnumAttr<
LLVMGPU_Vectorize, LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore,
LLVMGPU_TransposeSharedMem, LLVMGPU_WarpReduction, LLVMGPU_PackUnPack,
LLVMGPU_MatmulTensorCoreMmaSync, LLVMGPU_VectorDistribute,
LLVMGPU_ImplicitGEMM,
LLVMGPU_ImplicitGEMM, LLVMGPU_ConvVectorDistribute,

// SPIR-V CodeGen pipelines
SPIRV_BaseLowering, SPIRV_BaseDistribute, SPIRV_BaseVectorize,
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ iree_compiler_cc_library(
"LLVMGPUPackSharedMemoryAlloc.cpp",
"LLVMGPUPadIGemm.cpp",
"LLVMGPUPrefetching.cpp",
"LLVMGPUPromoteConvImgAndTileFilter.cpp",
"LLVMGPURewritePadInDestinationPassingStyle.cpp",
"LLVMGPUSelectLoweringStrategy.cpp",
"LLVMGPUTensorCoreVectorization.cpp",
Expand Down
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ iree_cc_library(
"LLVMGPUPackSharedMemoryAlloc.cpp"
"LLVMGPUPadIGemm.cpp"
"LLVMGPUPrefetching.cpp"
"LLVMGPUPromoteConvImgAndTileFilter.cpp"
"LLVMGPURewritePadInDestinationPassingStyle.cpp"
"LLVMGPUSelectLoweringStrategy.cpp"
"LLVMGPUTensorCoreVectorization.cpp"
Expand Down
11 changes: 9 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -441,8 +441,15 @@ setConvolutionVectorDistributionConfig(mlir::FunctionOpInterface entryPoint,
workgroupTileSizes[filterDim] = 1;
}

TileSizesListType tileSizes;
tileSizes.push_back(workgroupTileSizes);
SmallVector<int64_t> fstLevelTileSizes = workgroupTileSizes;
for (int64_t filterDim : convolutionDims->filterLoop) {
fstLevelTileSizes[filterDim] = 0;
}
SmallVector<int64_t> sndLevelTileSizes(fstLevelTileSizes.size(), 0);
for (int64_t filterDim : convolutionDims->filterLoop) {
sndLevelTileSizes[filterDim] = workgroupTileSizes[filterDim];
}
TileSizesListType tileSizes = {fstLevelTileSizes, sndLevelTileSizes};

// Attach the MMA schedule as an attribute to the entry point export function
// for later access in the pipeline.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ void LLVMGPULowerExecutableTargetPass::runOnOperation() {
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUVectorDistribute:
addGPUVectorDistributePassPipeline(pipeline);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUConvVectorDistribute:
addGPUConvVectorDistributePassPipeline(pipeline);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUWarpReduction:
addGPUWarpReductionPassPipeline(pipeline);
break;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/LLVMGPU/PassDetail.h"
#include "iree/compiler/Codegen/LLVMGPU/Passes.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#define DEBUG_TYPE "iree-llvmgpu-promote-conv-img"

namespace mlir::iree_compiler {

namespace {
struct LLVMGPUPromoteConvImgAndTileFilterPass
    : public LLVMGPUPromoteConvImgAndTileFilterBase<
          LLVMGPUPromoteConvImgAndTileFilterPass> {
  void getDependentDialects(DialectRegistry &registry) const override {}

  // Pads the single conv_2d_nhwc_hwcf op in the function (pack-padding, i.e.
  // promoting, only the image operand), then tiles the padded conv using the
  // level-1 tile sizes from its lowering_config, and finally runs tiling
  // canonicalization patterns to clean up.
  void runOnOperation() override {
    MLIRContext *ctx = &getContext();
    auto funcOp = getOperation();

    // Locate the convolution; this pass only handles exactly one
    // conv_2d_nhwc_hwcf per function. A second match interrupts the walk.
    linalg::LinalgOp conv;
    auto found = funcOp->walk([&](linalg::Conv2DNhwcHwcfOp op) {
      if (conv) {
        return WalkResult::interrupt();
      }
      conv = op;
      return WalkResult::advance();
    });
    if (found.wasInterrupted()) {
      LLVM_DEBUG(llvm::dbgs() << "skip, expect a single conv\n");
      return;
    }
    // Fix: the original fell through with a null `conv` when the function
    // contained no convolution at all, crashing on the first use below.
    if (!conv) {
      LLVM_DEBUG(llvm::dbgs() << "skip, no conv found\n");
      return;
    }

    LLVM_DEBUG(llvm::dbgs() << "candidate: " << conv << "\n");
    IRRewriter rewriter(ctx);
    // Pad the first four iteration dims; only operand 0 (the image) is
    // pack-padded, which materializes the promotion.
    // NOTE(review): paddingDims = {0, 1, 2, 3} assumes these are the dims that
    // need promotion in the conv's iteration space -- confirm against the
    // op's indexing maps.
    SmallVector<int64_t> paddingDims = {0, 1, 2, 3};
    SmallVector<bool> packPaddings = {1, 0, 0};
    SmallVector<int64_t> padToMultipleOf(paddingDims.size(), 1);
    SmallVector<Attribute> paddingValueAttributes;
    for (auto &operand : conv->getOpOperands()) {
      auto elemType = getElementTypeOrSelf(operand.get().getType());
      paddingValueAttributes.push_back(rewriter.getZeroAttr(elemType));
    }

    auto options =
        linalg::LinalgPaddingOptions()
            .setPaddingDimensions(paddingDims)
            .setPaddingValues(paddingValueAttributes)
            .setPadToMultipleOf(padToMultipleOf)
            .setPackPaddings(packPaddings)
            .setCopyBackOp(linalg::LinalgPaddingOptions::CopyBackOp::None);
    linalg::LinalgOp paddedOp;
    SmallVector<Value> replacements;
    SmallVector<tensor::PadOp> newPadOps;
    if (failed(rewriteAsPaddedOp(rewriter, conv, options, paddedOp,
                                 replacements, newPadOps))) {
      LLVM_DEBUG(llvm::dbgs() << "failed to pad op " << conv << "\n");
      return signalPassFailure();
    }
    rewriter.replaceOp(conv, replacements);

    // tile filter
    {
      // `conv` now points at the erased op; re-walk to find the padded conv.
      // Fix: clear the handle first so a walk that finds nothing cannot leave
      // a dangling op in use below.
      conv = nullptr;
      funcOp->walk([&](linalg::Conv2DNhwcHwcfOp op) { conv = op; });
      if (!conv) {
        LLVM_DEBUG(llvm::dbgs() << "skip tiling, no conv after padding\n");
        return;
      }
      FailureOr<IREE::Codegen::LoweringConfigAttr> loweringConfig =
          getLoweringConfig(conv);
      if (failed(loweringConfig)) {
        LLVM_DEBUG(llvm::dbgs()
                   << "skip tiling because there are no lowering_config\n");
        return;
      }

      IRRewriter rewriter(ctx);
      // Level-1 tile sizes carry the filter-loop tiling; level 0 zeroes the
      // filter dims (see setConvolutionVectorDistributionConfig).
      SmallVector<OpFoldResult> tileSizes = llvm::map_to_vector(
          loweringConfig->getTileSizeVals(1), [&](int64_t val) -> OpFoldResult {
            return rewriter.getIndexAttr(val);
          });
      auto options = scf::SCFTilingOptions().setTileSizes(tileSizes);
      FailureOr<scf::SCFTilingResult> tilingResult = scf::tileUsingSCF(
          rewriter, cast<TilingInterface>(conv.getOperation()), options);
      if (failed(tilingResult))
        return signalPassFailure();
      rewriter.replaceOp(conv, tilingResult->replacements);
    }

    // Canonicalize tiled ops to fold away trivial slices and dim ops.
    {
      RewritePatternSet patterns(ctx);
      linalg::populateLinalgTilingCanonicalizationPatterns(patterns);
      memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
      ctx->getOrLoadDialect<tensor::TensorDialect>()
          ->getCanonicalizationPatterns(patterns);
      if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
        return signalPassFailure();
      }
    }
  }
};
} // namespace

// Factory for the pass that promotes the conv image operand and tiles the
// filter loops.
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUPromoteConvImgAndTileFilterPass() {
  auto pass = std::make_unique<LLVMGPUPromoteConvImgAndTileFilterPass>();
  return pass;
}

} // namespace mlir::iree_compiler
81 changes: 81 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,87 @@ void addGPUVectorDistributePassPipeline(OpPassManager &pm) {
nestedModulePM.addPass(createCSEPass());
}

// Experimental pipeline lowering convolutions through vector distribution:
// workgroup distribution -> reduction tiling + image promotion / filter
// tiling -> generalization and unit-dim folding -> vectorization ->
// bufferization -> SIMT distribution -> shared-memory copy lowering.
void addGPUConvVectorDistributePassPipeline(OpPassManager &pm) {
  tileAndDistributeToWorkgroup(pm);
  auto &nestedModulePM = pm.nest<ModuleOp>();
  // Swizzle workgroup order only for entry points that carry an mma_schedule
  // in their translation-info configuration.
  nestedModulePM.addNestedPass<func::FuncOp>(
      createReorderWorkgroups(clLogSwizzleTile, [](FunctionOpInterface funcOp) {
        auto entryPoint = getEntryPoint(funcOp);
        if (failed(entryPoint))
          return failure();
        IREE::Codegen::TranslationInfoAttr transInfo =
            getTranslationInfo(*entryPoint);
        if (!transInfo)
          return failure();
        DictionaryAttr config = transInfo.getConfiguration();
        // Fix: the configuration dictionary may be absent; calling
        // contains() on a null DictionaryAttr crashes.
        if (!config)
          return failure();
        if (config.contains("mma_schedule"))
          return success();
        return failure();
      }));
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());

  // Problem specific (reduction) tiling.
  nestedModulePM.addNestedPass<func::FuncOp>(
      createGPUTensorTileToSerialLoops(true));
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPUPromoteConvImgAndTileFilterPass());

  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPURewritePadInDestinationPassingStylePass());

  // Generalize all named ops so that we can fold away unit extent dims. By this
  // point, all tiling is finished so the tiling configurations on those ops can
  // be safely dropped. This additionally allows vectorization of convolution to
  // `vector.contract` as filter dimensions are expected to be tiled to 1 by
  // this point.
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLinalgGeneralizeNamedOpsPass());
  LinalgFoldUnitExtentDimsPassOptions options;
  options.useRankReducingSlices = true;
  nestedModulePM.addNestedPass<func::FuncOp>(
      mlir::createLinalgFoldUnitExtentDimsPass(options));
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());

  nestedModulePM.addNestedPass<func::FuncOp>(
      createOptimizeTensorInsertExtractSlicesPass());

  // Linalg -> Vector
  addGPUVectorizationPasses(nestedModulePM);

  // Tensor -> Memref
  addVectorBufferizePasses(nestedModulePM);
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createHoistStaticallyBoundAllocationsPass());

  // Vector SIMD -> Vector SIMT
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPUNormalizeContractMapsPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createLLVMGPUCastTypeToFitMMAPass());
  nestedModulePM.addNestedPass<func::FuncOp>(createLLVMGPUVectorDistribute());
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());

  // Lower remaining memref copies (shared-memory staging) through linalg and
  // distribute them across the workgroup.
  nestedModulePM.addNestedPass<func::FuncOp>(createMemrefCopyToLinalgPass());
  nestedModulePM.addNestedPass<func::FuncOp>(
      createGPUDistributeSharedMemoryCopy());

  nestedModulePM.addNestedPass<func::FuncOp>(
      createGPUReduceSharedMemoryBankConflicts());

  nestedModulePM.addNestedPass<func::FuncOp>(
      memref::createFoldMemRefAliasOpsPass());
  nestedModulePM.addPass(createCSEPass());
  nestedModulePM.addPass(createCanonicalizerPass());
  nestedModulePM.addPass(createCSEPass());
}

void addGPUWarpReductionPassPipeline(OpPassManager &pm) {
tileAndDistributeToWorkgroup(pm);
auto &nestedModulePM = pm.nest<ModuleOp>();
Expand Down
4 changes: 4 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ void addGPUVectorizationPassPipeline(OpPassManager &pm);

/// Lowering based on vector distribution patterns.
void addGPUVectorDistributePassPipeline(OpPassManager &pm);
void addGPUConvVectorDistributePassPipeline(OpPassManager &pm);

/// Lowering reductions to warp reductions.
void addGPUWarpReductionPassPipeline(OpPassManager &pm);
Expand Down Expand Up @@ -184,6 +185,9 @@ verifyGPUMatmulPipeline(Operation *op,
std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createAMDGPUPrepareForChainedMatmulPass();

std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
createLLVMGPUPromoteConvImgAndTileFilterPass();

//----------------------------------------------------------------------------//
// Register LLVMGPU Passes
//----------------------------------------------------------------------------//
Expand Down
6 changes: 6 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ def LLVMGPUPrefetchSharedMemory :
let constructor = "mlir::iree_compiler::createLLVMGPUPrefetchSharedMemoryPass()";
}

def LLVMGPUPromoteConvImgAndTileFilter :
    InterfacePass<"iree-llvmgpu-promote-conv-img-and-tile-filter", "mlir::FunctionOpInterface"> {
  // Grammar fix on the user-visible summary text only; pass id and
  // constructor are unchanged.
  let summary = "Pass to promote the convolution image operand and tile the filter with tilingLevel=1.";
  let constructor = "mlir::iree_compiler::createLLVMGPUPromoteConvImgAndTileFilterPass()";
}

def LLVMGPUSelectLoweringStrategy :
Pass<"iree-llvmgpu-select-lowering-strategy", "mlir::iree_compiler::IREE::HAL::ExecutableVariantOp"> {
let summary = "Select a IREE::HAL::DispatchLoweringPassPipeline for lowering the target variant";
Expand Down

0 comments on commit c5150c3

Please sign in to comment.