From 4395c118895887b39c0172604e1f691fbe185f0b Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 11 Sep 2024 16:26:48 -0400 Subject: [PATCH] GPU data tiling changes from `shared/gpu-data-tiling-materialize-encoding` (#18492) This PR is a squashed rebasing of https://github.com/iree-org/iree/tree/shared/gpu-data-tiling-materialize-encoding . This squashes together commits by @hanhanW , @lialan and myself. Here are all the commits: https://github.com/iree-org/iree/compare/40258db63fdfd2228adb2fd0baaea53b9e3c9d63...shared/gpu-data-tiling-materialize-encoding The intent is to carry on on `main` branch. The motivation is to pick up the recent TileAndFuse pipeline. --------- Signed-off-by: hanhanW Signed-off-by: Alan Li Signed-off-by: Benoit Jacob Co-authored-by: hanhanW Co-authored-by: Alan Li --- .../Common/CPU/CPUMaterializeEncodings.cpp | 8 +- .../compiler/Codegen/Common/EncodingUtils.cpp | 44 +- .../compiler/Codegen/Common/EncodingUtils.h | 67 +- .../compiler/Codegen/Common/GPU/BUILD.bazel | 2 + .../Codegen/Common/GPU/CMakeLists.txt | 2 + .../Common/GPU/GPUMaterializeEncoding.cpp | 645 ++++++++++++++++++ .../compiler/Codegen/Common/GPU/Passes.td | 5 + .../Codegen/Common/GPU/test/BUILD.bazel | 1 + .../Codegen/Common/GPU/test/CMakeLists.txt | 1 + .../GPU/test/gpu_materialize_encoding.mlir | 391 +++++++++++ .../Common/MaterializeEncodingIntoNop.cpp | 2 + .../MaterializeEncodingIntoPackUnPack.cpp | 73 +- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | 35 +- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.td | 47 ++ .../Codegen/Dialect/GPU/IR/IREEGPUOps.cpp | 9 +- .../Dialect/GPU/IR/test/iree_gpu_attrs.mlir | 27 + .../Dialect/GPU/IR/test/iree_gpu_ops.mlir | 54 ++ .../iree/compiler/Codegen/Utils/GPUUtils.cpp | 2 +- .../iree/compiler/Codegen/Utils/GPUUtils.h | 4 + 19 files changed, 1371 insertions(+), 48 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp index 894f03053da1..21f988fe359e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp @@ -464,9 +464,11 @@ materializeFuncOpEncodings(FunctionOpInterface funcOp, targetAttr); MaterializeEncodingConversionTarget target(*funcOp.getContext()); auto materializeEncodingValueFn = getMaterializeEncodingValueFn(targetAttr); - populateMaterializeEncodingIntoPackUnPackPatterns(materializeEncodingPattern, - target, typeConverter, - materializeEncodingValueFn); + populateMaterializeEncodingIntoPackUnPackPatterns( + materializeEncodingPattern, typeConverter, materializeEncodingValueFn); + populateIREEMaterializeEncodingIntoPackUnPackPatterns( + materializeEncodingPattern, target, typeConverter, + materializeEncodingValueFn); if (failed(applyPartialConversion(funcOp, target, std::move(materializeEncodingPattern)))) { diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp index 378383d9973a..15a99d868d04 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp @@ -117,9 +117,29 @@ MaterializeEncodingTypeConverter::MaterializeEncodingTypeConverter( if (failed(maybeEncodingInfo)) { return 
dropEncoding(type); } - return cast(tensor::PackOp::inferPackedType( + auto encodingInfo = *maybeEncodingInfo; + auto packedType = cast(tensor::PackOp::inferPackedType( tensorType, maybeEncodingInfo->innerTileSizes, maybeEncodingInfo->innerDimsPos, maybeEncodingInfo->outerDimsPerm)); + + // There is no swizzle, we are already done. Typically the case on CPU. + if (!encodingInfo.swizzle) { + return packedType; + } + + // There is a swizzle, we need to handle it. Typically the case on GPU. + auto swizzle = *encodingInfo.swizzle; + SmallVector newShape( + packedType.getShape().drop_back(encodingInfo.innerTileSizes.size())); + SmallVector swizzledTileShape; + for (auto expandedDimShape : swizzle.expandShape) { + for (int64_t d : expandedDimShape) { + swizzledTileShape.push_back(d); + } + } + applyPermutationToVector(swizzledTileShape, swizzle.permutation); + newShape.append(swizzledTileShape); + return RankedTensorType::get(newShape, packedType.getElementType()); }); } @@ -143,19 +163,6 @@ MaterializeEncodingConversionTarget::MaterializeEncodingConversionTarget( }); } -RankedTensorType getOriginalTypeWithEncoding(RankedTensorType type) { - auto encoding = getEncodingAttr(type); - if (!encoding) { - return type; - } - RankedTensorType originalType = type; - if (auto originalTypeAttr = encoding.getOriginalType()) { - originalType = cast(originalTypeAttr.getValue()); - } - return RankedTensorType::get(originalType.getShape(), - originalType.getElementType(), encoding); -} - RankedTensorType dropEncoding(RankedTensorType type) { return RankedTensorType::get(type.getShape(), type.getElementType()); } @@ -213,4 +220,13 @@ bool isNarrowNResult(EncodingAttr encoding) { return narrowN && (!narrowM || narrowM.getInt() > narrowN.getInt()); } +SmallVector +getExpandedTileShape(SmallVector> expandShape) { + SmallVector result; + for (auto expandShapeDim : expandShape) { + result.append(expandShapeDim); + } + return result; +} + } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h index 9a2aafa5fe0c..d12a44fe8626 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h @@ -9,16 +9,47 @@ #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir::iree_compiler { -/// Container of information needed to materialize the pack operation. +/// Container of information needed to materialize the layout transformations. +/// +/// On CPU, these layout transformations consist of a single `temsor.pack` +/// or `tensor.unpack` op, implementing a tiled layout where each tile is +/// row-major. +/// +/// On GPU, there is an additional `swizzle`, which changes the layout inside +/// of the tile. See the comment on the nested Swizzle struct. struct MaterializeEncodingInfo { + // Metadata for a swizzle, that is, an (expand_shape -> transposition) + // pair of ops performing a change of layout within the tiles. This is used + // on GPU, where the tiles themselves can have an arbitrary layout. + struct Swizzle { + // This vector-of-vectors contains all the information needed to generate + // a `tensor.expand_shape` creating additional internal dimensions into the + // tile. 
For example, expandShape = [[16], [4, 2]] means that the original + // tile shape [16, 8] gets expanded such that the first dimension 16 is left + // unchanged, and the second dimension 8 gets split into two internal dims + // of size 4 and 2. + SmallVector> expandShape; + // This permutation vector applies to the expanded dimensions and is used + // to generate a `linalg.transpose` changing the layout of the tile. For + // example, permutation[0] dictates which of the expanded dimensions becomes + // the leading dimension of the layout. + SmallVector permutation; + }; + + // The next 3 fields are used to create a `tensor.pack` or `tensor.unpack` op, + // changing the overall layout between row-major and tiled (where each tile is + // row-major). SmallVector innerDimsPos; SmallVector innerTileSizes; SmallVector outerDimsPerm; - unsigned srcRank = 0; + + // The optional swizzle, see the above comment on Swizzle. Only used on GPU. + std::optional swizzle; }; using MaterializeEncodingFn = std::function( @@ -83,9 +114,6 @@ class OpMaterializeEncodingPattern : public OpConversionPattern { // Utility methods about Encoding. //===---------------------------------------------------------------------===// -/// Returns the original type that carried by encoding. -RankedTensorType getOriginalTypeWithEncoding(RankedTensorType type); - /// Returns the RankedTensorType without encodings. RankedTensorType dropEncoding(RankedTensorType type); @@ -102,7 +130,32 @@ MaterializeEncodingInfo getEncodingInfoForMatmul(IREE::Encoding::EncodingAttr encoding, int64_t rank, TileMxNxK tileMxNxK); +/// Utility method to convert from `set_encoding` op to `pack` operation. +/// For now this takes a `paddingValue` as input. The source is also taken +/// as input so that these could be used with `OpConversionPatterns`. +FailureOr lowerSetEncodingOpToPackOp( + RewriterBase &rewriter, IREE::Encoding::SetEncodingOp encodingOp, + Value source, const MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn); + +/// Utility method to convert from `unset_encoding` op to `unpack` operation. +/// The source is taken as input so that these could be used with +/// `OpConversionPatterns`. +FailureOr lowerUnsetEncodingToUnpackOp( + RewriterBase &rewriter, IREE::Encoding::UnsetEncodingOp encodingOp, + Value packedValue, const MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn); + +/// Pouplates the set of patterns that lowers set_encoding, unset_encoding, and +/// upstream dialect ops with encoding types to pack/unpack ops. void populateMaterializeEncodingIntoPackUnPackPatterns( + RewritePatternSet &patterns, + MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn); + +/// Pouplates the set of patterns that lowers IREE dialect (e.g., Flow, Hal, +/// etc) ops with encoding types to pack/unpack ops. +void populateIREEMaterializeEncodingIntoPackUnPackPatterns( RewritePatternSet &patterns, MaterializeEncodingConversionTarget &target, MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn); @@ -111,6 +164,10 @@ void populateMaterializeEncodingIntoPackUnPackPatterns( // result of a matvec. bool isNarrowNResult(IREE::Encoding::EncodingAttr encoding); +// Concatenates the vectors. 
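+// For example, expandShape = [[16], [4, 2]] yields [16, 4, 2].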
+SmallVector +getExpandedTileShape(SmallVector> expandShape); + } // namespace mlir::iree_compiler #endif // IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_ diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel index 1799a0987b6b..39543637e60e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel @@ -61,6 +61,7 @@ iree_compiler_cc_library( "GPUGeneralizeNamedOps.cpp", "GPUInferMemorySpace.cpp", "GPULowerToUKernels.cpp", + "GPUMaterializeEncoding.cpp", "GPUMultiBuffering.cpp", "GPUNestedLayoutDistributionPatterns.cpp", "GPUPatterns.cpp", @@ -98,6 +99,7 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils", + "//compiler/src/iree/compiler/Dialect/Encoding/IR", "//compiler/src/iree/compiler/Dialect/HAL/IR", "@llvm-project//llvm:Support", "@llvm-project//mlir:AMDGPUDialect", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt index 4adc4cd165b8..b14f0d5fb633 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt @@ -59,6 +59,7 @@ iree_cc_library( "GPUGeneralizeNamedOps.cpp" "GPUInferMemorySpace.cpp" "GPULowerToUKernels.cpp" + "GPUMaterializeEncoding.cpp" "GPUMultiBuffering.cpp" "GPUNestedLayoutDistributionPatterns.cpp" "GPUPatterns.cpp" @@ -129,6 +130,7 @@ iree_cc_library( iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils iree::compiler::Codegen::Utils::VectorOpUtils + iree::compiler::Dialect::Encoding::IR iree::compiler::Dialect::HAL::IR PUBLIC ) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp new file mode 100644 index 000000000000..2d8aa4b67ca6 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp @@ -0,0 +1,645 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/compiler/Codegen/Common/EncodingUtils.h" +#include "iree/compiler/Codegen/Common/GPU/Passes.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h" +#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h" +#include "iree/compiler/Codegen/Utils/GPUUtils.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingDialect.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" +#include "iree/compiler/Dialect/HAL/IR/HALTypes.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/MemRef/Transforms/Transforms.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/Transforms/Transforms.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/ReshapeOpsUtils.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "iree-codegen-gpu-materialize-encoding" + +namespace mlir::iree_compiler { + +#define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS +#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" + +// Returns the swizzle for a given intrinsic and operand index. +// See the comment on MaterializeEncodingInfo::Swizzle for what that means. +// This function is concerned with a single intrinsic, not a whole kernel tile. +// TODO(bjacob): derive this automatically from the intrinsic layout getters. +static MaterializeEncodingInfo::Swizzle +getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic mma, int operandIdx) { + switch (mma) { + case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32: + if (operandIdx == 2) { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}}, + /*permutation=*/{0, 2, 1}}; + } else { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4}}, + /*permutation=*/{1, 0}}; + } + case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16: + if (operandIdx == 2) { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}}, + /*permutation=*/{0, 2, 1}}; + } else { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4, 4}}, + /*permutation=*/{1, 0, 2}}; + } + case IREE::GPU::MMAIntrinsic::MFMA_I32_16x16x32_I8: + if (operandIdx == 2) { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}}, + /*permutation=*/{0, 2, 1}}; + } else { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4, 8}}, + /*permutation=*/{1, 0, 2}}; + } + default: + assert(false && "should not get here."); + return {}; + } +} + +// Given an `expandShape` vector-of-vectors describing the mapping from source +// dimensions to expanded dimensions, returns the index of the first expanded +// dimension corresponding to the given source dimension index. +static int64_t +getExpandedDimFirstIdx(const SmallVector> &expandShape, + int64_t srcIndex) { + int dstIndexFirst = 0; + for (int i = 0; i < srcIndex; ++i) { + dstIndexFirst += expandShape[i].size(); + } + return dstIndexFirst; +} + +// Unroll the dimension given by `srcIndex` by the given `unrollFactor`. 
+// This is not interleaving layouts. The layout will consist of multiple copies +// of the input tile, side by side. +// +// Example: +// Input swizzle = { expandShape = [[16], [4]], permutation = [1, 0] } +// Input srcIndex = 1 +// Input unrollFactor = 4 +// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] } +// +static void unroll(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex, + int unrollFactor) { + assert(unrollFactor > 1); + int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex); + + // The new unrolling dimension is inserted at the start of the expandShape + // dimensions group corresponding to srcIndex. + swizzle.expandShape[srcIndex].insert(swizzle.expandShape[srcIndex].begin(), + unrollFactor); + // Since we are not interleaving here, generating side-by-side copies of the + // original layout, the new unrolling dimension is the new outermost + // dimension. Existing entries get shifted to make room for it. + for (auto &p : swizzle.permutation) { + p += (p >= dstIndexFirst); + } + swizzle.permutation.insert(swizzle.permutation.begin(), dstIndexFirst); +} + +// Interleave the layout in `swizzle` by mutating `swizzle.permutation` to +// move permutation[0], the outer-most dimension (which the unroll() function +// created to be the unrolling dimension), to the inner dimension given by +// `expandedDimIndexToInterleaveAt`. +// +// Example: +// Input swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] } +// Input srcIndex = 1 +// Input expandedDimIndexToInterleaveAt = 1 +// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [2, 0, 1] } +// +static void interleave(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex, + int expandedDimIndexToInterleaveAt) { + // Compute which inner dimension to permute the current outer dimension into. + int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex); + int dstIndexToInterleaveAt = dstIndexFirst + expandedDimIndexToInterleaveAt; + + SmallVector outPermutation(swizzle.permutation.size()); + // The leading dimension, permutation[0], gets moved inwards to the + // position that we just computed, dstIndexToInterleaveAt. + outPermutation[dstIndexToInterleaveAt] = swizzle.permutation[0]; + // Outer dimensions get shifted outwards to fill the gap. + for (int i = 0; i < dstIndexToInterleaveAt; ++i) { + outPermutation[i] = swizzle.permutation[i + 1]; + } + // Inner dimensions don't change. That is to say that we only interleave + // at `targetInterleavedElements` granularity, we don't swizzle further + // internally to that. + for (int i = dstIndexToInterleaveAt + 1; i < outPermutation.size(); ++i) { + outPermutation[i] = swizzle.permutation[i]; + } + swizzle.permutation = outPermutation; +} + +// Returns the index of the dimension whose flattened size (flattening inner +// dimensions into it) matches the given `targetSize`. This is used to compute +// interleaving indices. +// +// Example: +// Input shape = [16, 8, 4, 4] +// Input targetSize = 16 +// -> Return 2, because the tail of the shape starting at index 2 is [4, 4], +// whose product equals targetSize. 
+static int64_t getDimIdxForTargetSize(const SmallVector &shape, + int64_t targetSize) { + int interleaveAt = 0; + int size = 1; + for (interleaveAt = shape.size() - 1; interleaveAt >= 0; --interleaveAt) { + assert(size <= targetSize); + assert((targetSize % size) == 0); + if (size == targetSize) { + break; + } + size *= shape[interleaveAt]; + } + return interleaveAt; +} + +// Generates the swizzle for the full data-tiled-mma tile, including all the +// relevant unrolling factors. +static MaterializeEncodingInfo::Swizzle +getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) { + auto [AType, BType, CType] = mma.getABCElementTypes(); + int ABits = AType.getIntOrFloatBitWidth(); + int BBits = BType.getIntOrFloatBitWidth(); + // TODO(bjacob): Should be looked up from GPU target, instead of hard-coded. + const int targetPreferredLoadBitWidth = 128; + auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), operandIdx); + switch (operandIdx) { + case 0: + // A-matrix (LHS). Source dimensions are M (index 0) and K (index 1). + // Unroll on K with interleaving, then on M. + if (mma.getUnrollK() > 1) { + unroll(swizzle, 1, mma.getUnrollK()); + int interleavingIdx = getDimIdxForTargetSize( + swizzle.expandShape[1], + targetPreferredLoadBitWidth / (mma.getUnrollK() * ABits)); + interleave(swizzle, 1, interleavingIdx); + } + if (mma.getUnrollM() > 1) { + unroll(swizzle, 0, mma.getUnrollM()); + } + break; + case 1: + // B-matrix (RHS). Since the pack ops already took care of transposing B, + // source dimensions are N (index 0) and K (index 1). + // Unroll on K with interleaving, then on N. + if (mma.getUnrollK() > 1) { + unroll(swizzle, 1, mma.getUnrollK()); + int interleavingIdx = getDimIdxForTargetSize( + swizzle.expandShape[1], + targetPreferredLoadBitWidth / (mma.getUnrollK() * BBits)); + interleave(swizzle, 1, interleavingIdx); + } + if (mma.getUnrollN() > 1) { + unroll(swizzle, 0, mma.getUnrollN()); + } + break; + case 2: + // C-matrix (accumulator). Source dimensions are M (index 0) and N (index + // 1). Unroll on N, then on M. 
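+    // For example, MFMA_F32_16x16x4_F32 with unroll M=8 and N=8 ends up with
+    // expandShape = [[8, 4, 4], [8, 16]] and permutation = [0, 3, 1, 4, 2],
+    // i.e. a 128x128 accumulator tile, as exercised by the ACC cases in
+    // gpu_materialize_encoding.mlir.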
+ if (mma.getUnrollN() > 1) { + unroll(swizzle, 1, mma.getUnrollN()); + } + if (mma.getUnrollM() > 1) { + unroll(swizzle, 0, mma.getUnrollM()); + } + break; + } + return swizzle; +} + +static bool hasIntrinsic(IREE::GPU::TargetAttr target, + IREE::GPU::MMAIntrinsic intrinsic) { + for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { + if (mma.getIntrinsic().getValue() == intrinsic) { + return true; + } + } + return false; +} + +static std::optional +chooseDataTiledMMAAttr(TypeRange elementTypes, IREE::GPU::TargetAttr target) { + assert(elementTypes.size() == 3); + using namespace IREE::GPU; + MLIRContext *ctx = target.getContext(); + Type lhs = elementTypes[0]; + Type rhs = elementTypes[1]; + Type out = elementTypes[2]; + auto match = [=](MMAIntrinsic intrinsic, int unrollM, int unrollN, + int unrollK) -> std::optional { + if (!hasIntrinsic(target, intrinsic)) { + return std::nullopt; + } + auto candidate = DataTiledMMAAttr::get( + ctx, MMAIntrinsicAttr::get(ctx, intrinsic), unrollM, unrollN, unrollK); + auto [candidateLhs, candidateRhs, candidateOut] = + candidate.getABCElementTypes(); + if (candidateLhs != lhs || candidateRhs != rhs || candidateOut != out) { + return std::nullopt; + } + return candidate; + }; + if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x4_F32, 8, 8, 4)) { + return m; + } + if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x16_F16, 8, 8, 2)) { + return m; + } + if (auto m = match(MMAIntrinsic::MFMA_I32_16x16x32_I8, 8, 8, 2)) { + return m; + } + // Fallback - no architecture-optimized tile size for this case. + return std::nullopt; +} + +static FailureOr +materializeEncodingForTarget(RankedTensorType tensorType, + IREE::HAL::ExecutableTargetAttr targetAttr) { + auto encoding = + dyn_cast_or_null(tensorType.getEncoding()); + if (!encoding) { + return failure(); + } + // We only know about contractions with {Batch, M, N, K} <= 1 at the moment. + auto cDims = getEncodingContractionDims(encoding); + if (failed(cDims) || cDims->batch.size() > 1 || cDims->m.size() > 1 || + cDims->n.size() > 1 || cDims->k.size() > 1) { + return failure(); + } + + // Enumerate available tile shapes for the given encoding and target. + IREE::GPU::TargetAttr gpuTargetAttr; + if (targetAttr) { + gpuTargetAttr = getGPUTargetAttr(targetAttr); + } else { + gpuTargetAttr = getCLGPUTarget(tensorType.getContext()); + } + auto elementTypes = llvm::to_vector( + llvm::map_range(encoding.getElementTypes().getValue(), [](Attribute a) { + return cast(a).getValue(); + })); + std::optional mma = + chooseDataTiledMMAAttr(elementTypes, gpuTargetAttr); + if (!mma) { + return failure(); + } + + // Map the matmul TileMxNxK to an actual tile shape for the tensor at hand, + // based on its operand index in the matmul. + // TODO: Support unrolling. 
+ auto rank = tensorType.getRank(); + TileMxNxK innerTile; + std::tie(innerTile.M, innerTile.N, innerTile.K) = mma->getMNKShape(); + auto encodingInfo = getEncodingInfoForMatmul(encoding, rank, innerTile); + auto operandIdx = encoding.getOperandIndex().getInt(); + encodingInfo.swizzle = getSwizzle(*mma, operandIdx); + return encodingInfo; +} + +namespace { +struct GPUMaterializeDeviceEncodingPass final + : impl::GPUMaterializeDeviceEncodingPassBase< + GPUMaterializeDeviceEncodingPass> { + using GPUMaterializeDeviceEncodingPassBase:: + GPUMaterializeDeviceEncodingPassBase; + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +SmallVector +getReassociationIndices(int outerDims, + SmallVector> expandShape) { + SmallVector result; + int expandedIdx = 0; + for (int i = 0; i < outerDims; ++i) { + result.push_back({expandedIdx++}); + } + for (auto expandShapeDim : expandShape) { + result.push_back({}); + for (int64_t d : expandShapeDim) { + (void)d; + result.back().push_back(expandedIdx++); + } + } + return result; +} + +/// Convert iree_linalg_ext.set_encoding op to pack + tile swizzling ops. We use +/// expand_shape + linalg.transpose to represent a tile swizzling op. +struct GPUSetEncodingOpLoweringConversion + : public OpMaterializeEncodingPattern { + using OpMaterializeEncodingPattern< + IREE::Encoding::SetEncodingOp>::OpMaterializeEncodingPattern; + + LogicalResult + matchAndRewrite(IREE::Encoding::SetEncodingOp encodingOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto converter = static_cast( + getTypeConverter()); + auto packOp = lowerSetEncodingOpToPackOp(rewriter, encodingOp, + adaptor.getSource(), *converter, + this->materializeEncodingValueFn); + if (failed(packOp)) { + Value result = adaptor.getSource(); + Type targetType = + getTypeConverter()->convertType(encodingOp.getResultType()); + if (targetType != result.getType()) { + result = rewriter.create(encodingOp.getLoc(), + targetType, result); + } + rewriter.replaceOp(encodingOp, result); + return success(); + } + + FailureOr maybeEncodingInfo = + converter->getEncodingInfo(encodingOp.getResultType()); + if (failed(maybeEncodingInfo)) { + return rewriter.notifyMatchFailure(encodingOp, + "unhandled result encoding"); + } + if (!maybeEncodingInfo->swizzle) { + rewriter.replaceOp(encodingOp, packOp->getResult()); + return success(); + } + SmallVector innerTiles = maybeEncodingInfo->innerTileSizes; + + // TODO(hanchung): Add a util to the encoding attribute, so we don't need + // the map_to_vector method here. + auto loc = encodingOp.getLoc(); + + // Create expand_shape op to tile the innermost two dimensions. 
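+    // For example, for the LHS of MFMA_F32_16x16x4_F32 with unroll M=8, K=4
+    // (see gpu_materialize_encoding.mlir), a tensor<255x513xf32> packs to
+    // tensor<2x33x128x16xf32>, expands to tensor<2x33x8x16x4x4xf32>, and is
+    // then transposed with permutation [0, 1, 2, 5, 3, 4] (outer dims kept,
+    // swizzle permutation shifted by the source rank) into
+    // tensor<2x33x8x4x16x4xf32>.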
+ int origRank = encodingOp.getSourceType().getRank(); + SmallVector expandShapeShape(packOp->getDestType().getShape()); + expandShapeShape.truncate(origRank); + expandShapeShape.append( + getExpandedTileShape(maybeEncodingInfo->swizzle->expandShape)); + + auto expandShapeType = RankedTensorType::get( + expandShapeShape, encodingOp.getSourceType().getElementType()); + + SmallVector reassociation = getReassociationIndices( + origRank, maybeEncodingInfo->swizzle->expandShape); + auto expandShapeOp = rewriter.create( + loc, expandShapeType, packOp->getResult(), reassociation); + + // create linalg.transpose on expandShapeShape + + SmallVector transposePerm; + for (int i = 0; i < origRank; ++i) { + transposePerm.push_back(i); + } + for (auto perm : maybeEncodingInfo->swizzle->permutation) { + transposePerm.push_back(origRank + perm); + } + SmallVector transposeResultDims = expandShapeShape; + applyPermutationToVector(transposeResultDims, transposePerm); + + auto emptyTensor = rewriter.create( + loc, transposeResultDims, encodingOp.getSourceType().getElementType()); + auto transposeOp = rewriter.create( + loc, expandShapeOp, emptyTensor, transposePerm); + rewriter.replaceOp(encodingOp, transposeOp->getResult(0)); + + return success(); + } +}; + +struct GPUUnsetEncodingOpLoweringConversion + : public OpMaterializeEncodingPattern { + using OpMaterializeEncodingPattern< + IREE::Encoding::UnsetEncodingOp>::OpMaterializeEncodingPattern; + + LogicalResult + matchAndRewrite(IREE::Encoding::UnsetEncodingOp unsetEncodingOp, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto converter = static_cast( + getTypeConverter()); + + Location loc = unsetEncodingOp.getLoc(); + + FailureOr maybeEncodingInfo = + converter->getEncodingInfo(unsetEncodingOp.getSource().getType()); + if (failed(maybeEncodingInfo)) { + return rewriter.notifyMatchFailure(unsetEncodingOp, + "unhandled result encoding"); + } + Value unpackSrc = adaptor.getSource(); + if (maybeEncodingInfo->swizzle) { + SmallVector innerTiles = maybeEncodingInfo->innerTileSizes; + + int targetRank = unsetEncodingOp.getResultType().getRank(); + auto srcConvertedType = + cast(adaptor.getSource().getType()); + SmallVector expandShapeShape(srcConvertedType.getShape()); + expandShapeShape.truncate(targetRank); + expandShapeShape.append( + getExpandedTileShape(maybeEncodingInfo->swizzle->expandShape)); + + SmallVector transposePerm; + for (int i = 0; i < targetRank; ++i) { + transposePerm.push_back(i); + } + for (auto perm : maybeEncodingInfo->swizzle->permutation) { + transposePerm.push_back(targetRank + perm); + } + SmallVector expandShapeResultDims = expandShapeShape; + applyPermutationToVector(expandShapeResultDims, transposePerm); + auto invertedTransposePerm = invertPermutationVector(transposePerm); + + auto emptyTensor = rewriter.create( + loc, expandShapeShape, + unsetEncodingOp.getSourceType().getElementType()); + auto transposeOp = rewriter.create( + loc, adaptor.getSource(), emptyTensor, invertedTransposePerm); + + SmallVector reassociation = getReassociationIndices( + targetRank, maybeEncodingInfo->swizzle->expandShape); + SmallVector unpackSrcShape( + srcConvertedType.getShape().take_front(targetRank)); + unpackSrcShape.append(maybeEncodingInfo->innerTileSizes.begin(), + maybeEncodingInfo->innerTileSizes.end()); + auto unpackSrcType = RankedTensorType::get( + unpackSrcShape, unsetEncodingOp.getSourceType().getElementType()); + unpackSrc = rewriter.create( + loc, unpackSrcType, transposeOp->getResult(0), 
reassociation); + } + + auto unPackOp = lowerUnsetEncodingToUnpackOp( + rewriter, unsetEncodingOp, unpackSrc, *converter, + this->materializeEncodingValueFn); + if (failed(unPackOp)) { + Value result = adaptor.getSource(); + Type targetType = + getTypeConverter()->convertType(unsetEncodingOp.getResultType()); + if (targetType != result.getType()) { + result = rewriter.create(unsetEncodingOp.getLoc(), + targetType, result); + } + rewriter.replaceOp(unsetEncodingOp, result); + return success(); + } + rewriter.replaceOp(unsetEncodingOp, unPackOp->getResult()); + return success(); + } +}; + +class GPUConvertToMultiMma final + : public OpInterfaceConversionPattern { +public: + using OpInterfaceConversionPattern< + linalg::ContractionOpInterface>::OpInterfaceConversionPattern; + + GPUConvertToMultiMma( + MLIRContext *context, + const MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn = {}, + PatternBenefit benefit = 1) + : OpInterfaceConversionPattern( + typeConverter, context, benefit), + materializeEncodingValueFn(materializeEncodingValueFn) {} + + LogicalResult + matchAndRewrite(linalg::ContractionOpInterface op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto linalgOp = cast(op.getOperation()); + auto inputs = linalgOp.getDpsInputOperands(); + auto outputs = linalgOp.getDpsInits(); + auto lhsType = cast(inputs[0]->get().getType()); + auto rhsType = cast(inputs[1]->get().getType()); + auto resultType = cast(outputs[0].getType()); + auto lhsEncoding = IREE::Encoding::getEncodingAttr(lhsType); + auto rhsEncoding = IREE::Encoding::getEncodingAttr(rhsType); + auto resultEncoding = IREE::Encoding::getEncodingAttr(resultType); + if (!lhsEncoding || !rhsEncoding || !resultEncoding) { + LLVM_DEBUG(llvm::dbgs() << "expect encodings on operand types\n"); + return failure(); + } + + auto converter = static_cast( + getTypeConverter()); + + // TODO(hanchung): Perhaps the MaterializedEncodingInfo should carry the + // target intrinsic attribute, so we don't need to query it again. + IREE::HAL::ExecutableTargetAttr targetAttr = converter->getTargetAttr(); + IREE::GPU::TargetAttr gpuTargetAttr; + if (targetAttr) { + gpuTargetAttr = getGPUTargetAttr(targetAttr); + } else { + gpuTargetAttr = getCLGPUTarget(op.getContext()); + } + auto elementTypes = llvm::to_vector(llvm::map_range( + resultEncoding.getElementTypes().getValue(), + [](Attribute a) { return cast(a).getValue(); })); + std::optional mma = + chooseDataTiledMMAAttr(elementTypes, gpuTargetAttr); + if (!mma) { + LLVM_DEBUG(llvm::dbgs() << "can't find supported Mma intrinsic\n"); + return failure(); + } + LLVM_DEBUG(llvm::dbgs() << "Target MMA: " << mma.value() << "\n"); + + FailureOr contractionDims = + linalg::inferContractionDims(linalgOp); + assert( + succeeded(contractionDims) && + "should always be able to infer contraction dims for contraction ops"); + // TODO(hanchung): Support batch gemms. + if (!contractionDims->batch.empty()) { + LLVM_DEBUG(llvm::dbgs() << "batch gemm is not yet implemented\n"); + return failure(); + } + + // TODO(hanchung): Support unrolling cases. We likely need to teach + // multi_mma op about interleaving K dimension. + MLIRContext *ctx = rewriter.getContext(); + AffineExpr mExpr = rewriter.getAffineDimExpr(0); + AffineExpr nExpr = rewriter.getAffineDimExpr(1); + AffineExpr kExpr = rewriter.getAffineDimExpr(2); + + // The outer dims are all in row-major fasion after relayout. 
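+    // Note that the RHS map below is (n, k) rather than (k, n): the relayout
+    // has already transposed B (its pack uses outer_dims_perm = [1, 0] and
+    // inner_dims_pos = [1, 0]), which is why the matmul_lowering tests expect
+    // affine_map<(d0, d1, d2) -> (d1, d2)> for the RHS operand.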
+ auto lhsMap = AffineMap::get(3, 0, {mExpr, kExpr}, ctx); + auto rhsMap = AffineMap::get(3, 0, {nExpr, kExpr}, ctx); + auto accMap = AffineMap::get(3, 0, {mExpr, nExpr}, ctx); + + SmallVector iteratorTypes = + linalgOp.getIteratorTypesArray(); + + // TODO(hanchung): Support batch gemms. + Location loc = op.getLoc(); + auto mmaOp = rewriter.create( + loc, operands[0], operands[1], operands[2], + ArrayRef{lhsMap, rhsMap, accMap}, iteratorTypes, + mma.value()); + rewriter.replaceOp(op, mmaOp); + return success(); + } + +protected: + const MaterializeEncodingValueFn materializeEncodingValueFn; +}; + +} // namespace + +void GPUMaterializeDeviceEncodingPass::runOnOperation() { + MLIRContext *ctx = &getContext(); + FunctionOpInterface funcOp = getOperation(); + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp); + { + RewritePatternSet patterns(ctx); + MaterializeEncodingTypeConverter typeConverter(materializeEncodingForTarget, + targetAttr); + MaterializeEncodingConversionTarget target(*funcOp.getContext()); + MaterializeEncodingValueFn materializeEncodingValueFn = + [](RankedTensorType, OpBuilder, + Location) -> FailureOr { return {}; }; + populateIREEMaterializeEncodingIntoPackUnPackPatterns( + patterns, target, typeConverter, materializeEncodingValueFn); + + patterns.insert( + ctx, typeConverter, materializeEncodingValueFn); + + if (failed(applyPartialConversion(funcOp, target, std::move(patterns)))) { + funcOp.emitOpError("materialization failed"); + return signalPassFailure(); + } + } + + // Add patterns to fold pack/unpack ops with pad/extract_slice ops and + // resolve dims ops. + { + RewritePatternSet patterns(ctx); + tensor::populateFoldIntoPackAndUnpackPatterns(patterns); + memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns); + if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) { + funcOp.emitOpError("folding patterns failed"); + return signalPassFailure(); + } + } +} + +} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index a5c5b090a28a..570d3f90104b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -186,6 +186,11 @@ def GPUApplyTilingLevelPass : ]; } +def GPUMaterializeDeviceEncodingPass : + InterfacePass<"iree-codegen-gpu-materialize-device-encoding", "mlir::FunctionOpInterface"> { + let summary = "Materialize the encoding for tensor as specified by the backend."; +} + def GPUTensorTileToSerialLoopsPass : InterfacePass<"iree-codegen-gpu-tensor-tile-to-serial-loops", "mlir::FunctionOpInterface"> { let summary = "Pass to tile reduction dimensions for certain GPU ops"; diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel index ab48f26ec813..bb1e43081185 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel @@ -28,6 +28,7 @@ iree_lit_test_suite( "gpu_infer_memory_space.mlir", "gpu_lower_to_ukernels.mlir", "gpu_combine_value_barriers.mlir", + "gpu_materialize_encoding.mlir", "gpu_nested_layout_contract_amdgpu.mlir", "gpu_nested_layout_vector_distribution.mlir", "gpu_pipeline.mlir", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt index ea90e3bb433d..94953712e849 100644 
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt @@ -24,6 +24,7 @@ iree_lit_test_suite( "gpu_generalize_named_ops.mlir" "gpu_infer_memory_space.mlir" "gpu_lower_to_ukernels.mlir" + "gpu_materialize_encoding.mlir" "gpu_nested_layout_contract_amdgpu.mlir" "gpu_nested_layout_vector_distribution.mlir" "gpu_pipeline.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir new file mode 100644 index 000000000000..209b29c4d32d --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir @@ -0,0 +1,391 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-materialize-device-encoding))" \ +// RUN: --iree-gpu-test-target=gfx942 \ +// RUN: --split-input-file %s | FileCheck %s + +//----------------------------------------------------------------------------- +// 1. MFMA_F32_16x16x4_F32 +//----------------------------------------------------------------------------- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_LHS_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xf32> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xf32> -> tensor<255x513xf32, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_LHS_unroll8x8x4_MFMA_F32_16x16x4_F32 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : f32) +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 16] +// CHECK-SAME: : tensor<255x513xf32> -> tensor<2x33x128x16xf32> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<2x33x128x16xf32> into tensor<2x33x8x16x4x4xf32> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<2x33x8x16x4x4xf32>) +// CHECK-SAME: outs({{.*}} : tensor<2x33x8x4x16x4xf32>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 4] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_RHS_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xf32> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xf32> -> tensor<255x513xf32, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_RHS_unroll8x8x4_MFMA_F32_16x16x4_F32 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : f32) +// CHECK-SAME: outer_dims_perm = [1, 0] +// CHECK-SAME: inner_dims_pos = [1, 0] +// CHECK-SAME: inner_tiles = [128, 16] +// CHECK-SAME: : tensor<255x513xf32> -> tensor<5x16x128x16xf32> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<5x16x128x16xf32> into tensor<5x16x8x16x4x4xf32> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<5x16x8x16x4x4xf32>) +// CHECK-SAME: outs({{.*}} : tensor<5x16x8x4x16x4xf32>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 4] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_ACC_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xf32> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xf32> -> tensor<255x513xf32, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_ACC_unroll8x8x4_MFMA_F32_16x16x4_F32 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : f32) +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 128] +// CHECK-SAME: : tensor<255x513xf32> -> tensor<2x5x128x128xf32> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<2x5x128x128xf32> into tensor<2x5x8x4x4x8x16xf32> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<2x5x8x4x4x8x16xf32>) +// CHECK-SAME: outs({{.*}} : tensor<2x5x8x8x4x16x4xf32>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 6, 4] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @unset_encoding_ACC_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) 
alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xf32, #encoding> + %3 = iree_encoding.unset_encoding %2 : tensor<255x513xf32, #encoding> -> tensor<255x513xf32> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @unset_encoding_ACC_unroll8x8x4_MFMA_F32_16x16x4_F32() { +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%{{.+}} : tensor<2x5x8x8x4x16x4xf32>) +// CHECK-SAME: outs({{.*}} : tensor<2x5x8x4x4x8x16xf32>) +// CHECK-SAME: permutation = [0, 1, 2, 4, 6, 3, 5] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[TRANSPOSE]] +// CHECK-SAME: : tensor<2x5x8x4x4x8x16xf32> into tensor<2x5x128x128xf32> +// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[COLLAPSE]] +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 128] +// CHECK-SAME: : tensor<2x5x128x128xf32> -> tensor<255x513xf32> +// CHECK: flow.dispatch.tensor.store %[[UNPACK]] + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> +#pipeline_layout_3 = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +func.func @matmul_lowering_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %K} + %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%K, %N} + %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %N} + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %K} + -> tensor + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%K, %N} + -> tensor + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %N} + -> tensor + %6 = linalg.matmul + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor + flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : tensor + -> !flow.dispatch.tensor>{%M, %N} + return +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK: func.func @matmul_lowering_unroll8x8x4_MFMA_F32_16x16x4_F32 +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(0) +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1) +// CHECK-DAG: %[[ACC_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(2) +// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load 
%[[LHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[ACC:.+]] = flow.dispatch.tensor.load %[[ACC_BINDING]]{{.+}} -> tensor +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], +// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]] + + +//----------------------------------------------------------------------------- +// 2. MFMA_I32_16x16x32_I8 +//----------------------------------------------------------------------------- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_LHS_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xi8> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xi8> -> tensor<255x513xi8, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xi8, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_LHS_unroll8x8x2_MFMA_I32_16x16x32_I8 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : i8) +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 64] +// CHECK-SAME: : tensor<255x513xi8> -> tensor<2x9x128x64xi8> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<2x9x128x64xi8> into tensor<2x9x8x16x2x4x8xi8> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<2x9x8x16x2x4x8xi8>) +// CHECK-SAME: outs({{.*}} : tensor<2x9x8x4x16x2x8xi8>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 4, 6] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_RHS_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xi8> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xi8> -> tensor<255x513xi8, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xi8, #encoding> -> 
!flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_RHS_unroll8x8x2_MFMA_I32_16x16x32_I8 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : i8) +// CHECK-SAME: outer_dims_perm = [1, 0] +// CHECK-SAME: inner_dims_pos = [1, 0] +// CHECK-SAME: inner_tiles = [128, 64] +// CHECK-SAME: : tensor<255x513xi8> -> tensor<5x4x128x64xi8> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<5x4x128x64xi8> into tensor<5x4x8x16x2x4x8xi8> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<5x4x8x16x2x4x8xi8>) +// CHECK-SAME: outs({{.*}} : tensor<5x4x8x4x16x2x8xi8>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 4, 6] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_ACC_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xi32> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xi32> -> tensor<255x513xi32, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xi32, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_ACC_unroll8x8x2_MFMA_I32_16x16x32_I8 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : i32) +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 128] +// CHECK-SAME: : tensor<255x513xi32> -> tensor<2x5x128x128xi32> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<2x5x128x128xi32> into tensor<2x5x8x4x4x8x16xi32> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<2x5x8x4x4x8x16xi32>) +// CHECK-SAME: outs({{.*}} : tensor<2x5x8x8x4x16x4xi32>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 6, 4] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @unset_encoding_ACC_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xi32, #encoding> + %3 = iree_encoding.unset_encoding %2 : tensor<255x513xi32, #encoding> -> tensor<255x513xi32> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : 
tensor<255x513xi32> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @unset_encoding_ACC_unroll8x8x2_MFMA_I32_16x16x32_I8() { +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%{{.+}} : tensor<2x5x8x8x4x16x4xi32>) +// CHECK-SAME: outs({{.*}} : tensor<2x5x8x4x4x8x16xi32>) +// CHECK-SAME: permutation = [0, 1, 2, 4, 6, 3, 5] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[TRANSPOSE]] +// CHECK-SAME: : tensor<2x5x8x4x4x8x16xi32> into tensor<2x5x128x128xi32> +// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[COLLAPSE]] +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 128] +// CHECK-SAME: : tensor<2x5x128x128xi32> -> tensor<255x513xi32> +// CHECK: flow.dispatch.tensor.store %[[UNPACK]] + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> +#pipeline_layout_3 = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> + +func.func @matmul_lowering_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %K} + %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%K, %N} + %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %N} + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %K} + -> tensor + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%K, %N} + -> tensor + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %N} + -> tensor + %6 = linalg.matmul + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor + flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : tensor + -> !flow.dispatch.tensor>{%M, %N} + return +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK: func.func @matmul_lowering_unroll8x8x2_MFMA_I32_16x16x32_I8 +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(0) +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1) +// CHECK-DAG: %[[ACC_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(2) +// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[ACC:.+]] = flow.dispatch.tensor.load %[[ACC_BINDING]]{{.+}} -> tensor +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], +// CHECK-SAME: iterator_types = 
[#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]] diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp index 2802736fe232..8024a7a71edf 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp @@ -48,6 +48,8 @@ struct MaterializeEncodingIntoNopPass final materializeEncodingFn, IREE::HAL::ExecutableTargetAttr()); MaterializeEncodingConversionTarget target(*context); populateMaterializeEncodingIntoPackUnPackPatterns( + materializeEncodingPattern, typeConverter, materializeEncodingValueFn); + populateIREEMaterializeEncodingIntoPackUnPackPatterns( materializeEncodingPattern, target, typeConverter, materializeEncodingValueFn); diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp index 2de3dc30029a..852183289304 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp @@ -187,10 +187,12 @@ static void transposeInPlace(MaterializeEncodingInfo &info) { // to `pack` and `unpack` operations respectively. //===---------------------------------------------------------------------===// -/// Utility method to convert from `set_encoding` op to `pack` operation with -/// zero padding values. The source is also taken as input so that these could -/// be used with `OpConversionPatterns`. -static FailureOr lowerSetEncodingOpToPackOp( +/// TODO(hanchung): Move the implementation to EncodingUtils.cpp. It is not +/// moved because it needs some cleanup for this file. E.g., `getPaddingValue` +/// is no longer needed. Ideally we should move CPU specific patterns (e.g., +/// lowerContractionOpWithEncoding, etc) to the CPUMaterializeEncoding file; +/// move general patterns to EncodingUtils, and retire this file. +FailureOr lowerSetEncodingOpToPackOp( RewriterBase &rewriter, IREE::Encoding::SetEncodingOp encodingOp, Value source, const MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn) { @@ -231,10 +233,9 @@ static FailureOr lowerSetEncodingOpToPackOp( paddingValue, encodingInfo->outerDimsPerm); } -/// Utility method to convert from `set_encoding` op to `pack` operation. -/// The source is taken as input so that these could be used with -/// `OpConversionPatterns`. -static FailureOr lowerUnsetEncodingToUnpackOp( +/// TODO(hanchung): Move the implementation to EncodingUtils.cpp. See the reason +/// in the implementation comment of lowerSetEncodingToPackOp method. +FailureOr lowerUnsetEncodingToUnpackOp( RewriterBase &rewriter, IREE::Encoding::UnsetEncodingOp encodingOp, Value packedValue, const MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn) { @@ -506,6 +507,33 @@ lowerOpWithEncoding(RewriterBase &rewriter, linalg::LinalgOp linalgOp, .Default([](Operation *op) { return failure(); }); } +// Utility to apply a tile-swizzling to a packed shape. 
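+// Illustrative worked example (added for clarity, derived from the RHS
+// data-tiling test above, not new functionality): with
+//   packedShape = [5, 4, 128, 64]  (innerTileSizes = [128, 64], so srcRank = 2),
+//   a swizzle whose expandShape splits the [128, 64] tile into [8, 16, 2, 4, 8],
+//   and swizzle.permutation = [0, 3, 1, 2, 4],
+// the permutation applied to the expanded shape [5, 4, 8, 16, 2, 4, 8] is
+// [0, 1, 2, 5, 3, 4, 6], so the returned swizzled shape is [5, 4, 8, 4, 16, 2, 8].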
+static SmallVector +getSwizzledShape(ArrayRef packedShape, + MaterializeEncodingInfo encodingInfo) { + if (packedShape.empty() || !encodingInfo.swizzle) { + return SmallVector(packedShape); + } + + int64_t srcRank = packedShape.size() - encodingInfo.innerTileSizes.size(); + SmallVector perm = llvm::to_vector(llvm::seq(0, srcRank)); + for (auto i : encodingInfo.swizzle->permutation) { + perm.push_back(i + srcRank); + } + + SmallVector newShape(packedShape.take_front(srcRank)); + SmallVector expandedTileShape = + getExpandedTileShape(encodingInfo.swizzle->expandShape); + MLIRContext *ctx = packedShape[0].getContext(); + Builder b(ctx); + for (int64_t d : expandedTileShape) { + newShape.push_back(b.getIndexAttr(d)); + } + applyPermutationToVector(newShape, perm); + + return newShape; +} + /// For `dispatchTensorType` that bind a `RankedTensorType` with encoding, /// returns the materialized shape of the `dispatchTensorType`. The /// dynamic dimensions of the `dispatchTensorType` are provided in @@ -541,7 +569,7 @@ static FailureOr> getPackedDimsForDispatchTensor( tensor::PackOp::getResultShape(builder, loc, targetShape, *innerTileSizes, encodingInfo->innerDimsPos, encodingInfo->outerDimsPerm); - return convertedTargetShape; + return getSwizzledShape(convertedTargetShape, *encodingInfo); } /// For `dispatchTensorType` that bind a `RankedTensorType` with encoding, @@ -882,11 +910,24 @@ class MaterializeContractionOp : public OpInterfaceConversionPattern< } // namespace void populateMaterializeEncodingIntoPackUnPackPatterns( - RewritePatternSet &patterns, MaterializeEncodingConversionTarget &target, + RewritePatternSet &patterns, MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn) { MLIRContext *context = patterns.getContext(); + patterns.insert, + MaterializeDPSOperation, + MaterializeOperation, + MaterializeContractionOp, SetEncodingOpToPackOpConversion, + UnsetEncodingOpToUnPackOpConversion>( + context, typeConverter, materializeEncodingValueFn); + memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns); +} +void populateIREEMaterializeEncodingIntoPackUnPackPatterns( + RewritePatternSet &patterns, MaterializeEncodingConversionTarget &target, + MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn) { + MLIRContext *context = patterns.getContext(); typeConverter.addConversion( [&typeConverter](IREE::Flow::DispatchTensorType dispatchTensorType) { Type boundType = dispatchTensorType.getBoundType(); @@ -908,20 +949,10 @@ void populateMaterializeEncodingIntoPackUnPackPatterns( return resultType == typeConverter.convertType(resultType); }); - // Add all patterns for converting from encoded type to the materialized - // type. 
- patterns.insert, - MaterializeDPSOperation, - MaterializeOperation, - MaterializeContractionOp, SetEncodingOpToPackOpConversion, - UnsetEncodingOpToUnPackOpConversion>( - patterns.getContext(), typeConverter, materializeEncodingValueFn); - memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns); - patterns.insert( context, typeConverter, materializeEncodingValueFn); -} +}; } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index bb7612e72399..11e8a3d39826 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -524,8 +524,8 @@ int64_t MMAAttr::getBlockSize() const { return 0; } -int64_t MMAAttr::getSubgroupSize() const { - switch (getIntrinsic().getValue()) { +static int64_t getIntrinsicSubgroupSize(MMAIntrinsic intrinsic) { + switch (intrinsic) { case MMAIntrinsic::MFMA_F32_16x16x4_F32: case MMAIntrinsic::MFMA_F32_16x16x16_F16: case MMAIntrinsic::MFMA_I32_16x16x16_I8: @@ -546,6 +546,10 @@ int64_t MMAAttr::getSubgroupSize() const { return 0; } +int64_t MMAAttr::getSubgroupSize() const { + return getIntrinsicSubgroupSize(getIntrinsic().getValue()); +} + MMAAttr::SingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const { switch (getIntrinsic().getValue()) { case MMAIntrinsic::MFMA_F32_16x16x4_F32: { @@ -868,6 +872,33 @@ LogicalResult MMAAttr::materializeOperandConcreteShape( return success(); } +//===----------------------------------------------------------------------===// +// DataTiledMMA Attributes +//===----------------------------------------------------------------------===// + +std::tuple DataTiledMMAAttr::getABCElementTypes() const { + MLIRContext *ctx = getContext(); + auto opaqueLayout = getOpaqueMFMALayout(ctx, getIntrinsic().getValue()); + return {opaqueLayout.aType, opaqueLayout.bType, opaqueLayout.cType}; +} + +std::tuple DataTiledMMAAttr::getMNKShape() const { + MLIRContext *ctx = getContext(); + auto opaqueLayout = getOpaqueMFMALayout(ctx, getIntrinsic().getValue()); + return {opaqueLayout.mSize * getUnrollM(), opaqueLayout.nSize * getUnrollN(), + opaqueLayout.kSize * getUnrollK()}; +} + +std::tuple +DataTiledMMAAttr::getABCVectorTypes() const { + return MMAAttr::get(getContext(), getIntrinsic().getValue()) + .getABCVectorTypes(); +} + +int64_t DataTiledMMAAttr::getSubgroupSize() const { + return getIntrinsicSubgroupSize(getIntrinsic().getValue()); +} + //===----------------------------------------------------------------------===// // MMA Schedule Attributes //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td index 968fa736ec4d..56ae368b14c6 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td @@ -220,6 +220,53 @@ def IREEGPU_MMAAttr : IREEGPU_MmaVectorLayoutAttr<"MMA", "MMAIntrinsicAttr"> { }]; } +def IREEGPU_DataTiledMMAAttr : + AttrDef +]> { + let mnemonic = "data_tiled_mma_layout"; + let cppNamespace = "::mlir::iree_compiler::IREE::GPU"; + + let description = [{ + This mma variant represents MMA ops with data-tiling details. The + |intrinsic| field specifies which particular MMA intrinsic is targeted by + the data-tiling. 
+ + The tile swizzling already happens, so the attribute does not need to + implement materializeOperandConcreteShape interface method. E.g., if the + target intrinsic is MFMA_F32_16x16x4_F32: + - The inner tile shape of LHS is 4x16. + - The inner tile shape of RHS is 4x16. + - The inner tile shape of ACC is 4x16x4. + + Furthermore, the unrolling and interleaving can be represented with the + attribute. In the concept of data-tiling, we always unroll the parallel + dimensions (i.e., M, N dimensions) to be outermost, and interleave the + unrolled K dimension. I.e., the unrolled K dimension becomes the innermost + dimension. The constraint can be relaxed based on data-tiling needs. The + additional information can be added to `parameters`. + }]; + + let assemblyFormat = "`<` struct(params) `>`"; + + let parameters = (ins + "::mlir::iree_compiler::IREE::GPU::MMAIntrinsicAttr":$intrinsic, + "int64_t":$unroll_m, + "int64_t":$unroll_n, + "int64_t":$unroll_k + ); +} + def IREEGPU_MMAOpsArrayAttr : ArrayOfAttr< IREEGPU_Dialect, "MMAOpsArray", "mma_ops", "MMAAttr"> { let cppNamespace = "::mlir::iree_compiler::IREE::GPU"; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp index 4fe1eab9f986..780fc0920186 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp @@ -245,8 +245,13 @@ LogicalResult MultiMmaOp::verify() { int64_t accInnerElementCount = multiplyAcc(getAccInnerShape()); auto [m, n, k] = getKind().getMNKShape(); - if (m * k != lhsInnerElementCount || n * k != rhsInnerElementCount || - m * n != accInnerElementCount) { + int64_t expectedNumLhsElem = m * k; + int64_t expectedNumRhsElem = n * k; + int64_t expectedNumAccElem = m * n; + + if (expectedNumLhsElem != lhsInnerElementCount || + expectedNumRhsElem != rhsInnerElementCount || + expectedNumAccElem != accInnerElementCount) { auto [lhsThreadType, rhsThreadType, accThreadType] = getKind().getABCVectorTypes(); int64_t lhsThreadElementCount = multiplyAcc(lhsThreadType.getShape()); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir index d1003069e94a..046a3c88abbc 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir @@ -27,6 +27,33 @@ module { // CHECK-LABEL: func @test_wmma_f16_16x16x16_f32 // CHECK-SAME: mma_types = #iree_gpu.mma_layout +module { + func.func @test_data_tiled_mfma_f32_16x16x4_f32() attributes { + mma_types = #iree_gpu.data_tiled_mma_layout} { + return + } +} +// CHECK-LABEL: func @test_data_tiled_mfma_f32_16x16x4_f32 +// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout + +module { + func.func @test_data_tiled_mfma_f32_16x16x16_f16() attributes { + mma_types = #iree_gpu.data_tiled_mma_layout} { + return + } +} +// CHECK-LABEL: func @test_data_tiled_mfma_f32_16x16x16_f16 +// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout + +module { + func.func @test_data_tiled_mfma_i32_16x16x32_i8() attributes { + mma_types = #iree_gpu.data_tiled_mma_layout} { + return + } +} +// CHECK-LABEL: func @test_data_tiled_mfma_i32_16x16x32_i8 +// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout + module { func.func @test_any_lowering_config() attributes { lowering_config = #iree_gpu.lowering_config<{workgroup = [16, 16], 
thread = [0, 4]}>} { diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir index ca1aed161afb..e70664547753 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir @@ -204,6 +204,60 @@ func.func @tensor_subgroup_matmul_transpose_b_32x32x8_multi_mma( // ----- +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +func.func @data_tiled_1x1x1_tensor_multi_mma(%lhs: tensor, %rhs: tensor, %acc: tensor) -> tensor { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.data_tiled_mma_layout + } : tensor, tensor into tensor + return %0 : tensor +} + +// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func @data_tiled_1x1x1_tensor_multi_mma +// CHECK: iree_gpu.multi_mma %arg0, %arg1, %arg2 +// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] +// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK-SAME: : tensor, tensor into tensor + +// ----- + +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +func.func @data_tiled_2x2x4_tensor_multi_mma(%lhs: tensor, %rhs: tensor, %acc: tensor) -> tensor { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.data_tiled_mma_layout + } : tensor, tensor into tensor + return %0 : tensor +} + +// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func @data_tiled_2x2x4_tensor_multi_mma +// CHECK: iree_gpu.multi_mma %arg0, %arg1, %arg2 +// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] +// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK-SAME: : tensor, tensor into tensor + +// ----- + func.func @tensor_barrier(%input: tensor) -> tensor { %out = iree_gpu.value_barrier %input : tensor return %out : tensor diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp index 3e52cef7c16d..d085849d86a0 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp @@ -959,7 +959,7 @@ bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr) { // GPU Target Information //===----------------------------------------------------------------------===// -static IREE::GPU::TargetAttr getCLGPUTarget(MLIRContext *context) { +IREE::GPU::TargetAttr getCLGPUTarget(MLIRContext *context) { if (clTestTarget.empty()) return nullptr; diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h 
b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h index 7cbddf4e79bc..3da1ec775f94 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h @@ -174,6 +174,10 @@ FailureOr getSupportedMmaTypes(DictionaryAttr config); FailureOr getSupportedMmaTypes(mlir::FunctionOpInterface entryPoint); +/// Returns the GPU target attribute from `iree-gpu-test-target` if provided. +/// Returns null TargetAttr otherwise. +IREE::GPU::TargetAttr getCLGPUTarget(MLIRContext *context); + /// Returns the GPU target attribute from executable |target| if found. /// Returns null TargetAttr otherwise. IREE::GPU::TargetAttr getGPUTargetAttr(IREE::HAL::ExecutableTargetAttr target);
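// Note (illustrative sketch, not part of this patch): the two declarations
// above can be combined to resolve a GPU target. The `resolveGPUTarget` name
// and the fallback order shown here are assumptions; only the behavior of
// getCLGPUTarget returning null when `iree-gpu-test-target` is unset is taken
// from the GPUUtils.cpp hunk above.
static IREE::GPU::TargetAttr
resolveGPUTarget(MLIRContext *context,
                 IREE::HAL::ExecutableTargetAttr execTarget) {
  // Prefer the command-line test target when it is provided.
  if (IREE::GPU::TargetAttr clTarget = getCLGPUTarget(context))
    return clTarget;
  // Otherwise read the target from the executable target attribute.
  return getGPUTargetAttr(execTarget);
}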