GPU data tiling changes from `shared/gpu-data-tiling-materialize-enco…

…ding` (#18492) This PR is a squashed rebasing of https://github.com/iree-org/iree/tree/shared/gpu-data-tiling-materialize-encoding . This squashes together commits by @hanhanW , @lialan and myself. Here are all the commits: 40258db...shared/gpu-data-tiling-materialize-encoding The intent is to carry on on `main` branch. The motivation is to pick up the recent TileAndFuse pipeline. --------- Signed-off-by: hanhanW <hanhan0912@gmail.com> Signed-off-by: Alan Li <me@alanli.org> Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com> Co-authored-by: hanhanW <hanhan0912@gmail.com> Co-authored-by: Alan Li <me@alanli.org>
iree-org · Sep 11, 2024 · 4395c11 · 4395c11
1 parent bb82e78
commit 4395c11
Show file tree

Hide file tree

Showing 19 changed files with 1,371 additions and 48 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp
@@ -464,9 +464,11 @@ materializeFuncOpEncodings(FunctionOpInterface funcOp,
                                                  targetAttr);
   MaterializeEncodingConversionTarget target(*funcOp.getContext());
   auto materializeEncodingValueFn = getMaterializeEncodingValueFn(targetAttr);
-  populateMaterializeEncodingIntoPackUnPackPatterns(materializeEncodingPattern,
-                                                    target, typeConverter,
-                                                    materializeEncodingValueFn);
+  populateMaterializeEncodingIntoPackUnPackPatterns(
+      materializeEncodingPattern, typeConverter, materializeEncodingValueFn);
+  populateIREEMaterializeEncodingIntoPackUnPackPatterns(
+      materializeEncodingPattern, target, typeConverter,
+      materializeEncodingValueFn);
 
   if (failed(applyPartialConversion(funcOp, target,
                                     std::move(materializeEncodingPattern)))) {

diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
@@ -117,9 +117,29 @@ MaterializeEncodingTypeConverter::MaterializeEncodingTypeConverter(
     if (failed(maybeEncodingInfo)) {
       return dropEncoding(type);
     }
-    return cast<RankedTensorType>(tensor::PackOp::inferPackedType(
+    auto encodingInfo = *maybeEncodingInfo;
+    auto packedType = cast<RankedTensorType>(tensor::PackOp::inferPackedType(
         tensorType, maybeEncodingInfo->innerTileSizes,
         maybeEncodingInfo->innerDimsPos, maybeEncodingInfo->outerDimsPerm));
+
+    // There is no swizzle, we are already done. Typically the case on CPU.
+    if (!encodingInfo.swizzle) {
+      return packedType;
+    }
+
+    // There is a swizzle, we need to handle it. Typically the case on GPU.
+    auto swizzle = *encodingInfo.swizzle;
+    SmallVector<int64_t> newShape(
+        packedType.getShape().drop_back(encodingInfo.innerTileSizes.size()));
+    SmallVector<int64_t> swizzledTileShape;
+    for (auto expandedDimShape : swizzle.expandShape) {
+      for (int64_t d : expandedDimShape) {
+        swizzledTileShape.push_back(d);
+      }
+    }
+    applyPermutationToVector(swizzledTileShape, swizzle.permutation);
+    newShape.append(swizzledTileShape);
+    return RankedTensorType::get(newShape, packedType.getElementType());
   });
 }
 
@@ -143,19 +163,6 @@ MaterializeEncodingConversionTarget::MaterializeEncodingConversionTarget(
   });
 }
 
-RankedTensorType getOriginalTypeWithEncoding(RankedTensorType type) {
-  auto encoding = getEncodingAttr(type);
-  if (!encoding) {
-    return type;
-  }
-  RankedTensorType originalType = type;
-  if (auto originalTypeAttr = encoding.getOriginalType()) {
-    originalType = cast<RankedTensorType>(originalTypeAttr.getValue());
-  }
-  return RankedTensorType::get(originalType.getShape(),
-                               originalType.getElementType(), encoding);
-}
-
 RankedTensorType dropEncoding(RankedTensorType type) {
   return RankedTensorType::get(type.getShape(), type.getElementType());
 }
@@ -213,4 +220,13 @@ bool isNarrowNResult(EncodingAttr encoding) {
   return narrowN && (!narrowM || narrowM.getInt() > narrowN.getInt());
 }
 
+SmallVector<int64_t>
+getExpandedTileShape(SmallVector<SmallVector<int64_t>> expandShape) {
+  SmallVector<int64_t> result;
+  for (auto expandShapeDim : expandShape) {
+    result.append(expandShapeDim);
+  }
+  return result;
+}
+
 } // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
@@ -9,16 +9,47 @@
 
 #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
 #include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Transforms/DialectConversion.h"
 
 namespace mlir::iree_compiler {
 
-/// Container of information needed to materialize the pack operation.
+/// Container of information needed to materialize the layout transformations.
+///
+/// On CPU, these layout transformations consist of a single `tensor.pack`
+/// or `tensor.unpack` op, implementing a tiled layout where each tile is
+/// row-major.
+///
+/// On GPU, there is an additional `swizzle`, which changes the layout inside
+/// of the tile. See the comment on the nested Swizzle struct.
 struct MaterializeEncodingInfo {
+  // Metadata for a swizzle, that is, an (expand_shape -> transposition)
+  // pair of ops performing a change of layout within the tiles. This is used
+  // on GPU, where the tiles themselves can have an arbitrary layout.
+  struct Swizzle {
+    // This vector-of-vectors contains all the information needed to generate
+    // a `tensor.expand_shape` creating additional internal dimensions into the
+    // tile. For example, expandShape = [[16], [4, 2]] means that the original
+    // tile shape [16, 8] gets expanded such that the first dimension 16 is left
+    // unchanged, and the second dimension 8 gets split into two internal dims
+    // of size 4 and 2.
+    SmallVector<SmallVector<int64_t>> expandShape;
+    // This permutation vector applies to the expanded dimensions and is used
+    // to generate a `linalg.transpose` changing the layout of the tile. For
+    // example, permutation[0] dictates which of the expanded dimensions becomes
+    // the leading dimension of the layout.
+    SmallVector<int64_t> permutation;
+  };
+
+  // The next 3 fields are used to create a `tensor.pack` or `tensor.unpack` op,
+  // changing the overall layout between row-major and tiled (where each tile is
+  // row-major).
   SmallVector<int64_t> innerDimsPos;
   SmallVector<int64_t> innerTileSizes;
   SmallVector<int64_t> outerDimsPerm;
-  unsigned srcRank = 0;
+
+  // The optional swizzle, see the above comment on Swizzle. Only used on GPU.
+  std::optional<Swizzle> swizzle;
 };
 
 using MaterializeEncodingFn = std::function<FailureOr<MaterializeEncodingInfo>(
@@ -83,9 +114,6 @@ class OpMaterializeEncodingPattern : public OpConversionPattern<OpTy> {
 // Utility methods about Encoding.
 //===---------------------------------------------------------------------===//
 
-/// Returns the original type that carried by encoding.
-RankedTensorType getOriginalTypeWithEncoding(RankedTensorType type);
-
 /// Returns the RankedTensorType without encodings.
 RankedTensorType dropEncoding(RankedTensorType type);
 
@@ -102,7 +130,32 @@ MaterializeEncodingInfo
 getEncodingInfoForMatmul(IREE::Encoding::EncodingAttr encoding, int64_t rank,
                          TileMxNxK tileMxNxK);
 
+/// Utility method to convert from `set_encoding` op to `pack` operation.
+/// For now this takes a `paddingValue` as input. The source is also taken
+/// as input so that these could be used with `OpConversionPatterns`.
+FailureOr<tensor::PackOp> lowerSetEncodingOpToPackOp(
+    RewriterBase &rewriter, IREE::Encoding::SetEncodingOp encodingOp,
+    Value source, const MaterializeEncodingTypeConverter &typeConverter,
+    MaterializeEncodingValueFn materializeEncodingValueFn);
+
+/// Utility method to convert from `unset_encoding` op to `unpack` operation.
+/// The source is taken as input so that these could be used with
+/// `OpConversionPatterns`.
+FailureOr<tensor::UnPackOp> lowerUnsetEncodingToUnpackOp(
+    RewriterBase &rewriter, IREE::Encoding::UnsetEncodingOp encodingOp,
+    Value packedValue, const MaterializeEncodingTypeConverter &typeConverter,
+    MaterializeEncodingValueFn materializeEncodingValueFn);
+
+/// Populates the set of patterns that lowers set_encoding, unset_encoding, and
+/// upstream dialect ops with encoding types to pack/unpack ops.
 void populateMaterializeEncodingIntoPackUnPackPatterns(
+    RewritePatternSet &patterns,
+    MaterializeEncodingTypeConverter &typeConverter,
+    MaterializeEncodingValueFn materializeEncodingValueFn);
+
+/// Populates the set of patterns that lowers IREE dialect (e.g., Flow, HAL,
+/// etc) ops with encoding types to pack/unpack ops.
+void populateIREEMaterializeEncodingIntoPackUnPackPatterns(
     RewritePatternSet &patterns, MaterializeEncodingConversionTarget &target,
     MaterializeEncodingTypeConverter &typeConverter,
     MaterializeEncodingValueFn materializeEncodingValueFn);
@@ -111,6 +164,10 @@ void populateMaterializeEncodingIntoPackUnPackPatterns(
 // result of a matvec.
 bool isNarrowNResult(IREE::Encoding::EncodingAttr encoding);
 
+// Concatenates the vectors.
+SmallVector<int64_t>
+getExpandedTileShape(SmallVector<SmallVector<int64_t>> expandShape);
+
 } // namespace mlir::iree_compiler
 
 #endif // IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -61,6 +61,7 @@ iree_compiler_cc_library(
         "GPUGeneralizeNamedOps.cpp",
         "GPUInferMemorySpace.cpp",
         "GPULowerToUKernels.cpp",
+        "GPUMaterializeEncoding.cpp",
         "GPUMultiBuffering.cpp",
         "GPUNestedLayoutDistributionPatterns.cpp",
         "GPUPatterns.cpp",
@@ -98,6 +99,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Codegen/Transforms",
         "//compiler/src/iree/compiler/Codegen/Utils",
         "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils",
+        "//compiler/src/iree/compiler/Dialect/Encoding/IR",
         "//compiler/src/iree/compiler/Dialect/HAL/IR",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AMDGPUDialect",

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
@@ -59,6 +59,7 @@ iree_cc_library(
     "GPUGeneralizeNamedOps.cpp"
     "GPUInferMemorySpace.cpp"
     "GPULowerToUKernels.cpp"
+    "GPUMaterializeEncoding.cpp"
     "GPUMultiBuffering.cpp"
     "GPUNestedLayoutDistributionPatterns.cpp"
     "GPUPatterns.cpp"
@@ -129,6 +130,7 @@ iree_cc_library(
     iree::compiler::Codegen::Transforms
     iree::compiler::Codegen::Utils
     iree::compiler::Codegen::Utils::VectorOpUtils
+    iree::compiler::Dialect::Encoding::IR
     iree::compiler::Dialect::HAL::IR
   PUBLIC
 )