From 4395c118895887b39c0172604e1f691fbe185f0b Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 11 Sep 2024 16:26:48 -0400 Subject: [PATCH] GPU data tiling changes from `shared/gpu-data-tiling-materialize-encoding` (#18492) This PR is a squashed rebasing of https://github.com/iree-org/iree/tree/shared/gpu-data-tiling-materialize-encoding . This squashes together commits by @hanhanW , @lialan and myself. Here are all the commits: https://github.com/iree-org/iree/compare/40258db63fdfd2228adb2fd0baaea53b9e3c9d63...shared/gpu-data-tiling-materialize-encoding The intent is to carry on on `main` branch. The motivation is to pick up the recent TileAndFuse pipeline. --------- Signed-off-by: hanhanW Signed-off-by: Alan Li Signed-off-by: Benoit Jacob Co-authored-by: hanhanW Co-authored-by: Alan Li --- .../Common/CPU/CPUMaterializeEncodings.cpp | 8 +- .../compiler/Codegen/Common/EncodingUtils.cpp | 44 +- .../compiler/Codegen/Common/EncodingUtils.h | 67 +- .../compiler/Codegen/Common/GPU/BUILD.bazel | 2 + .../Codegen/Common/GPU/CMakeLists.txt | 2 + .../Common/GPU/GPUMaterializeEncoding.cpp | 645 ++++++++++++++++++ .../compiler/Codegen/Common/GPU/Passes.td | 5 + .../Codegen/Common/GPU/test/BUILD.bazel | 1 + .../Codegen/Common/GPU/test/CMakeLists.txt | 1 + .../GPU/test/gpu_materialize_encoding.mlir | 391 +++++++++++ .../Common/MaterializeEncodingIntoNop.cpp | 2 + .../MaterializeEncodingIntoPackUnPack.cpp | 73 +- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp | 35 +- .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.td | 47 ++ .../Codegen/Dialect/GPU/IR/IREEGPUOps.cpp | 9 +- .../Dialect/GPU/IR/test/iree_gpu_attrs.mlir | 27 + .../Dialect/GPU/IR/test/iree_gpu_ops.mlir | 54 ++ .../iree/compiler/Codegen/Utils/GPUUtils.cpp | 2 +- .../iree/compiler/Codegen/Utils/GPUUtils.h | 4 + 19 files changed, 1371 insertions(+), 48 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp create mode 100644 compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp index 894f03053da1..21f988fe359e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodings.cpp @@ -464,9 +464,11 @@ materializeFuncOpEncodings(FunctionOpInterface funcOp, targetAttr); MaterializeEncodingConversionTarget target(*funcOp.getContext()); auto materializeEncodingValueFn = getMaterializeEncodingValueFn(targetAttr); - populateMaterializeEncodingIntoPackUnPackPatterns(materializeEncodingPattern, - target, typeConverter, - materializeEncodingValueFn); + populateMaterializeEncodingIntoPackUnPackPatterns( + materializeEncodingPattern, typeConverter, materializeEncodingValueFn); + populateIREEMaterializeEncodingIntoPackUnPackPatterns( + materializeEncodingPattern, target, typeConverter, + materializeEncodingValueFn); if (failed(applyPartialConversion(funcOp, target, std::move(materializeEncodingPattern)))) { diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp index 378383d9973a..15a99d868d04 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp @@ -117,9 +117,29 @@ MaterializeEncodingTypeConverter::MaterializeEncodingTypeConverter( if (failed(maybeEncodingInfo)) { return 
dropEncoding(type); } - return cast(tensor::PackOp::inferPackedType( + auto encodingInfo = *maybeEncodingInfo; + auto packedType = cast(tensor::PackOp::inferPackedType( tensorType, maybeEncodingInfo->innerTileSizes, maybeEncodingInfo->innerDimsPos, maybeEncodingInfo->outerDimsPerm)); + + // There is no swizzle, we are already done. Typically the case on CPU. + if (!encodingInfo.swizzle) { + return packedType; + } + + // There is a swizzle, we need to handle it. Typically the case on GPU. + auto swizzle = *encodingInfo.swizzle; + SmallVector newShape( + packedType.getShape().drop_back(encodingInfo.innerTileSizes.size())); + SmallVector swizzledTileShape; + for (auto expandedDimShape : swizzle.expandShape) { + for (int64_t d : expandedDimShape) { + swizzledTileShape.push_back(d); + } + } + applyPermutationToVector(swizzledTileShape, swizzle.permutation); + newShape.append(swizzledTileShape); + return RankedTensorType::get(newShape, packedType.getElementType()); }); } @@ -143,19 +163,6 @@ MaterializeEncodingConversionTarget::MaterializeEncodingConversionTarget( }); } -RankedTensorType getOriginalTypeWithEncoding(RankedTensorType type) { - auto encoding = getEncodingAttr(type); - if (!encoding) { - return type; - } - RankedTensorType originalType = type; - if (auto originalTypeAttr = encoding.getOriginalType()) { - originalType = cast(originalTypeAttr.getValue()); - } - return RankedTensorType::get(originalType.getShape(), - originalType.getElementType(), encoding); -} - RankedTensorType dropEncoding(RankedTensorType type) { return RankedTensorType::get(type.getShape(), type.getElementType()); } @@ -213,4 +220,13 @@ bool isNarrowNResult(EncodingAttr encoding) { return narrowN && (!narrowM || narrowM.getInt() > narrowN.getInt()); } +SmallVector +getExpandedTileShape(SmallVector> expandShape) { + SmallVector result; + for (auto expandShapeDim : expandShape) { + result.append(expandShapeDim); + } + return result; +} + } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h index 9a2aafa5fe0c..d12a44fe8626 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h @@ -9,16 +9,47 @@ #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir::iree_compiler { -/// Container of information needed to materialize the pack operation. +/// Container of information needed to materialize the layout transformations. +/// +/// On CPU, these layout transformations consist of a single `temsor.pack` +/// or `tensor.unpack` op, implementing a tiled layout where each tile is +/// row-major. +/// +/// On GPU, there is an additional `swizzle`, which changes the layout inside +/// of the tile. See the comment on the nested Swizzle struct. struct MaterializeEncodingInfo { + // Metadata for a swizzle, that is, an (expand_shape -> transposition) + // pair of ops performing a change of layout within the tiles. This is used + // on GPU, where the tiles themselves can have an arbitrary layout. + struct Swizzle { + // This vector-of-vectors contains all the information needed to generate + // a `tensor.expand_shape` creating additional internal dimensions into the + // tile. 
For example, expandShape = [[16], [4, 2]] means that the original + // tile shape [16, 8] gets expanded such that the first dimension 16 is left + // unchanged, and the second dimension 8 gets split into two internal dims + // of size 4 and 2. + SmallVector> expandShape; + // This permutation vector applies to the expanded dimensions and is used + // to generate a `linalg.transpose` changing the layout of the tile. For + // example, permutation[0] dictates which of the expanded dimensions becomes + // the leading dimension of the layout. + SmallVector permutation; + }; + + // The next 3 fields are used to create a `tensor.pack` or `tensor.unpack` op, + // changing the overall layout between row-major and tiled (where each tile is + // row-major). SmallVector innerDimsPos; SmallVector innerTileSizes; SmallVector outerDimsPerm; - unsigned srcRank = 0; + + // The optional swizzle, see the above comment on Swizzle. Only used on GPU. + std::optional swizzle; }; using MaterializeEncodingFn = std::function( @@ -83,9 +114,6 @@ class OpMaterializeEncodingPattern : public OpConversionPattern { // Utility methods about Encoding. //===---------------------------------------------------------------------===// -/// Returns the original type that carried by encoding. -RankedTensorType getOriginalTypeWithEncoding(RankedTensorType type); - /// Returns the RankedTensorType without encodings. RankedTensorType dropEncoding(RankedTensorType type); @@ -102,7 +130,32 @@ MaterializeEncodingInfo getEncodingInfoForMatmul(IREE::Encoding::EncodingAttr encoding, int64_t rank, TileMxNxK tileMxNxK); +/// Utility method to convert from `set_encoding` op to `pack` operation. +/// For now this takes a `paddingValue` as input. The source is also taken +/// as input so that these could be used with `OpConversionPatterns`. +FailureOr lowerSetEncodingOpToPackOp( + RewriterBase &rewriter, IREE::Encoding::SetEncodingOp encodingOp, + Value source, const MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn); + +/// Utility method to convert from `unset_encoding` op to `unpack` operation. +/// The source is taken as input so that these could be used with +/// `OpConversionPatterns`. +FailureOr lowerUnsetEncodingToUnpackOp( + RewriterBase &rewriter, IREE::Encoding::UnsetEncodingOp encodingOp, + Value packedValue, const MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn); + +/// Pouplates the set of patterns that lowers set_encoding, unset_encoding, and +/// upstream dialect ops with encoding types to pack/unpack ops. void populateMaterializeEncodingIntoPackUnPackPatterns( + RewritePatternSet &patterns, + MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn); + +/// Pouplates the set of patterns that lowers IREE dialect (e.g., Flow, Hal, +/// etc) ops with encoding types to pack/unpack ops. +void populateIREEMaterializeEncodingIntoPackUnPackPatterns( RewritePatternSet &patterns, MaterializeEncodingConversionTarget &target, MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn); @@ -111,6 +164,10 @@ void populateMaterializeEncodingIntoPackUnPackPatterns( // result of a matvec. bool isNarrowNResult(IREE::Encoding::EncodingAttr encoding); +// Concatenates the vectors. 
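+// For example, expandShape = [[16], [4, 2]] yields [16, 4, 2].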
+SmallVector +getExpandedTileShape(SmallVector> expandShape); + } // namespace mlir::iree_compiler #endif // IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_ diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel index 1799a0987b6b..39543637e60e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel @@ -61,6 +61,7 @@ iree_compiler_cc_library( "GPUGeneralizeNamedOps.cpp", "GPUInferMemorySpace.cpp", "GPULowerToUKernels.cpp", + "GPUMaterializeEncoding.cpp", "GPUMultiBuffering.cpp", "GPUNestedLayoutDistributionPatterns.cpp", "GPUPatterns.cpp", @@ -98,6 +99,7 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils", + "//compiler/src/iree/compiler/Dialect/Encoding/IR", "//compiler/src/iree/compiler/Dialect/HAL/IR", "@llvm-project//llvm:Support", "@llvm-project//mlir:AMDGPUDialect", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt index 4adc4cd165b8..b14f0d5fb633 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt @@ -59,6 +59,7 @@ iree_cc_library( "GPUGeneralizeNamedOps.cpp" "GPUInferMemorySpace.cpp" "GPULowerToUKernels.cpp" + "GPUMaterializeEncoding.cpp" "GPUMultiBuffering.cpp" "GPUNestedLayoutDistributionPatterns.cpp" "GPUPatterns.cpp" @@ -129,6 +130,7 @@ iree_cc_library( iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils iree::compiler::Codegen::Utils::VectorOpUtils + iree::compiler::Dialect::Encoding::IR iree::compiler::Dialect::HAL::IR PUBLIC ) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp new file mode 100644 index 000000000000..2d8aa4b67ca6 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp @@ -0,0 +1,645 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/compiler/Codegen/Common/EncodingUtils.h" +#include "iree/compiler/Codegen/Common/GPU/Passes.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h" +#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h" +#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h" +#include "iree/compiler/Codegen/Utils/GPUUtils.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingDialect.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" +#include "iree/compiler/Dialect/HAL/IR/HALTypes.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/MemRef/Transforms/Transforms.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/Transforms/Transforms.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/ReshapeOpsUtils.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "iree-codegen-gpu-materialize-encoding" + +namespace mlir::iree_compiler { + +#define GEN_PASS_DEF_GPUMATERIALIZEDEVICEENCODINGPASS +#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc" + +// Returns the swizzle for a given intrinsic and operand index. +// See the comment on MaterializeEncodingInfo::Swizzle for what that means. +// This function is concerned with a single intrinsic, not a whole kernel tile. +// TODO(bjacob): derive this automatically from the intrinsic layout getters. +static MaterializeEncodingInfo::Swizzle +getIntrinsicSwizzle(IREE::GPU::MMAIntrinsic mma, int operandIdx) { + switch (mma) { + case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32: + if (operandIdx == 2) { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}}, + /*permutation=*/{0, 2, 1}}; + } else { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4}}, + /*permutation=*/{1, 0}}; + } + case IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16: + if (operandIdx == 2) { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}}, + /*permutation=*/{0, 2, 1}}; + } else { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4, 4}}, + /*permutation=*/{1, 0, 2}}; + } + case IREE::GPU::MMAIntrinsic::MFMA_I32_16x16x32_I8: + if (operandIdx == 2) { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{4, 4}, {16}}, + /*permutation=*/{0, 2, 1}}; + } else { + return MaterializeEncodingInfo::Swizzle{/*expandShape=*/{{16}, {4, 8}}, + /*permutation=*/{1, 0, 2}}; + } + default: + assert(false && "should not get here."); + return {}; + } +} + +// Given an `expandShape` vector-of-vectors describing the mapping from source +// dimensions to expanded dimensions, returns the index of the first expanded +// dimension corresponding to the given source dimension index. +static int64_t +getExpandedDimFirstIdx(const SmallVector> &expandShape, + int64_t srcIndex) { + int dstIndexFirst = 0; + for (int i = 0; i < srcIndex; ++i) { + dstIndexFirst += expandShape[i].size(); + } + return dstIndexFirst; +} + +// Unroll the dimension given by `srcIndex` by the given `unrollFactor`. 
+// This is not interleaving layouts. The layout will consist of multiple copies +// of the input tile, side by side. +// +// Example: +// Input swizzle = { expandShape = [[16], [4]], permutation = [1, 0] } +// Input srcIndex = 1 +// Input unrollFactor = 4 +// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] } +// +static void unroll(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex, + int unrollFactor) { + assert(unrollFactor > 1); + int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex); + + // The new unrolling dimension is inserted at the start of the expandShape + // dimensions group corresponding to srcIndex. + swizzle.expandShape[srcIndex].insert(swizzle.expandShape[srcIndex].begin(), + unrollFactor); + // Since we are not interleaving here, generating side-by-side copies of the + // original layout, the new unrolling dimension is the new outermost + // dimension. Existing entries get shifted to make room for it. + for (auto &p : swizzle.permutation) { + p += (p >= dstIndexFirst); + } + swizzle.permutation.insert(swizzle.permutation.begin(), dstIndexFirst); +} + +// Interleave the layout in `swizzle` by mutating `swizzle.permutation` to +// move permutation[0], the outer-most dimension (which the unroll() function +// created to be the unrolling dimension), to the inner dimension given by +// `expandedDimIndexToInterleaveAt`. +// +// Example: +// Input swizzle = { expandShape = [[16], [4, 4]], permutation = [1, 2, 0] } +// Input srcIndex = 1 +// Input expandedDimIndexToInterleaveAt = 1 +// -> Output swizzle = { expandShape = [[16], [4, 4]], permutation = [2, 0, 1] } +// +static void interleave(MaterializeEncodingInfo::Swizzle &swizzle, int srcIndex, + int expandedDimIndexToInterleaveAt) { + // Compute which inner dimension to permute the current outer dimension into. + int dstIndexFirst = getExpandedDimFirstIdx(swizzle.expandShape, srcIndex); + int dstIndexToInterleaveAt = dstIndexFirst + expandedDimIndexToInterleaveAt; + + SmallVector outPermutation(swizzle.permutation.size()); + // The leading dimension, permutation[0], gets moved inwards to the + // position that we just computed, dstIndexToInterleaveAt. + outPermutation[dstIndexToInterleaveAt] = swizzle.permutation[0]; + // Outer dimensions get shifted outwards to fill the gap. + for (int i = 0; i < dstIndexToInterleaveAt; ++i) { + outPermutation[i] = swizzle.permutation[i + 1]; + } + // Inner dimensions don't change. That is to say that we only interleave + // at `targetInterleavedElements` granularity, we don't swizzle further + // internally to that. + for (int i = dstIndexToInterleaveAt + 1; i < outPermutation.size(); ++i) { + outPermutation[i] = swizzle.permutation[i]; + } + swizzle.permutation = outPermutation; +} + +// Returns the index of the dimension whose flattened size (flattening inner +// dimensions into it) matches the given `targetSize`. This is used to compute +// interleaving indices. +// +// Example: +// Input shape = [16, 8, 4, 4] +// Input targetSize = 16 +// -> Return 2, because the tail of the shape starting at index 2 is [4, 4], +// whose product equals targetSize. 
+static int64_t getDimIdxForTargetSize(const SmallVector &shape, + int64_t targetSize) { + int interleaveAt = 0; + int size = 1; + for (interleaveAt = shape.size() - 1; interleaveAt >= 0; --interleaveAt) { + assert(size <= targetSize); + assert((targetSize % size) == 0); + if (size == targetSize) { + break; + } + size *= shape[interleaveAt]; + } + return interleaveAt; +} + +// Generates the swizzle for the full data-tiled-mma tile, including all the +// relevant unrolling factors. +static MaterializeEncodingInfo::Swizzle +getSwizzle(IREE::GPU::DataTiledMMAAttr mma, int operandIdx) { + auto [AType, BType, CType] = mma.getABCElementTypes(); + int ABits = AType.getIntOrFloatBitWidth(); + int BBits = BType.getIntOrFloatBitWidth(); + // TODO(bjacob): Should be looked up from GPU target, instead of hard-coded. + const int targetPreferredLoadBitWidth = 128; + auto swizzle = getIntrinsicSwizzle(mma.getIntrinsic().getValue(), operandIdx); + switch (operandIdx) { + case 0: + // A-matrix (LHS). Source dimensions are M (index 0) and K (index 1). + // Unroll on K with interleaving, then on M. + if (mma.getUnrollK() > 1) { + unroll(swizzle, 1, mma.getUnrollK()); + int interleavingIdx = getDimIdxForTargetSize( + swizzle.expandShape[1], + targetPreferredLoadBitWidth / (mma.getUnrollK() * ABits)); + interleave(swizzle, 1, interleavingIdx); + } + if (mma.getUnrollM() > 1) { + unroll(swizzle, 0, mma.getUnrollM()); + } + break; + case 1: + // B-matrix (RHS). Since the pack ops already took care of transposing B, + // source dimensions are N (index 0) and K (index 1). + // Unroll on K with interleaving, then on N. + if (mma.getUnrollK() > 1) { + unroll(swizzle, 1, mma.getUnrollK()); + int interleavingIdx = getDimIdxForTargetSize( + swizzle.expandShape[1], + targetPreferredLoadBitWidth / (mma.getUnrollK() * BBits)); + interleave(swizzle, 1, interleavingIdx); + } + if (mma.getUnrollN() > 1) { + unroll(swizzle, 0, mma.getUnrollN()); + } + break; + case 2: + // C-matrix (accumulator). Source dimensions are M (index 0) and N (index + // 1). Unroll on N, then on M. 
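+    // For example, MFMA_F32_16x16x4_F32 with unroll M=8 and N=8 ends up with
+    // expandShape = [[8, 4, 4], [8, 16]] and permutation = [0, 3, 1, 4, 2],
+    // i.e. a 128x128 accumulator tile, as exercised by the ACC cases in
+    // gpu_materialize_encoding.mlir.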
+ if (mma.getUnrollN() > 1) { + unroll(swizzle, 1, mma.getUnrollN()); + } + if (mma.getUnrollM() > 1) { + unroll(swizzle, 0, mma.getUnrollM()); + } + break; + } + return swizzle; +} + +static bool hasIntrinsic(IREE::GPU::TargetAttr target, + IREE::GPU::MMAIntrinsic intrinsic) { + for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { + if (mma.getIntrinsic().getValue() == intrinsic) { + return true; + } + } + return false; +} + +static std::optional +chooseDataTiledMMAAttr(TypeRange elementTypes, IREE::GPU::TargetAttr target) { + assert(elementTypes.size() == 3); + using namespace IREE::GPU; + MLIRContext *ctx = target.getContext(); + Type lhs = elementTypes[0]; + Type rhs = elementTypes[1]; + Type out = elementTypes[2]; + auto match = [=](MMAIntrinsic intrinsic, int unrollM, int unrollN, + int unrollK) -> std::optional { + if (!hasIntrinsic(target, intrinsic)) { + return std::nullopt; + } + auto candidate = DataTiledMMAAttr::get( + ctx, MMAIntrinsicAttr::get(ctx, intrinsic), unrollM, unrollN, unrollK); + auto [candidateLhs, candidateRhs, candidateOut] = + candidate.getABCElementTypes(); + if (candidateLhs != lhs || candidateRhs != rhs || candidateOut != out) { + return std::nullopt; + } + return candidate; + }; + if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x4_F32, 8, 8, 4)) { + return m; + } + if (auto m = match(MMAIntrinsic::MFMA_F32_16x16x16_F16, 8, 8, 2)) { + return m; + } + if (auto m = match(MMAIntrinsic::MFMA_I32_16x16x32_I8, 8, 8, 2)) { + return m; + } + // Fallback - no architecture-optimized tile size for this case. + return std::nullopt; +} + +static FailureOr +materializeEncodingForTarget(RankedTensorType tensorType, + IREE::HAL::ExecutableTargetAttr targetAttr) { + auto encoding = + dyn_cast_or_null(tensorType.getEncoding()); + if (!encoding) { + return failure(); + } + // We only know about contractions with {Batch, M, N, K} <= 1 at the moment. + auto cDims = getEncodingContractionDims(encoding); + if (failed(cDims) || cDims->batch.size() > 1 || cDims->m.size() > 1 || + cDims->n.size() > 1 || cDims->k.size() > 1) { + return failure(); + } + + // Enumerate available tile shapes for the given encoding and target. + IREE::GPU::TargetAttr gpuTargetAttr; + if (targetAttr) { + gpuTargetAttr = getGPUTargetAttr(targetAttr); + } else { + gpuTargetAttr = getCLGPUTarget(tensorType.getContext()); + } + auto elementTypes = llvm::to_vector( + llvm::map_range(encoding.getElementTypes().getValue(), [](Attribute a) { + return cast(a).getValue(); + })); + std::optional mma = + chooseDataTiledMMAAttr(elementTypes, gpuTargetAttr); + if (!mma) { + return failure(); + } + + // Map the matmul TileMxNxK to an actual tile shape for the tensor at hand, + // based on its operand index in the matmul. + // TODO: Support unrolling. 
+ auto rank = tensorType.getRank(); + TileMxNxK innerTile; + std::tie(innerTile.M, innerTile.N, innerTile.K) = mma->getMNKShape(); + auto encodingInfo = getEncodingInfoForMatmul(encoding, rank, innerTile); + auto operandIdx = encoding.getOperandIndex().getInt(); + encodingInfo.swizzle = getSwizzle(*mma, operandIdx); + return encodingInfo; +} + +namespace { +struct GPUMaterializeDeviceEncodingPass final + : impl::GPUMaterializeDeviceEncodingPassBase< + GPUMaterializeDeviceEncodingPass> { + using GPUMaterializeDeviceEncodingPassBase:: + GPUMaterializeDeviceEncodingPassBase; + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +SmallVector +getReassociationIndices(int outerDims, + SmallVector> expandShape) { + SmallVector result; + int expandedIdx = 0; + for (int i = 0; i < outerDims; ++i) { + result.push_back({expandedIdx++}); + } + for (auto expandShapeDim : expandShape) { + result.push_back({}); + for (int64_t d : expandShapeDim) { + (void)d; + result.back().push_back(expandedIdx++); + } + } + return result; +} + +/// Convert iree_linalg_ext.set_encoding op to pack + tile swizzling ops. We use +/// expand_shape + linalg.transpose to represent a tile swizzling op. +struct GPUSetEncodingOpLoweringConversion + : public OpMaterializeEncodingPattern { + using OpMaterializeEncodingPattern< + IREE::Encoding::SetEncodingOp>::OpMaterializeEncodingPattern; + + LogicalResult + matchAndRewrite(IREE::Encoding::SetEncodingOp encodingOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto converter = static_cast( + getTypeConverter()); + auto packOp = lowerSetEncodingOpToPackOp(rewriter, encodingOp, + adaptor.getSource(), *converter, + this->materializeEncodingValueFn); + if (failed(packOp)) { + Value result = adaptor.getSource(); + Type targetType = + getTypeConverter()->convertType(encodingOp.getResultType()); + if (targetType != result.getType()) { + result = rewriter.create(encodingOp.getLoc(), + targetType, result); + } + rewriter.replaceOp(encodingOp, result); + return success(); + } + + FailureOr maybeEncodingInfo = + converter->getEncodingInfo(encodingOp.getResultType()); + if (failed(maybeEncodingInfo)) { + return rewriter.notifyMatchFailure(encodingOp, + "unhandled result encoding"); + } + if (!maybeEncodingInfo->swizzle) { + rewriter.replaceOp(encodingOp, packOp->getResult()); + return success(); + } + SmallVector innerTiles = maybeEncodingInfo->innerTileSizes; + + // TODO(hanchung): Add a util to the encoding attribute, so we don't need + // the map_to_vector method here. + auto loc = encodingOp.getLoc(); + + // Create expand_shape op to tile the innermost two dimensions. 
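+    // For example, for the LHS of MFMA_F32_16x16x4_F32 with unroll M=8, K=4
+    // (see gpu_materialize_encoding.mlir), a tensor<255x513xf32> packs to
+    // tensor<2x33x128x16xf32>, expands to tensor<2x33x8x16x4x4xf32>, and is
+    // then transposed with permutation [0, 1, 2, 5, 3, 4] (outer dims kept,
+    // swizzle permutation shifted by the source rank) into
+    // tensor<2x33x8x4x16x4xf32>.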
+ int origRank = encodingOp.getSourceType().getRank(); + SmallVector expandShapeShape(packOp->getDestType().getShape()); + expandShapeShape.truncate(origRank); + expandShapeShape.append( + getExpandedTileShape(maybeEncodingInfo->swizzle->expandShape)); + + auto expandShapeType = RankedTensorType::get( + expandShapeShape, encodingOp.getSourceType().getElementType()); + + SmallVector reassociation = getReassociationIndices( + origRank, maybeEncodingInfo->swizzle->expandShape); + auto expandShapeOp = rewriter.create( + loc, expandShapeType, packOp->getResult(), reassociation); + + // create linalg.transpose on expandShapeShape + + SmallVector transposePerm; + for (int i = 0; i < origRank; ++i) { + transposePerm.push_back(i); + } + for (auto perm : maybeEncodingInfo->swizzle->permutation) { + transposePerm.push_back(origRank + perm); + } + SmallVector transposeResultDims = expandShapeShape; + applyPermutationToVector(transposeResultDims, transposePerm); + + auto emptyTensor = rewriter.create( + loc, transposeResultDims, encodingOp.getSourceType().getElementType()); + auto transposeOp = rewriter.create( + loc, expandShapeOp, emptyTensor, transposePerm); + rewriter.replaceOp(encodingOp, transposeOp->getResult(0)); + + return success(); + } +}; + +struct GPUUnsetEncodingOpLoweringConversion + : public OpMaterializeEncodingPattern { + using OpMaterializeEncodingPattern< + IREE::Encoding::UnsetEncodingOp>::OpMaterializeEncodingPattern; + + LogicalResult + matchAndRewrite(IREE::Encoding::UnsetEncodingOp unsetEncodingOp, + OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto converter = static_cast( + getTypeConverter()); + + Location loc = unsetEncodingOp.getLoc(); + + FailureOr maybeEncodingInfo = + converter->getEncodingInfo(unsetEncodingOp.getSource().getType()); + if (failed(maybeEncodingInfo)) { + return rewriter.notifyMatchFailure(unsetEncodingOp, + "unhandled result encoding"); + } + Value unpackSrc = adaptor.getSource(); + if (maybeEncodingInfo->swizzle) { + SmallVector innerTiles = maybeEncodingInfo->innerTileSizes; + + int targetRank = unsetEncodingOp.getResultType().getRank(); + auto srcConvertedType = + cast(adaptor.getSource().getType()); + SmallVector expandShapeShape(srcConvertedType.getShape()); + expandShapeShape.truncate(targetRank); + expandShapeShape.append( + getExpandedTileShape(maybeEncodingInfo->swizzle->expandShape)); + + SmallVector transposePerm; + for (int i = 0; i < targetRank; ++i) { + transposePerm.push_back(i); + } + for (auto perm : maybeEncodingInfo->swizzle->permutation) { + transposePerm.push_back(targetRank + perm); + } + SmallVector expandShapeResultDims = expandShapeShape; + applyPermutationToVector(expandShapeResultDims, transposePerm); + auto invertedTransposePerm = invertPermutationVector(transposePerm); + + auto emptyTensor = rewriter.create( + loc, expandShapeShape, + unsetEncodingOp.getSourceType().getElementType()); + auto transposeOp = rewriter.create( + loc, adaptor.getSource(), emptyTensor, invertedTransposePerm); + + SmallVector reassociation = getReassociationIndices( + targetRank, maybeEncodingInfo->swizzle->expandShape); + SmallVector unpackSrcShape( + srcConvertedType.getShape().take_front(targetRank)); + unpackSrcShape.append(maybeEncodingInfo->innerTileSizes.begin(), + maybeEncodingInfo->innerTileSizes.end()); + auto unpackSrcType = RankedTensorType::get( + unpackSrcShape, unsetEncodingOp.getSourceType().getElementType()); + unpackSrc = rewriter.create( + loc, unpackSrcType, transposeOp->getResult(0), 
reassociation); + } + + auto unPackOp = lowerUnsetEncodingToUnpackOp( + rewriter, unsetEncodingOp, unpackSrc, *converter, + this->materializeEncodingValueFn); + if (failed(unPackOp)) { + Value result = adaptor.getSource(); + Type targetType = + getTypeConverter()->convertType(unsetEncodingOp.getResultType()); + if (targetType != result.getType()) { + result = rewriter.create(unsetEncodingOp.getLoc(), + targetType, result); + } + rewriter.replaceOp(unsetEncodingOp, result); + return success(); + } + rewriter.replaceOp(unsetEncodingOp, unPackOp->getResult()); + return success(); + } +}; + +class GPUConvertToMultiMma final + : public OpInterfaceConversionPattern { +public: + using OpInterfaceConversionPattern< + linalg::ContractionOpInterface>::OpInterfaceConversionPattern; + + GPUConvertToMultiMma( + MLIRContext *context, + const MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn = {}, + PatternBenefit benefit = 1) + : OpInterfaceConversionPattern( + typeConverter, context, benefit), + materializeEncodingValueFn(materializeEncodingValueFn) {} + + LogicalResult + matchAndRewrite(linalg::ContractionOpInterface op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto linalgOp = cast(op.getOperation()); + auto inputs = linalgOp.getDpsInputOperands(); + auto outputs = linalgOp.getDpsInits(); + auto lhsType = cast(inputs[0]->get().getType()); + auto rhsType = cast(inputs[1]->get().getType()); + auto resultType = cast(outputs[0].getType()); + auto lhsEncoding = IREE::Encoding::getEncodingAttr(lhsType); + auto rhsEncoding = IREE::Encoding::getEncodingAttr(rhsType); + auto resultEncoding = IREE::Encoding::getEncodingAttr(resultType); + if (!lhsEncoding || !rhsEncoding || !resultEncoding) { + LLVM_DEBUG(llvm::dbgs() << "expect encodings on operand types\n"); + return failure(); + } + + auto converter = static_cast( + getTypeConverter()); + + // TODO(hanchung): Perhaps the MaterializedEncodingInfo should carry the + // target intrinsic attribute, so we don't need to query it again. + IREE::HAL::ExecutableTargetAttr targetAttr = converter->getTargetAttr(); + IREE::GPU::TargetAttr gpuTargetAttr; + if (targetAttr) { + gpuTargetAttr = getGPUTargetAttr(targetAttr); + } else { + gpuTargetAttr = getCLGPUTarget(op.getContext()); + } + auto elementTypes = llvm::to_vector(llvm::map_range( + resultEncoding.getElementTypes().getValue(), + [](Attribute a) { return cast(a).getValue(); })); + std::optional mma = + chooseDataTiledMMAAttr(elementTypes, gpuTargetAttr); + if (!mma) { + LLVM_DEBUG(llvm::dbgs() << "can't find supported Mma intrinsic\n"); + return failure(); + } + LLVM_DEBUG(llvm::dbgs() << "Target MMA: " << mma.value() << "\n"); + + FailureOr contractionDims = + linalg::inferContractionDims(linalgOp); + assert( + succeeded(contractionDims) && + "should always be able to infer contraction dims for contraction ops"); + // TODO(hanchung): Support batch gemms. + if (!contractionDims->batch.empty()) { + LLVM_DEBUG(llvm::dbgs() << "batch gemm is not yet implemented\n"); + return failure(); + } + + // TODO(hanchung): Support unrolling cases. We likely need to teach + // multi_mma op about interleaving K dimension. + MLIRContext *ctx = rewriter.getContext(); + AffineExpr mExpr = rewriter.getAffineDimExpr(0); + AffineExpr nExpr = rewriter.getAffineDimExpr(1); + AffineExpr kExpr = rewriter.getAffineDimExpr(2); + + // The outer dims are all in row-major fasion after relayout. 
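+    // Note that the RHS map below is (n, k) rather than (k, n): the relayout
+    // has already transposed B (its pack uses outer_dims_perm = [1, 0] and
+    // inner_dims_pos = [1, 0]), which is why the matmul_lowering tests expect
+    // affine_map<(d0, d1, d2) -> (d1, d2)> for the RHS operand.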
+ auto lhsMap = AffineMap::get(3, 0, {mExpr, kExpr}, ctx); + auto rhsMap = AffineMap::get(3, 0, {nExpr, kExpr}, ctx); + auto accMap = AffineMap::get(3, 0, {mExpr, nExpr}, ctx); + + SmallVector iteratorTypes = + linalgOp.getIteratorTypesArray(); + + // TODO(hanchung): Support batch gemms. + Location loc = op.getLoc(); + auto mmaOp = rewriter.create( + loc, operands[0], operands[1], operands[2], + ArrayRef{lhsMap, rhsMap, accMap}, iteratorTypes, + mma.value()); + rewriter.replaceOp(op, mmaOp); + return success(); + } + +protected: + const MaterializeEncodingValueFn materializeEncodingValueFn; +}; + +} // namespace + +void GPUMaterializeDeviceEncodingPass::runOnOperation() { + MLIRContext *ctx = &getContext(); + FunctionOpInterface funcOp = getOperation(); + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp); + { + RewritePatternSet patterns(ctx); + MaterializeEncodingTypeConverter typeConverter(materializeEncodingForTarget, + targetAttr); + MaterializeEncodingConversionTarget target(*funcOp.getContext()); + MaterializeEncodingValueFn materializeEncodingValueFn = + [](RankedTensorType, OpBuilder, + Location) -> FailureOr { return {}; }; + populateIREEMaterializeEncodingIntoPackUnPackPatterns( + patterns, target, typeConverter, materializeEncodingValueFn); + + patterns.insert( + ctx, typeConverter, materializeEncodingValueFn); + + if (failed(applyPartialConversion(funcOp, target, std::move(patterns)))) { + funcOp.emitOpError("materialization failed"); + return signalPassFailure(); + } + } + + // Add patterns to fold pack/unpack ops with pad/extract_slice ops and + // resolve dims ops. + { + RewritePatternSet patterns(ctx); + tensor::populateFoldIntoPackAndUnpackPatterns(patterns); + memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns); + if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) { + funcOp.emitOpError("folding patterns failed"); + return signalPassFailure(); + } + } +} + +} // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index a5c5b090a28a..570d3f90104b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -186,6 +186,11 @@ def GPUApplyTilingLevelPass : ]; } +def GPUMaterializeDeviceEncodingPass : + InterfacePass<"iree-codegen-gpu-materialize-device-encoding", "mlir::FunctionOpInterface"> { + let summary = "Materialize the encoding for tensor as specified by the backend."; +} + def GPUTensorTileToSerialLoopsPass : InterfacePass<"iree-codegen-gpu-tensor-tile-to-serial-loops", "mlir::FunctionOpInterface"> { let summary = "Pass to tile reduction dimensions for certain GPU ops"; diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel index ab48f26ec813..bb1e43081185 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel @@ -28,6 +28,7 @@ iree_lit_test_suite( "gpu_infer_memory_space.mlir", "gpu_lower_to_ukernels.mlir", "gpu_combine_value_barriers.mlir", + "gpu_materialize_encoding.mlir", "gpu_nested_layout_contract_amdgpu.mlir", "gpu_nested_layout_vector_distribution.mlir", "gpu_pipeline.mlir", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt index ea90e3bb433d..94953712e849 100644 
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt @@ -24,6 +24,7 @@ iree_lit_test_suite( "gpu_generalize_named_ops.mlir" "gpu_infer_memory_space.mlir" "gpu_lower_to_ukernels.mlir" + "gpu_materialize_encoding.mlir" "gpu_nested_layout_contract_amdgpu.mlir" "gpu_nested_layout_vector_distribution.mlir" "gpu_pipeline.mlir" diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir new file mode 100644 index 000000000000..209b29c4d32d --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir @@ -0,0 +1,391 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-materialize-device-encoding))" \ +// RUN: --iree-gpu-test-target=gfx942 \ +// RUN: --split-input-file %s | FileCheck %s + +//----------------------------------------------------------------------------- +// 1. MFMA_F32_16x16x4_F32 +//----------------------------------------------------------------------------- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_LHS_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xf32> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xf32> -> tensor<255x513xf32, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_LHS_unroll8x8x4_MFMA_F32_16x16x4_F32 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : f32) +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 16] +// CHECK-SAME: : tensor<255x513xf32> -> tensor<2x33x128x16xf32> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<2x33x128x16xf32> into tensor<2x33x8x16x4x4xf32> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<2x33x8x16x4x4xf32>) +// CHECK-SAME: outs({{.*}} : tensor<2x33x8x4x16x4xf32>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 4] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_RHS_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : 
!flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xf32> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xf32> -> tensor<255x513xf32, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_RHS_unroll8x8x4_MFMA_F32_16x16x4_F32 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : f32) +// CHECK-SAME: outer_dims_perm = [1, 0] +// CHECK-SAME: inner_dims_pos = [1, 0] +// CHECK-SAME: inner_tiles = [128, 16] +// CHECK-SAME: : tensor<255x513xf32> -> tensor<5x16x128x16xf32> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<5x16x128x16xf32> into tensor<5x16x8x16x4x4xf32> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<5x16x8x16x4x4xf32>) +// CHECK-SAME: outs({{.*}} : tensor<5x16x8x4x16x4xf32>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 4] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_ACC_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xf32> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xf32> -> tensor<255x513xf32, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_ACC_unroll8x8x4_MFMA_F32_16x16x4_F32 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : f32) +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 128] +// CHECK-SAME: : tensor<255x513xf32> -> tensor<2x5x128x128xf32> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<2x5x128x128xf32> into tensor<2x5x8x4x4x8x16xf32> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<2x5x8x4x4x8x16xf32>) +// CHECK-SAME: outs({{.*}} : tensor<2x5x8x8x4x16x4xf32>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 6, 4] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @unset_encoding_ACC_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) 
alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xf32, #encoding> + %3 = iree_encoding.unset_encoding %2 : tensor<255x513xf32, #encoding> -> tensor<255x513xf32> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @unset_encoding_ACC_unroll8x8x4_MFMA_F32_16x16x4_F32() { +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%{{.+}} : tensor<2x5x8x8x4x16x4xf32>) +// CHECK-SAME: outs({{.*}} : tensor<2x5x8x4x4x8x16xf32>) +// CHECK-SAME: permutation = [0, 1, 2, 4, 6, 3, 5] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[TRANSPOSE]] +// CHECK-SAME: : tensor<2x5x8x4x4x8x16xf32> into tensor<2x5x128x128xf32> +// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[COLLAPSE]] +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 128] +// CHECK-SAME: : tensor<2x5x128x128xf32> -> tensor<255x513xf32> +// CHECK: flow.dispatch.tensor.store %[[UNPACK]] + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> +#pipeline_layout_3 = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +func.func @matmul_lowering_unroll8x8x4_MFMA_F32_16x16x4_F32() { + %c0 = arith.constant 0 : index + %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %K} + %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%K, %N} + %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %N} + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %K} + -> tensor + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%K, %N} + -> tensor + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %N} + -> tensor + %6 = linalg.matmul + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor + flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : tensor + -> !flow.dispatch.tensor>{%M, %N} + return +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK: func.func @matmul_lowering_unroll8x8x4_MFMA_F32_16x16x4_F32 +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(0) +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1) +// CHECK-DAG: %[[ACC_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(2) +// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load 
%[[LHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[ACC:.+]] = flow.dispatch.tensor.load %[[ACC_BINDING]]{{.+}} -> tensor +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], +// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]] + + +//----------------------------------------------------------------------------- +// 2. MFMA_I32_16x16x32_I8 +//----------------------------------------------------------------------------- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_LHS_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xi8> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xi8> -> tensor<255x513xi8, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xi8, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_LHS_unroll8x8x2_MFMA_I32_16x16x32_I8 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : i8) +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 64] +// CHECK-SAME: : tensor<255x513xi8> -> tensor<2x9x128x64xi8> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<2x9x128x64xi8> into tensor<2x9x8x16x2x4x8xi8> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<2x9x8x16x2x4x8xi8>) +// CHECK-SAME: outs({{.*}} : tensor<2x9x8x4x16x2x8xi8>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 4, 6] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_RHS_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xi8> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xi8> -> tensor<255x513xi8, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xi8, #encoding> -> 
!flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_RHS_unroll8x8x2_MFMA_I32_16x16x32_I8 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : i8) +// CHECK-SAME: outer_dims_perm = [1, 0] +// CHECK-SAME: inner_dims_pos = [1, 0] +// CHECK-SAME: inner_tiles = [128, 64] +// CHECK-SAME: : tensor<255x513xi8> -> tensor<5x4x128x64xi8> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<5x4x128x64xi8> into tensor<5x4x8x16x2x4x8xi8> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<5x4x8x16x2x4x8xi8>) +// CHECK-SAME: outs({{.*}} : tensor<5x4x8x4x16x2x8xi8>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 4, 6] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @set_encoding_ACC_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xi32> + %3 = iree_encoding.set_encoding %2 : tensor<255x513xi32> -> tensor<255x513xi32, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xi32, #encoding> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @set_encoding_ACC_unroll8x8x2_MFMA_I32_16x16x32_I8 +// CHECK: %[[PACK:.*]] = tensor.pack %{{.+}} padding_value(%{{.+}} : i32) +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 128] +// CHECK-SAME: : tensor<255x513xi32> -> tensor<2x5x128x128xi32> +// CHECK: %[[EXPAND:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME : tensor<2x5x128x128xi32> into tensor<2x5x8x4x4x8x16xi32> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%[[EXPAND]] : tensor<2x5x8x4x4x8x16xi32>) +// CHECK-SAME: outs({{.*}} : tensor<2x5x8x8x4x16x4xi32>) +// CHECK-SAME: permutation = [0, 1, 2, 5, 3, 6, 4] +// CHECK: flow.dispatch.tensor.store %[[TRANSPOSE]] + +// ----- + +#encoding = #iree_encoding.encoding, + user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + round_dims_to = array> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +func.func @unset_encoding_ACC_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<255x513xi32, #encoding> + %3 = iree_encoding.unset_encoding %2 : tensor<255x513xi32, #encoding> -> tensor<255x513xi32> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : 
tensor<255x513xi32> -> !flow.dispatch.tensor> + return +} + +// CHECK-LABEL: func.func @unset_encoding_ACC_unroll8x8x2_MFMA_I32_16x16x32_I8() { +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose +// CHECK-SAME: ins(%{{.+}} : tensor<2x5x8x8x4x16x4xi32>) +// CHECK-SAME: outs({{.*}} : tensor<2x5x8x4x4x8x16xi32>) +// CHECK-SAME: permutation = [0, 1, 2, 4, 6, 3, 5] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[TRANSPOSE]] +// CHECK-SAME: : tensor<2x5x8x4x4x8x16xi32> into tensor<2x5x128x128xi32> +// CHECK: %[[UNPACK:.*]] = tensor.unpack %[[COLLAPSE]] +// CHECK-SAME: outer_dims_perm = [0, 1] +// CHECK-SAME: inner_dims_pos = [0, 1] +// CHECK-SAME: inner_tiles = [128, 128] +// CHECK-SAME: : tensor<2x5x128x128xi32> -> tensor<255x513xi32> +// CHECK: flow.dispatch.tensor.store %[[UNPACK]] + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> +#pipeline_layout_3 = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> + +func.func @matmul_lowering_unroll8x8x2_MFMA_I32_16x16x32_I8() { + %c0 = arith.constant 0 : index + %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index + %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index + %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %K} + %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%K, %N} + %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0) + : !flow.dispatch.tensor>{%M, %N} + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %K} + -> tensor + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%K, %N} + -> tensor + %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : !flow.dispatch.tensor>{%M, %N} + -> tensor + %6 = linalg.matmul + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor + flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] + : tensor + -> !flow.dispatch.tensor>{%M, %N} + return +} +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK: func.func @matmul_lowering_unroll8x8x2_MFMA_I32_16x16x32_I8 +// CHECK-DAG: %[[LHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(0) +// CHECK-DAG: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(1) +// CHECK-DAG: %[[ACC_BINDING:.+]] = hal.interface.binding.subspan {{.+}} binding(2) +// CHECK-DAG: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]{{.+}} -> tensor +// CHECK-DAG: %[[ACC:.+]] = flow.dispatch.tensor.load %[[ACC_BINDING]]{{.+}} -> tensor +// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS]], %[[RHS]], %[[ACC]] +// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]], +// CHECK-SAME: iterator_types = 
[#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK: flow.dispatch.tensor.store %[[MMA]], %[[ACC_BINDING]] diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp index 2802736fe232..8024a7a71edf 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoNop.cpp @@ -48,6 +48,8 @@ struct MaterializeEncodingIntoNopPass final materializeEncodingFn, IREE::HAL::ExecutableTargetAttr()); MaterializeEncodingConversionTarget target(*context); populateMaterializeEncodingIntoPackUnPackPatterns( + materializeEncodingPattern, typeConverter, materializeEncodingValueFn); + populateIREEMaterializeEncodingIntoPackUnPackPatterns( materializeEncodingPattern, target, typeConverter, materializeEncodingValueFn); diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp index 2de3dc30029a..852183289304 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp @@ -187,10 +187,12 @@ static void transposeInPlace(MaterializeEncodingInfo &info) { // to `pack` and `unpack` operations respectively. //===---------------------------------------------------------------------===// -/// Utility method to convert from `set_encoding` op to `pack` operation with -/// zero padding values. The source is also taken as input so that these could -/// be used with `OpConversionPatterns`. -static FailureOr lowerSetEncodingOpToPackOp( +/// TODO(hanchung): Move the implementation to EncodingUtils.cpp. It is not +/// moved because it needs some cleanup for this file. E.g., `getPaddingValue` +/// is no longer needed. Ideally we should move CPU specific patterns (e.g., +/// lowerContractionOpWithEncoding, etc) to the CPUMaterializeEncoding file; +/// move general patterns to EncodingUtils, and retire this file. +FailureOr lowerSetEncodingOpToPackOp( RewriterBase &rewriter, IREE::Encoding::SetEncodingOp encodingOp, Value source, const MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn) { @@ -231,10 +233,9 @@ static FailureOr lowerSetEncodingOpToPackOp( paddingValue, encodingInfo->outerDimsPerm); } -/// Utility method to convert from `set_encoding` op to `pack` operation. -/// The source is taken as input so that these could be used with -/// `OpConversionPatterns`. -static FailureOr lowerUnsetEncodingToUnpackOp( +/// TODO(hanchung): Move the implementation to EncodingUtils.cpp. See the reason +/// in the implementation comment of lowerSetEncodingToPackOp method. +FailureOr lowerUnsetEncodingToUnpackOp( RewriterBase &rewriter, IREE::Encoding::UnsetEncodingOp encodingOp, Value packedValue, const MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn) { @@ -506,6 +507,33 @@ lowerOpWithEncoding(RewriterBase &rewriter, linalg::LinalgOp linalgOp, .Default([](Operation *op) { return failure(); }); } +// Utility to apply a tile-swizzling to a packed shape. 
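+// Illustrative worked example (added for clarity, derived from the RHS
+// data-tiling test above, not new functionality): with
+//   packedShape = [5, 4, 128, 64]  (innerTileSizes = [128, 64], so srcRank = 2),
+//   a swizzle whose expandShape splits the [128, 64] tile into [8, 16, 2, 4, 8],
+//   and swizzle.permutation = [0, 3, 1, 2, 4],
+// the permutation applied to the expanded shape [5, 4, 8, 16, 2, 4, 8] is
+// [0, 1, 2, 5, 3, 4, 6], so the returned swizzled shape is [5, 4, 8, 4, 16, 2, 8].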
+static SmallVector +getSwizzledShape(ArrayRef packedShape, + MaterializeEncodingInfo encodingInfo) { + if (packedShape.empty() || !encodingInfo.swizzle) { + return SmallVector(packedShape); + } + + int64_t srcRank = packedShape.size() - encodingInfo.innerTileSizes.size(); + SmallVector perm = llvm::to_vector(llvm::seq(0, srcRank)); + for (auto i : encodingInfo.swizzle->permutation) { + perm.push_back(i + srcRank); + } + + SmallVector newShape(packedShape.take_front(srcRank)); + SmallVector expandedTileShape = + getExpandedTileShape(encodingInfo.swizzle->expandShape); + MLIRContext *ctx = packedShape[0].getContext(); + Builder b(ctx); + for (int64_t d : expandedTileShape) { + newShape.push_back(b.getIndexAttr(d)); + } + applyPermutationToVector(newShape, perm); + + return newShape; +} + /// For `dispatchTensorType` that bind a `RankedTensorType` with encoding, /// returns the materialized shape of the `dispatchTensorType`. The /// dynamic dimensions of the `dispatchTensorType` are provided in @@ -541,7 +569,7 @@ static FailureOr> getPackedDimsForDispatchTensor( tensor::PackOp::getResultShape(builder, loc, targetShape, *innerTileSizes, encodingInfo->innerDimsPos, encodingInfo->outerDimsPerm); - return convertedTargetShape; + return getSwizzledShape(convertedTargetShape, *encodingInfo); } /// For `dispatchTensorType` that bind a `RankedTensorType` with encoding, @@ -882,11 +910,24 @@ class MaterializeContractionOp : public OpInterfaceConversionPattern< } // namespace void populateMaterializeEncodingIntoPackUnPackPatterns( - RewritePatternSet &patterns, MaterializeEncodingConversionTarget &target, + RewritePatternSet &patterns, MaterializeEncodingTypeConverter &typeConverter, MaterializeEncodingValueFn materializeEncodingValueFn) { MLIRContext *context = patterns.getContext(); + patterns.insert, + MaterializeDPSOperation, + MaterializeOperation, + MaterializeContractionOp, SetEncodingOpToPackOpConversion, + UnsetEncodingOpToUnPackOpConversion>( + context, typeConverter, materializeEncodingValueFn); + memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns); +} +void populateIREEMaterializeEncodingIntoPackUnPackPatterns( + RewritePatternSet &patterns, MaterializeEncodingConversionTarget &target, + MaterializeEncodingTypeConverter &typeConverter, + MaterializeEncodingValueFn materializeEncodingValueFn) { + MLIRContext *context = patterns.getContext(); typeConverter.addConversion( [&typeConverter](IREE::Flow::DispatchTensorType dispatchTensorType) { Type boundType = dispatchTensorType.getBoundType(); @@ -908,20 +949,10 @@ void populateMaterializeEncodingIntoPackUnPackPatterns( return resultType == typeConverter.convertType(resultType); }); - // Add all patterns for converting from encoded type to the materialized - // type. 
- patterns.insert, - MaterializeDPSOperation, - MaterializeOperation, - MaterializeContractionOp, SetEncodingOpToPackOpConversion, - UnsetEncodingOpToUnPackOpConversion>( - patterns.getContext(), typeConverter, materializeEncodingValueFn); - memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns); - patterns.insert( context, typeConverter, materializeEncodingValueFn); -} +}; } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index bb7612e72399..11e8a3d39826 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -524,8 +524,8 @@ int64_t MMAAttr::getBlockSize() const { return 0; } -int64_t MMAAttr::getSubgroupSize() const { - switch (getIntrinsic().getValue()) { +static int64_t getIntrinsicSubgroupSize(MMAIntrinsic intrinsic) { + switch (intrinsic) { case MMAIntrinsic::MFMA_F32_16x16x4_F32: case MMAIntrinsic::MFMA_F32_16x16x16_F16: case MMAIntrinsic::MFMA_I32_16x16x16_I8: @@ -546,6 +546,10 @@ int64_t MMAAttr::getSubgroupSize() const { return 0; } +int64_t MMAAttr::getSubgroupSize() const { + return getIntrinsicSubgroupSize(getIntrinsic().getValue()); +} + MMAAttr::SingleSubgroupLayout MMAAttr::getASingleSubgroupLayout() const { switch (getIntrinsic().getValue()) { case MMAIntrinsic::MFMA_F32_16x16x4_F32: { @@ -868,6 +872,33 @@ LogicalResult MMAAttr::materializeOperandConcreteShape( return success(); } +//===----------------------------------------------------------------------===// +// DataTiledMMA Attributes +//===----------------------------------------------------------------------===// + +std::tuple DataTiledMMAAttr::getABCElementTypes() const { + MLIRContext *ctx = getContext(); + auto opaqueLayout = getOpaqueMFMALayout(ctx, getIntrinsic().getValue()); + return {opaqueLayout.aType, opaqueLayout.bType, opaqueLayout.cType}; +} + +std::tuple DataTiledMMAAttr::getMNKShape() const { + MLIRContext *ctx = getContext(); + auto opaqueLayout = getOpaqueMFMALayout(ctx, getIntrinsic().getValue()); + return {opaqueLayout.mSize * getUnrollM(), opaqueLayout.nSize * getUnrollN(), + opaqueLayout.kSize * getUnrollK()}; +} + +std::tuple +DataTiledMMAAttr::getABCVectorTypes() const { + return MMAAttr::get(getContext(), getIntrinsic().getValue()) + .getABCVectorTypes(); +} + +int64_t DataTiledMMAAttr::getSubgroupSize() const { + return getIntrinsicSubgroupSize(getIntrinsic().getValue()); +} + //===----------------------------------------------------------------------===// // MMA Schedule Attributes //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td index 968fa736ec4d..56ae368b14c6 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td @@ -220,6 +220,53 @@ def IREEGPU_MMAAttr : IREEGPU_MmaVectorLayoutAttr<"MMA", "MMAIntrinsicAttr"> { }]; } +def IREEGPU_DataTiledMMAAttr : + AttrDef +]> { + let mnemonic = "data_tiled_mma_layout"; + let cppNamespace = "::mlir::iree_compiler::IREE::GPU"; + + let description = [{ + This mma variant represents MMA ops with data-tiling details. The + |intrinsic| field specifies which particular MMA intrinsic is targeted by + the data-tiling. 
+ + The tile swizzling already happens, so the attribute does not need to + implement materializeOperandConcreteShape interface method. E.g., if the + target intrinsic is MFMA_F32_16x16x4_F32: + - The inner tile shape of LHS is 4x16. + - The inner tile shape of RHS is 4x16. + - The inner tile shape of ACC is 4x16x4. + + Furthermore, the unrolling and interleaving can be represented with the + attribute. In the concept of data-tiling, we always unroll the parallel + dimensions (i.e., M, N dimensions) to be outermost, and interleave the + unrolled K dimension. I.e., the unrolled K dimension becomes the innermost + dimension. The constraint can be relaxed based on data-tiling needs. The + additional information can be added to `parameters`. + }]; + + let assemblyFormat = "`<` struct(params) `>`"; + + let parameters = (ins + "::mlir::iree_compiler::IREE::GPU::MMAIntrinsicAttr":$intrinsic, + "int64_t":$unroll_m, + "int64_t":$unroll_n, + "int64_t":$unroll_k + ); +} + def IREEGPU_MMAOpsArrayAttr : ArrayOfAttr< IREEGPU_Dialect, "MMAOpsArray", "mma_ops", "MMAAttr"> { let cppNamespace = "::mlir::iree_compiler::IREE::GPU"; diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp index 4fe1eab9f986..780fc0920186 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.cpp @@ -245,8 +245,13 @@ LogicalResult MultiMmaOp::verify() { int64_t accInnerElementCount = multiplyAcc(getAccInnerShape()); auto [m, n, k] = getKind().getMNKShape(); - if (m * k != lhsInnerElementCount || n * k != rhsInnerElementCount || - m * n != accInnerElementCount) { + int64_t expectedNumLhsElem = m * k; + int64_t expectedNumRhsElem = n * k; + int64_t expectedNumAccElem = m * n; + + if (expectedNumLhsElem != lhsInnerElementCount || + expectedNumRhsElem != rhsInnerElementCount || + expectedNumAccElem != accInnerElementCount) { auto [lhsThreadType, rhsThreadType, accThreadType] = getKind().getABCVectorTypes(); int64_t lhsThreadElementCount = multiplyAcc(lhsThreadType.getShape()); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir index d1003069e94a..046a3c88abbc 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir @@ -27,6 +27,33 @@ module { // CHECK-LABEL: func @test_wmma_f16_16x16x16_f32 // CHECK-SAME: mma_types = #iree_gpu.mma_layout +module { + func.func @test_data_tiled_mfma_f32_16x16x4_f32() attributes { + mma_types = #iree_gpu.data_tiled_mma_layout} { + return + } +} +// CHECK-LABEL: func @test_data_tiled_mfma_f32_16x16x4_f32 +// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout + +module { + func.func @test_data_tiled_mfma_f32_16x16x16_f16() attributes { + mma_types = #iree_gpu.data_tiled_mma_layout} { + return + } +} +// CHECK-LABEL: func @test_data_tiled_mfma_f32_16x16x16_f16 +// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout + +module { + func.func @test_data_tiled_mfma_i32_16x16x32_i8() attributes { + mma_types = #iree_gpu.data_tiled_mma_layout} { + return + } +} +// CHECK-LABEL: func @test_data_tiled_mfma_i32_16x16x32_i8 +// CHECK-SAME: mma_types = #iree_gpu.data_tiled_mma_layout + module { func.func @test_any_lowering_config() attributes { lowering_config = #iree_gpu.lowering_config<{workgroup = [16, 16], 
thread = [0, 4]}>} { diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir index ca1aed161afb..e70664547753 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_ops.mlir @@ -204,6 +204,60 @@ func.func @tensor_subgroup_matmul_transpose_b_32x32x8_multi_mma( // ----- +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +func.func @data_tiled_1x1x1_tensor_multi_mma(%lhs: tensor, %rhs: tensor, %acc: tensor) -> tensor { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.data_tiled_mma_layout + } : tensor, tensor into tensor + return %0 : tensor +} + +// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func @data_tiled_1x1x1_tensor_multi_mma +// CHECK: iree_gpu.multi_mma %arg0, %arg1, %arg2 +// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] +// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK-SAME: : tensor, tensor into tensor + +// ----- + +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +func.func @data_tiled_2x2x4_tensor_multi_mma(%lhs: tensor, %rhs: tensor, %acc: tensor) -> tensor { + %0 = iree_gpu.multi_mma %lhs, %rhs, %acc { + indexing_maps = #contraction_accesses, + iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type], + kind = #iree_gpu.data_tiled_mma_layout + } : tensor, tensor into tensor + return %0 : tensor +} + +// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> + +// CHECK-LABEL: func @data_tiled_2x2x4_tensor_multi_mma +// CHECK: iree_gpu.multi_mma %arg0, %arg1, %arg2 +// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] +// CHECK-SAME: iterator_types = [#iree_gpu.iterator_type, #iree_gpu.iterator_type, #iree_gpu.iterator_type] +// CHECK-SAME: kind = #iree_gpu.data_tiled_mma_layout +// CHECK-SAME: : tensor, tensor into tensor + +// ----- + func.func @tensor_barrier(%input: tensor) -> tensor { %out = iree_gpu.value_barrier %input : tensor return %out : tensor diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp index 3e52cef7c16d..d085849d86a0 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp @@ -959,7 +959,7 @@ bool hasUkernelSupportedGpuArch(IREE::HAL::ExecutableTargetAttr targetAttr) { // GPU Target Information //===----------------------------------------------------------------------===// -static IREE::GPU::TargetAttr getCLGPUTarget(MLIRContext *context) { +IREE::GPU::TargetAttr getCLGPUTarget(MLIRContext *context) { if (clTestTarget.empty()) return nullptr; diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h 
b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h index 7cbddf4e79bc..3da1ec775f94 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.h @@ -174,6 +174,10 @@ FailureOr getSupportedMmaTypes(DictionaryAttr config); FailureOr getSupportedMmaTypes(mlir::FunctionOpInterface entryPoint); +/// Returns the GPU target attribute from `iree-gpu-test-target` if provided. +/// Returns null TargetAttr otherwise. +IREE::GPU::TargetAttr getCLGPUTarget(MLIRContext *context); + /// Returns the GPU target attribute from executable |target| if found. /// Returns null TargetAttr otherwise. IREE::GPU::TargetAttr getGPUTargetAttr(IREE::HAL::ExecutableTargetAttr target);
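// Note (illustrative sketch, not part of this patch): the two declarations
// above can be combined to resolve a GPU target. The `resolveGPUTarget` name
// and the fallback order shown here are assumptions; only the behavior of
// getCLGPUTarget returning null when `iree-gpu-test-target` is unset is taken
// from the GPUUtils.cpp hunk above.
static IREE::GPU::TargetAttr
resolveGPUTarget(MLIRContext *context,
                 IREE::HAL::ExecutableTargetAttr execTarget) {
  // Prefer the command-line test target when it is provided.
  if (IREE::GPU::TargetAttr clTarget = getCLGPUTarget(context))
    return clTarget;
  // Otherwise read the target from the executable target attribute.
  return getGPUTargetAttr(execTarget);
}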