diff --git a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
index 656b5206595b..4a9949dadb9d 100644
--- a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
+++ b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
@@ -541,7 +541,8 @@ module attributes { transform.with_named_sequence } {
         {mma_schedule = #iree_gpu.mma_schedule<
            intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
            subgroup_m_count = 2, subgroup_n_count = 1>
-         , llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
+         , workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = "transpose">,
+         llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
       > -> !transform.any_param
     transform.yield %matmul, %config : !transform.any_op, !transform.any_param
   }
@@ -559,7 +560,8 @@ module attributes { transform.with_named_sequence } {
         {mma_schedule = #iree_gpu.mma_schedule<
            intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
            subgroup_m_count = 2, subgroup_n_count = 1>
-         , llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
+         , workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = "transpose">,
+         llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
       > -> !transform.any_param
     transform.yield %matmul, %config : !transform.any_op, !transform.any_param
   }
@@ -637,7 +639,7 @@ module attributes { transform.with_named_sequence } {
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
               subgroup_m_count = 1, subgroup_n_count = 5>
-           , reorder_workgroups = "transpose"}>
+           ,  workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = "transpose">}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -657,7 +659,7 @@ module attributes { transform.with_named_sequence } {
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F16_32x32x8_F32>,
               subgroup_m_count = 1, subgroup_n_count = 4>
-           , reorder_workgroups = "transpose", llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
+           ,  workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = "transpose">}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -677,7 +679,7 @@ module attributes { transform.with_named_sequence } {
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
               subgroup_m_count = 1, subgroup_n_count = 5>
-           , reorder_workgroups = "transpose"}>
+           ,  workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = "transpose">}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -697,7 +699,7 @@ module attributes { transform.with_named_sequence } {
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
               subgroup_m_count = 1, subgroup_n_count = 5>
-           , reorder_workgroups = "transpose"}>
+           ,  workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = "transpose">}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -717,7 +719,7 @@ module attributes { transform.with_named_sequence } {
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
               subgroup_m_count = 4, subgroup_n_count = 2>
-           , reorder_workgroups = "transpose", llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
+           ,  workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = "transpose">}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -737,7 +739,7 @@ module attributes { transform.with_named_sequence } {
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
               subgroup_m_count = 1, subgroup_n_count = 5>
-           , reorder_workgroups = "transpose"}>
+           ,  workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = "transpose">}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir
index 0f427c52fd00..f9b16a5728d8 100644
--- a/compiler/plugins/target/ROCM/test/target_device_features.mlir
+++ b/compiler/plugins/target/ROCM/test/target_device_features.mlir
@@ -9,7 +9,7 @@
 // GFX942-SAME:         mma = [<MFMA_F16_16x16x16_F32>, <MFMA_F16_32x32x8_F32>],
 // GFX942-SAME:         subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024],
 // GFX942-SAME:         max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>,
-// GFX942-SAME: chip = <wgp_count = 304>>
+// GFX942-SAME: chip = <wgp_count = 304, chiplet_count = 8>>
 
 // GFX940: target = #iree_gpu.target<arch = "gfx940",
 // GFX940-SAME:         mma = [<MFMA_F16_16x16x16_F32>, <MFMA_F16_32x32x8_F32>]
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
index f985c1234892..1c343151c2e6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
@@ -109,7 +109,7 @@ createConvertVectorReductionToGPUPass(
     bool expandSubgroupReduction = true,
     std::function<int(mlir::FunctionOpInterface)> getWarpSize = nullptr);
 
-enum class ReorderWorkgroupsStrategy { None, Swizzle, Transpose };
+enum class ReorderWorkgroupsStrategy { None, ChipletGroup, Swizzle, Transpose };
 
 /// Reorders workgroup IDs.
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
index 5bb44cd631f1..7ecfae72a118 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -199,10 +199,15 @@ def ReorderWorkgroupsPass :
   let dependentDialects = ["::mlir::affine::AffineDialect"];
   let options = [
     Option<"strategy", "strategy", "std::string", /*default=*/"",
-           "Workgroup reordering strategy, one of: '' (none),  'transpose', 'swizzle'">,
-    Option<"logTile", "logTile", "unsigned",
+           "Workgroup reordering strategy, one of: '' (none),  'transpose', 'swizzle', 'chipletgroup'">,
+    Option<"logSwizzleTile", "logSwizzleTile", "unsigned",
             /*default=*/"0",
-           "The log2 of the tile size used for swizzling. (0: disabled, non-0: swizzling enabled)">,
+           "The log2 of the tile size used for swizzling. "
+           "(0: swizzling disabled, non-0: swizzling enabled)">,
+    Option<"logChipletgroupTile", "logChipletgroupTile", "unsigned",
+            /*default=*/"0",
+           "The log2 of the tile size used for chipletgroup. "
+           "(0: chipletgroup disabled, non-0: chipletgroup enabled)">,
   ];
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupReordering.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupReordering.cpp
index fd03448c33db..d9a5f961566b 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupReordering.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupReordering.cpp
@@ -7,6 +7,7 @@
 #include <cassert>
 
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
 #include "llvm/ADT/STLExtras.h"
@@ -68,6 +69,132 @@ makeSwizzledIds(Location loc, OpBuilder b, Value workgroupIdX,
   return {swizzledIdX, swizzledIdY};
 }
 
+// Reordering to make workgroup ids move slowly between chiplet groups.
+
+// Example:
+// Currently, the GPU launches workgroups in a round-robin fashion across
+// each XCD partition on the GPU.
+// Assume we have 16 workgroups and XCDPartitionsOnGPU is 4.
+// The default GPU schedule will launch workgroups {0, 1, 2, 3, ..., 15} in
+// the following order:
+// Partition 0: {0, 4, 8, 12}
+// Partition 1: {1, 5, 9, 13}
+// Partition 2: {2, 6, 10, 14}
+// Partition 3: {3, 7, 11, 15}
+
+// After reordering, the workgroup IDs are {0, 4, 8, 12, 1, ..., 15},
+// resulting in the launch order:
+// Partition 0: {0, 1, 2, 3}
+// Partition 1: {4, 5, 6, 7}
+// Partition 2: {8, 9, 10, 11}
+// Partition 3: {12, 13, 14, 15}
+
+// Returns permuted workgroup id (linearized ID).
+// In the above example:
+// linearizedId 0's permuted Id is still 0.
+// linearizedId 1's permuted Id is 4.
+static Value chipletAwareWorkgroupReordering(Location loc, OpBuilder b,
+                                             Value linearizedId,
+                                             Value workgroupCountX,
+                                             Value workgroupCountY,
+                                             int64_t XCDParitionsOnGPU) {
+  // Given:
+  // Id = linearizedId
+  // x_dim = workgroupCountX
+  // y_dim = workgroupCountY
+  // xcd_count = XCDParitionsOnGPU
+
+  // The new workgroup ID is computed as follows:
+  // wgp_count = x_dim * y_dim
+  // reordered_id = (Id / xcd_count) + (Id % xcd_count) * (wgp_count /xcd_count)
+  // final_id = (Id >= (wgp_count - 1 - (wgp_count % xcd_count))) ? Id :
+  // reordered_id
+
+  Value numChipletsVal =
+      b.createOrFold<arith::ConstantIndexOp>(loc, XCDParitionsOnGPU);
+  Value workgroupCount =
+      b.create<arith::MulIOp>(loc, workgroupCountX, workgroupCountY);
+  Value workgroupCountPerChiplet =
+      b.create<arith::DivUIOp>(loc, workgroupCount, numChipletsVal);
+  Value chipletId = b.create<arith::RemUIOp>(loc, linearizedId, numChipletsVal);
+  Value wgIdWithinChiplet =
+      b.create<arith::DivUIOp>(loc, linearizedId, numChipletsVal);
+  Value reorderedId = b.create<arith::AddIOp>(
+      loc, wgIdWithinChiplet,
+      b.create<arith::MulIOp>(loc, chipletId, workgroupCountPerChiplet));
+
+  // Handle the remainder part.
+  Value constOne = b.createOrFold<arith::ConstantIndexOp>(loc, 1);
+  Value lastWorkgroupId =
+      b.create<arith::SubIOp>(loc, workgroupCount, constOne);
+  Value modulatedLastWorkgroupId = b.create<arith::SubIOp>(
+      loc, lastWorkgroupId,
+      b.create<arith::RemUIOp>(loc, workgroupCount, numChipletsVal));
+  Value isGreaterThanFinalWorkgroupId = b.create<arith::CmpIOp>(
+      loc, arith::CmpIPredicate::ugt, linearizedId, modulatedLastWorkgroupId);
+  Value finalId = b.create<arith::SelectOp>(loc, isGreaterThanFinalWorkgroupId,
+                                            linearizedId, reorderedId);
+
+  return finalId;
+}
+
+// Chiplet-aware workgroup reordering strategy: reordering + super-grouping.
+// Step 1: Reorder the workgroup grid to move slowly between
+// chiplet groups (Function: chipletAwareWorkgroupReordering).
+// Step 2: Implement 'super-grouping' of workgroups before switching to the next
+// column.
+// Returns the permuted workgroup IDs (along X and Y dimension).
+static std::pair<Value, Value>
+makeChipletGroupedIds(Location loc, OpBuilder b, Value workgroupIdX,
+                      Value workgroupIdY, Value workgroupCountX,
+                      Value workgroupCountY, unsigned chipletGroupTile,
+                      unsigned numXCDs) {
+  // Create one dimension ID for workgroup.
+  Value linearized =
+      b.create<arith::MulIOp>(loc, workgroupIdY, workgroupCountX);
+  linearized = b.create<arith::AddIOp>(loc, linearized, workgroupIdX);
+
+  assert(numXCDs > 1 && "expected more than one XCD for chiplet reordering");
+  // Map chiplets to perform a spatially local tile operation.
+  // Reorder the linearized ID such that every consecutive group of chiplets
+  // is the slowest-changing dimension in the grid.
+  // Empirically found that two chiplets as a group has better locality
+  // throughout.
+  linearized = chipletAwareWorkgroupReordering(
+      loc, b, linearized, workgroupCountX, workgroupCountY, numXCDs / 2);
+
+  // Detailed explanation about the idea behind the below implementation:
+  // the L2 Cache Optimizations subsection in
+  // https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html#
+  unsigned rowGroupSize = chipletGroupTile;
+  Value rowGroupSizeVal =
+      b.createOrFold<arith::ConstantIndexOp>(loc, rowGroupSize);
+
+  // Empirically, found rowGroupSize=16 for MI300X achieves good performance
+  // group every 16 workgroups along Y dimension.
+
+  // Number of workgroups in the group.
+  Value numWorkGroupsPerRowBlock =
+      b.create<arith::MulIOp>(loc, rowGroupSizeVal, workgroupCountX);
+
+  Value groupId =
+      b.create<arith::DivUIOp>(loc, linearized, numWorkGroupsPerRowBlock);
+  Value firstRowID = b.create<arith::MulIOp>(loc, groupId, rowGroupSizeVal);
+
+  Value currentRowGroupSize = b.create<arith::MinUIOp>(
+      loc, b.create<arith::SubIOp>(loc, workgroupCountY, firstRowID),
+      rowGroupSizeVal);
+
+  Value newY = b.create<arith::AddIOp>(
+      loc, firstRowID,
+      b.create<arith::RemUIOp>(loc, linearized, currentRowGroupSize));
+
+  Value newX = b.create<arith::DivUIOp>(
+      loc, b.create<arith::RemUIOp>(loc, linearized, numWorkGroupsPerRowBlock),
+      currentRowGroupSize);
+  return {newX, newY};
+}
+
 /// Transpose IDs, i.e., changes the traversal order from left -> right then
 /// top -> bottom to top -> bottom then left -> right.
 static std::pair<Value, Value> makeTransposedIds(Location loc, OpBuilder b,
@@ -112,11 +239,12 @@ getWorkgroupCountsXY(OpBuilder &builder, FunctionOpInterface funcOp) {
 
 static LogicalResult reorderWorkgroupsInFunc(FunctionOpInterface funcOp,
                                              ReorderWorkgroupsStrategy strategy,
-                                             unsigned swizzleLogTile) {
+                                             unsigned logTile,
+                                             unsigned numXCDs = 2) {
   assert(strategy != ReorderWorkgroupsStrategy::None &&
          "Expected a concrete strategy");
 
-  unsigned swizzleTile = 1u << swizzleLogTile;
+  unsigned reorderWgTileSize = 1u << logTile;
   IREE::HAL::InterfaceWorkgroupIDOp oldXId;
   IREE::HAL::InterfaceWorkgroupIDOp oldYId;
   unsigned numXIdOps = 0;
@@ -153,7 +281,13 @@ static LogicalResult reorderWorkgroupsInFunc(FunctionOpInterface funcOp,
   if (strategy == ReorderWorkgroupsStrategy::Swizzle) {
     std::tie(newWorkgroupIdX, newWorkgroupIdY) =
         makeSwizzledIds(funcOp.getLoc(), builder, workgroupIdX, workgroupIdY,
-                        workgroupCntX, workgroupCntY, swizzleTile);
+                        workgroupCntX, workgroupCntY, reorderWgTileSize);
+  } else if (strategy == ReorderWorkgroupsStrategy::ChipletGroup) {
+    if (numXCDs <= 1)
+      return failure();
+    std::tie(newWorkgroupIdX, newWorkgroupIdY) = makeChipletGroupedIds(
+        funcOp.getLoc(), builder, workgroupIdX, workgroupIdY, workgroupCntX,
+        workgroupCntY, reorderWgTileSize, numXCDs);
   } else {
     assert(strategy == ReorderWorkgroupsStrategy::Transpose &&
            "Unhandled strategy");
@@ -186,9 +320,9 @@ namespace {
 struct ReorderWorkgroupsPass final
     : impl::ReorderWorkgroupsPassBase<ReorderWorkgroupsPass> {
   ReorderWorkgroupsPass(
-      ReorderWorkgroupsStrategy strategy, unsigned logSwizzleTile,
+      ReorderWorkgroupsStrategy strategy, unsigned logTile,
       std::function<LogicalResult(mlir::FunctionOpInterface)> filterFn)
-      : reorderingStrategy(strategy), logSwizzleTile(logSwizzleTile),
+      : reorderingStrategy(strategy), reorderWgLogTileSize(logTile),
         filterFn(std::move(filterFn)) {}
 
   LogicalResult initializeOptions(
@@ -197,10 +331,11 @@ struct ReorderWorkgroupsPass final
     if (failed(Pass::initializeOptions(options, errorHandler))) {
       return failure();
     }
-    logSwizzleTile = logTile;
+
     auto selectedStrategy =
         llvm::StringSwitch<FailureOr<ReorderWorkgroupsStrategy>>(strategy)
             .Case("", ReorderWorkgroupsStrategy::None)
+            .Case("chipletgroup", ReorderWorkgroupsStrategy::ChipletGroup)
             .Case("swizzle", ReorderWorkgroupsStrategy::Swizzle)
             .Case("transpose", ReorderWorkgroupsStrategy::Transpose)
             .Default(failure());
@@ -208,6 +343,13 @@ struct ReorderWorkgroupsPass final
       return failure();
 
     reorderingStrategy = *selectedStrategy;
+    if (reorderingStrategy == ReorderWorkgroupsStrategy::Swizzle &&
+        reorderWgLogTileSize == 0)
+      reorderWgLogTileSize = logSwizzleTile;
+    else if (reorderingStrategy == ReorderWorkgroupsStrategy::ChipletGroup &&
+             reorderWgLogTileSize == 0)
+      reorderWgLogTileSize = logChipletgroupTile;
+
     return success();
   }
 
@@ -216,7 +358,11 @@ struct ReorderWorkgroupsPass final
       return;
 
     if (reorderingStrategy == ReorderWorkgroupsStrategy::Swizzle &&
-        logSwizzleTile == 0)
+        reorderWgLogTileSize == 0)
+      return;
+
+    if (reorderingStrategy == ReorderWorkgroupsStrategy::ChipletGroup &&
+        reorderWgLogTileSize == 0)
       return;
 
     FunctionOpInterface funcOp = getOperation();
@@ -229,7 +375,20 @@ struct ReorderWorkgroupsPass final
       llvm::dbgs() << "\n\n";
     });
 
-    if (failed(reorderWorkgroupsInFunc(funcOp, reorderingStrategy, logTile))) {
+    uint32_t numXCDs = 1;
+    if (IREE::GPU::TargetAttr attr = getGPUTargetAttr(funcOp)) {
+      if (IREE::GPU::TargetChipAttr chipAttr = attr.getChip()) {
+        numXCDs = chipAttr.getChipletCount();
+      }
+    }
+
+    LLVM_DEBUG(llvm::dbgs() << "Number of XCDs = " << numXCDs << "\n");
+    if (numXCDs == 1 &&
+        reorderingStrategy == ReorderWorkgroupsStrategy::ChipletGroup)
+      return;
+
+    if (failed(reorderWorkgroupsInFunc(funcOp, reorderingStrategy,
+                                       reorderWgLogTileSize, numXCDs))) {
       LLVM_DEBUG(llvm::dbgs() << "Failed to reorder workgroups\n");
       return;
     }
@@ -244,16 +403,16 @@ struct ReorderWorkgroupsPass final
 private:
   ReorderWorkgroupsStrategy reorderingStrategy =
       ReorderWorkgroupsStrategy::None;
-  unsigned logSwizzleTile = 0;
+  unsigned reorderWgLogTileSize = 0;
   std::function<LogicalResult(mlir::FunctionOpInterface)> filterFn;
 };
 } // namespace
 
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createReorderWorkgroups(
-    ReorderWorkgroupsStrategy strategy, unsigned swizzleLogTile,
+    ReorderWorkgroupsStrategy strategy, unsigned reorderWgLogTile,
     std::function<LogicalResult(mlir::FunctionOpInterface)> filterFn) {
-  return std::make_unique<ReorderWorkgroupsPass>(strategy, swizzleLogTile,
+  return std::make_unique<ReorderWorkgroupsPass>(strategy, reorderWgLogTile,
                                                  filterFn);
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir
index 906ed23f162b..ab4b4c49a2d8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir
@@ -1,10 +1,17 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=swizzle logTile=3}))" \
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=swizzle logSwizzleTile=3}))" \
 // RUN:   --split-input-file %s | FileCheck --check-prefix=SWIZZLE %s
 
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=transpose}))" \
 // RUN:   --split-input-file %s | FileCheck --check-prefix=TRANSPOSE %s
 
-func.func @matmul() {
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=chipletgroup logChipletgroupTile=3}))" \
+// RUN:   --split-input-file %s | FileCheck --check-prefix=CHIPLETGROUP %s
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8,
+storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F16_16x16x16_F32>, <MFMA_F16_32x32x8_F32>],
+subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>,
+chip = <wgp_count = 304, chiplet_count = 8>>, ukernels = "none"}>
+func.func @matmul() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c96 = arith.constant 96 : index
@@ -55,6 +62,41 @@ func.func @matmul() {
 // SWIZZLE:         %[[S13:.*]] = arith.select %[[S12]], %[[WG_X]], %[[S6]] : index
 // SWIZZLE:         %[[S14:.*]] = arith.select %[[S12]], %[[WG_Y]], %[[S7]] : index
 
+// CHIPLETGROUP-LABEL: func.func @matmul
+// CHIPLETGROUP:         %[[WG_X:.*]] = hal.interface.workgroup.id[0] : index
+// CHIPLETGROUP:         %[[WG_Y:.*]] = hal.interface.workgroup.id[1] : index
+// CHIPLETGROUP:         %[[WG_CNT_X:.*]] = hal.interface.workgroup.count[0] : index
+// CHIPLETGROUP:         %[[WG_CNT_Y:.*]] = hal.interface.workgroup.count[1] : index
+// CHIPLETGROUP:         %[[S0:.*]] = arith.muli %[[WG_Y]], %[[WG_CNT_X]] : index
+// CHIPLETGROUP:         %[[S1:.*]] = arith.addi %[[S0]], %[[WG_X]] : index
+// CHIPLETGROUP:         %[[CST4:.*]] = arith.constant 4 : index
+// CHIPLETGROUP:         %[[WG_CNT:.*]] = arith.muli %[[WG_CNT_X]], %[[WG_CNT_Y]] : index
+// CHIPLETGROUP:         %[[S3:.*]] = arith.divui %[[WG_CNT]], %[[CST4]] : index
+// CHIPLETGROUP:         %[[S4:.*]] = arith.remui %[[S1]], %[[CST4]] : index
+// CHIPLETGROUP:         %[[S5:.*]] = arith.divui %[[S1]], %[[CST4]] : index
+// CHIPLETGROUP:         %[[S6:.*]] = arith.muli %[[S4]], %[[S3]] : index
+// CHIPLETGROUP:         %[[S7:.*]] = arith.addi %[[S5]], %[[S6]] : index
+// CHIPLETGROUP:         %[[CST1:.*]] = arith.constant 1 : index
+// CHIPLETGROUP:         %[[S8:.*]] = arith.subi %[[WG_CNT]], %[[CST1]] : index
+// CHIPLETGROUP:         %[[S9:.*]] = arith.remui %[[WG_CNT]], %[[CST4]] : index
+// CHIPLETGROUP:         %[[S10:.*]] = arith.subi %[[S8]], %[[S9]] : index
+// CHIPLETGROUP:         %[[S11:.*]] = arith.cmpi ugt, %[[S1]], %[[S10]] : index
+// CHIPLETGROUP:         %[[S12:.*]] = arith.select %[[S11]], %[[S1]], %[[S7]] : index
+// CHIPLETGROUP:         %[[CST8:.*]] = arith.constant 8 : index
+// CHIPLETGROUP:         %[[S13:.*]] = arith.muli %[[CST8]], %[[WG_CNT_X]] : index
+// CHIPLETGROUP:         %[[S14:.*]] = arith.divui %[[S12]], %[[S13]] : index
+// CHIPLETGROUP:         %[[S15:.*]] = arith.muli %[[S14]], %[[CST8]] : index
+// CHIPLETGROUP:         %[[S16:.*]] = arith.subi %[[WG_CNT_Y]], %[[S15]] : index
+// CHIPLETGROUP:         %[[S17:.*]] = arith.minui %[[S16]], %[[CST8]] : index
+// CHIPLETGROUP:         %[[S18:.*]] = arith.remui %[[S12]], %[[S17]] : index
+// CHIPLETGROUP:         %[[S19:.*]] = arith.addi %[[S15]], %[[S18]] : index
+// CHIPLETGROUP:         %[[S20:.*]] = arith.remui %[[S12]], %[[S13]] : index
+// CHIPLETGROUP:         %[[S21:.*]] = arith.divui %[[S20]], %[[S17]] : index
+// CHIPLETGROUP:         %26 = affine.apply #map()[%[[S19]]]
+// CHIPLETGROUP:         %27 = affine.apply #map()[%workgroup_count_y_1]
+// CHIPLETGROUP:         %28 = affine.apply #map()[%[[S21]]]
+// CHIPLETGROUP:         %29 = affine.apply #map()[%workgroup_count_x_0]
+
 // TRANSPOSE-LABEL: func.func @matmul
 // TRANSPOSE:         %[[WG_X:.*]] = hal.interface.workgroup.id[0] : index
 // TRANSPOSE:         %[[WG_Y:.*]] = hal.interface.workgroup.id[1] : index
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir
index 640158da79e2..9563a4aa2a6f 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir
@@ -1,9 +1,12 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=swizzle logTile=3})))))" \
+// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=swizzle logSwizzleTile=3})))))" \
 // RUN:   %s | FileCheck --check-prefix=SWIZZLE %s
 
 // RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=transpose})))))" \
 // RUN:   %s | FileCheck --check-prefix=TRANSPOSE %s
 
+// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=chipletgroup logChipletgroupTile=3})))))" \
+// RUN:   %s | FileCheck --check-prefix=CHIPLETGROUP %s
+
 // Make sure we use static workgroup counts instead of introducting
 // `hal.interface.workgroup.count` ops. These are currently not supported on ROCm.
 
@@ -18,6 +21,25 @@
 // SWIZZLE-DAG:               affine.apply #{{.+}}()[%[[SEL_Y]]]
 // SWIZZLE:                   return
 
+// CHIPLETGROUP-LABEL: hal.executable private @main_dispatch_0 {
+// CHIPLETGROUP-LABEL: func.func @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16
+// CHIPLETGROUP-DAG:               %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
+// CHIPLETGROUP-DAG:               %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
+// CHIPLETGROUP-NOT:               hal.interface.workgroup.count
+// CHIPLETGROUP-DAG:               %[[C250:.+]] = arith.constant 250 : index
+// CHIPLETGROUP-DAG:               %[[C500:.+]] = arith.constant 500 : index
+// CHIPLETGROUP:                   %[[MUL:.+]] = arith.muli %[[WG_Y]], %[[C250]] : index
+// CHIPLETGROUP:                   %[[ADD:.+]] = arith.addi %[[MUL]], %[[WG_X]] : index
+// CHIPLETGROUP:                   %[[CMP:.+]] = arith.cmpi ugt, %[[ADD]], %{{.+}} : index
+// CHIPLETGROUP:                   %[[SELECT:.+]] = arith.select %[[CMP]], %[[ADD]], %{{.+}} : index
+// CHIPLETGROUP:                   %[[REM:.+]] = arith.remui %[[SELECT]], %{{.+}} : index
+// CHIPLETGROUP:                   %[[ADDI:.+]] = arith.addi %{{.+}}, %[[REM]] : index
+// CHIPLETGROUP:                   %[[REMI:.+]] = arith.remui %[[SELECT]], %{{.+}} : index
+// CHIPLETGROUP:                   %[[DIV:.+]] = arith.divui %[[REMI]], %{{.+}} : index
+// CHIPLETGROUP-DAG:               affine.apply #{{.+}}()[%[[ADDI]]]
+// CHIPLETGROUP-DAG:               affine.apply #{{.+}}()[%[[DIV]]]
+// CHIPLETGROUP:                   return
+
 // TRANSPOSE-LABEL: hal.executable private @main_dispatch_0 {
 // TRANSPOSE-LABEL: func.func @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16
 // TRANSPOSE-DAG:               %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
@@ -33,8 +55,13 @@
 // TRANSPOSE-DAG:               affine.apply #{{.+}}()[%[[REM]]]
 // TRANSPOSE:                   return
 
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8,
+storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F16_16x16x16_F32>, <MFMA_F16_32x32x8_F32>],
+subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>,
+chip = <wgp_count = 304, chiplet_count = 8>>, ukernels = "none"}>
 hal.executable private @main_dispatch_0 {
-hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
   hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>, workgroup_size = [64 : index, 16 : index, 1 : index]} {
   ^bb0(%arg0: !hal.device):
     %c250 = arith.constant 250 : index
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
index 5a974d5100c9..4633f173b630 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -269,6 +269,26 @@ def IREEGPU_MmaScheduleAttr : AttrDef<IREEGPU_Dialect, "MMASchedule"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// Workgroup Reordering Attr
+
+def IREEGPU_WorkgroupReorderAttr: AttrDef<IREEGPU_Dialect, "WorkgroupReorderOptions">{
+  let mnemonic = "workgroup_reorder";
+  let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
+
+  string description = [{
+    Options for workgroup reordering strategies to improve L2 cache hit rate.
+  }];
+
+  let parameters = (ins
+    OptionalParameter<"::mlir::iree_compiler::IREE::GPU::ReorderWorkgroupEnum">:$reorder_option,
+    OptionalParameter<"std::optional<int64_t>", "the tile size to use in log2">:$log_tile_size
+  );
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+
 //===----------------------------------------------------------------------===//
 // Workgroup processor level description
 
@@ -335,7 +355,7 @@ def IREEGPU_TargetChipAttr : AttrDef<IREEGPU_Dialect, "TargetChip"> {
 
   let parameters = (ins
     "uint32_t":$wgp_count,
-
+    "uint32_t":$chiplet_count,
     // An optional extra dict
     // This field allows to inject more features/limits not supported in the
     // above list for better flexibility.
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
index a7abbb65ceb6..cd1da79cfe2f 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
@@ -148,4 +148,20 @@ def IREEGPU_TilingLevel : IREEGPU_I32MmaEnumAttr<"TilingLevel",
       Lane
     ]>;
 
+//===----------------------------------------------------------------------===//
+// Workgroup reordering strategies
+
+def None : I32EnumAttrCase<"none", 0>;
+def Transpose :I32EnumAttrCase<"transpose", 1>;
+def Swizzle : I32EnumAttrCase<"swizzle", 2>;
+def Chipletgroup : I32EnumAttrCase<"chipletgroup", 3>;
+
+def IREEGPU_ReorderWorkgroupEnum : IREEGPU_I32MmaEnumAttr<"ReorderWorkgroupEnum",
+    "Descriptor for strategies of reordering workgroups on GPUs", [
+      None,
+      Transpose,
+      Swizzle,
+      Chipletgroup
+    ]>;
+
 #endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IREEGPUENUMS
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir
index baa47b2be12e..25bf9f0b59cf 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir
@@ -44,9 +44,10 @@ func.func @test_target_wgp_none() attributes {
 // CHECK-LABEL: func.func @test_target_chip()
 func.func @test_target_chip() attributes {
   // CHECK: #iree_gpu.target_chip<
-  // CHECK-SAME: wgp_count = 304>
+  // CHECK-SAME: wgp_count = 304, chiplet_count = 8>
   chip = #iree_gpu.target_chip<
-    wgp_count = 304
+    wgp_count = 304,
+    chiplet_count = 8
   >
 } { return }
 
@@ -68,6 +69,6 @@ func.func @test_target() attributes {
       max_workgroup_sizes = [1024, 1024, 1024],
       max_thread_count_per_workgroup = 1024,
       max_workgroup_memory_bytes = 65536>,
-    chip = <wgp_count = 304>
+    chip = <wgp_count = 304, chiplet_count = 8>
   >
 } { return }
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
index b6f0aebbd395..7a036fb1b35a 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp
@@ -50,6 +50,7 @@ struct WgpDetails {
 // Chip level feature/limit details
 struct ChipDetails {
   uint32_t wgpCount;
+  uint32_t chipletCount;
 };
 
 // Full target details
@@ -111,7 +112,8 @@ TargetAttr createTargetAttr(const TargetDetails &details, StringRef arch,
   TargetChipAttr targetChip;
   if (details.chip)
     targetChip =
-        TargetChipAttr::get(context, details.chip->wgpCount, DictionaryAttr{});
+        TargetChipAttr::get(context, details.chip->wgpCount,
+                            details.chip->chipletCount, DictionaryAttr{});
 
   return TargetAttr::get(context, arch, features, targetWgp, targetChip);
 }
@@ -198,32 +200,31 @@ std::optional<TargetDetails> getAMDGPUTargetDetails(StringRef target) {
 
   // "AMD Instinct MI300 Series Product Offerings" in Page 23 of
   // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf
-  static const ChipDetails mi300xChip = {304};
-  static const ChipDetails mi300aChip = {228};
+  static const ChipDetails mi300xChip = {304, 8};
+  static const ChipDetails mi300aChip = {228, 6};
 
   // "AMD Instinct MI200 Series Accelerator Product Offerings" in Page 14 of
   // https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
-  static const ChipDetails mi250xChip = {220};
-  static const ChipDetails mi250Chip = {208};
-  static const ChipDetails mi210Chip = {104};
+  static const ChipDetails mi250xChip = {220, 2};
+  static const ChipDetails mi250Chip = {208, 2};
+  static const ChipDetails mi210Chip = {104, 1};
 
   // "AMD CDNA Architecture Compute Units" in Page 5 of
   // https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf
-  static const ChipDetails mi100Chip = {120};
+  static const ChipDetails mi100Chip = {120, 1};
 
-  static const ChipDetails rx7900xtxChip = {96};
-  static const ChipDetails rx7900xtChip = {84};
-  static const ChipDetails rx7800xtChip = {60};
-  static const ChipDetails rx7700xtChip = {54};
+  static const ChipDetails rx7900xtxChip = {96, 1};
+  static const ChipDetails rx7900xtChip = {84, 1};
+  static const ChipDetails rx7800xtChip = {60, 1};
+  static const ChipDetails rx7700xtChip = {54, 1};
 
   // See https://llvm.org/docs/AMDGPUUsage.html#processors for gfxN to
   // cdnaN/rdnaN mapping.
 
   return llvm::StringSwitch<std::optional<TargetDetails>>(target.lower())
-      .Case("mi300x", TargetDetails{cdna3Wgp, &mi300xChip})
+      .Cases("mi300x", "gfx942", TargetDetails{cdna3Wgp, &mi300xChip})
       .Case("mi300a", TargetDetails{cdna3Wgp, &mi300aChip})
-      .Cases("cdna3", "gfx940", "gfx941", "gfx942",
-             TargetDetails{cdna3Wgp, nullptr})
+      .Cases("cdna3", "gfx940", "gfx941", TargetDetails{cdna3Wgp, nullptr})
       .Case("mi250x", TargetDetails{cdna2Wgp, &mi250xChip})
       .Case("mi250", TargetDetails{cdna2Wgp, &mi250Chip})
       .Case("mi210", TargetDetails{cdna2Wgp, &mi210Chip})
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index 2fb329579986..0038da555f07 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -92,21 +92,30 @@ getPipelineOptions(FunctionOpInterface funcOp,
       // Get the workgroups reorder config and enable the workgroup reordering.
       Attribute reorderWorkgroupOption =
           config.get(LLVMGPUAttrNames::kReorderWorkgroups);
-      if (!isa<StringAttr>(reorderWorkgroupOption))
-        funcOp.emitOpError() << "'" << LLVMGPUAttrNames::kReorderWorkgroups
-                             << "' is expected to be a string attribute";
-      StringRef reorderStr = llvm::cast<StringAttr>(reorderWorkgroupOption);
-      if (reorderStr == "transpose") {
-        pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Transpose;
-      } else if (reorderStr == "swizzle") {
-        pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Swizzle;
-      } else {
-        if (reorderStr != "none")
-          funcOp.emitOpError()
-              << "Unknown " << LLVMGPUAttrNames::kReorderWorkgroups
-              << "value: " << reorderWorkgroupOption;
-        else
+      if (isa<IREE::GPU::WorkgroupReorderOptionsAttr>(reorderWorkgroupOption)) {
+        IREE::GPU::WorkgroupReorderOptionsAttr ReorderOption =
+            llvm::dyn_cast<IREE::GPU::WorkgroupReorderOptionsAttr>(
+                reorderWorkgroupOption);
+        pipelineOptions.reorderWgLogTileSize = ReorderOption.getLogTileSize();
+        switch (ReorderOption.getReorderOption()) {
+        case IREE::GPU::ReorderWorkgroupEnum::none:
           pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::None;
+          break;
+        case IREE::GPU::ReorderWorkgroupEnum::transpose:
+          pipelineOptions.reorderStrategy =
+              ReorderWorkgroupsStrategy::Transpose;
+          break;
+        case IREE::GPU::ReorderWorkgroupEnum::swizzle:
+          pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Swizzle;
+          break;
+        case IREE::GPU::ReorderWorkgroupEnum::chipletgroup:
+          pipelineOptions.reorderStrategy =
+              ReorderWorkgroupsStrategy::ChipletGroup;
+          break;
+        default:
+          funcOp.emitOpError(
+              "unsupported workgroup reordering option on GPU target.");
+        }
       }
     }
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index a417e310a909..30f44d2065ea 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -7,6 +7,7 @@
 #include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h"
 
 #include <cstdint>
+#include <string>
 
 #include "iree-dialects/Dialect/LinalgTransform/Passes.h"
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
@@ -57,12 +58,14 @@ static llvm::cl::opt<ReorderWorkgroupsStrategy> clReorderWorkgroupsStrategy(
                                 "No workgroup reordering"),
                      clEnumValN(ReorderWorkgroupsStrategy::Swizzle, "swizzle",
                                 "Swizzle"),
+                     clEnumValN(ReorderWorkgroupsStrategy::ChipletGroup,
+                                "chipletgroup", "ChipletGroup"),
                      clEnumValN(ReorderWorkgroupsStrategy::Transpose,
                                 "transpose", "Transpose")),
     llvm::cl::init(ReorderWorkgroupsStrategy::None));
 
-static llvm::cl::opt<unsigned> clReorderWorkgroupsLogSwizzleTile(
-    "iree-codegen-reorder-workgroups-log-swizzle-tile",
+static llvm::cl::opt<unsigned> clReorderWorkgroupsLogTile(
+    "iree-codegen-reorder-workgroups-log-tile",
     llvm::cl::desc("Reorder workgroups: log tile size to use"),
     llvm::cl::init(3));
 
@@ -85,6 +88,9 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
       reorderStr = "transpose";
     } else if (options.reorderStrategy == ReorderWorkgroupsStrategy::Swizzle) {
       reorderStr = "swizzle";
+    } else if (options.reorderStrategy ==
+               ReorderWorkgroupsStrategy::ChipletGroup) {
+      reorderStr = "chilpletgroup";
     } else {
       assert(options.reorderStrategy == ReorderWorkgroupsStrategy::None &&
              "Unhandled reorder option");
@@ -95,6 +101,7 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
   return os << "{" << "enableReduceSharedMemoryBankConflicts = "
             << options.enableReduceSharedMemoryBankConflicts
             << ", reorderWorkgroupsStrategy = " << reorderStr
+            << ", reorderWorkgroupsTileSize = " << options.reorderWgLogTileSize
             << ", enableUkernels = " << options.enableUkernels << "}";
 }
 
@@ -206,6 +213,13 @@ static ReorderWorkgroupsStrategy getReorderWorkgroupsStrategy(
   return option.value_or(clReorderWorkgroupsStrategy);
 }
 
+// Reconciles log2 of the workgroup reordering tile size based on the pipeline
+// `option` and the CLI flag.
+static unsigned getReorderWorkgroupsLogTileSize(std::optional<int64_t> option) {
+  int64_t logTile = option.value_or(clReorderWorkgroupsLogTile);
+  assert(logTile >= 0);
+  return static_cast<unsigned>(logTile);
+}
 //===----------------------------------------------------------------------===//
 // Common Pass Recipes
 //===----------------------------------------------------------------------===//
@@ -449,9 +463,10 @@ void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
 
   ReorderWorkgroupsStrategy reorderStrategy =
       getReorderWorkgroupsStrategy(options.reorderStrategy);
+  unsigned reorderWgLogTileSize =
+      getReorderWorkgroupsLogTileSize(options.reorderWgLogTileSize);
   funcPassManager.addPass(createReorderWorkgroups(
-      reorderStrategy, clReorderWorkgroupsLogSwizzleTile,
-      canReorderWorkgroups));
+      reorderStrategy, reorderWgLogTileSize, canReorderWorkgroups));
 
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
@@ -498,9 +513,10 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
 
   ReorderWorkgroupsStrategy reorderStrategy =
       getReorderWorkgroupsStrategy(options.reorderStrategy);
+  unsigned reorderWgLogTileSize =
+      getReorderWorkgroupsLogTileSize(options.reorderWgLogTileSize);
   funcPassManager.addPass(createReorderWorkgroups(
-      reorderStrategy, clReorderWorkgroupsLogSwizzleTile,
-      canReorderWorkgroups));
+      reorderStrategy, reorderWgLogTileSize, canReorderWorkgroups));
 
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
@@ -566,9 +582,10 @@ void addGPUMatmulTensorCoreMmaSyncPassPipeline(
 
   ReorderWorkgroupsStrategy reorderStrategy =
       getReorderWorkgroupsStrategy(options.reorderStrategy);
+  unsigned reorderWgLogTileSize =
+      getReorderWorkgroupsLogTileSize(options.reorderWgLogTileSize);
   funcPassManager.addPass(createReorderWorkgroups(
-      reorderStrategy, clReorderWorkgroupsLogSwizzleTile,
-      canReorderWorkgroups));
+      reorderStrategy, reorderWgLogTileSize, canReorderWorkgroups));
 
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
@@ -725,9 +742,10 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
 
   ReorderWorkgroupsStrategy reorderStrategy =
       getReorderWorkgroupsStrategy(options.reorderStrategy);
+  unsigned reorderWgLogTileSize =
+      getReorderWorkgroupsLogTileSize(options.reorderWgLogTileSize);
   funcPassManager.addPass(createReorderWorkgroups(
-      reorderStrategy, clReorderWorkgroupsLogSwizzleTile,
-      canReorderWorkgroups));
+      reorderStrategy, reorderWgLogTileSize, canReorderWorkgroups));
 
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 0492d49f6dee..17802197fd0b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -26,7 +26,7 @@ namespace mlir::iree_compiler {
 /// attribute. These are used to override default pass heuristics at the
 /// function granularity.
 namespace LLVMGPUAttrNames {
-inline constexpr StringLiteral kReorderWorkgroups = "reorder_workgroups";
+inline constexpr StringLiteral kReorderWorkgroups = "workgroup_reorder";
 inline constexpr StringLiteral kNoReduceSharedMemoryBankConflicts =
     "no_reduce_shared_memory_bank_conflicts";
 } //  namespace LLVMGPUAttrNames
@@ -35,6 +35,7 @@ struct LLVMGPUPipelineOptions {
   bool enableReduceSharedMemoryBankConflicts = true;
   bool enableUkernels = false;
   std::optional<ReorderWorkgroupsStrategy> reorderStrategy;
+  std::optional<int64_t> reorderWgLogTileSize;
 };
 
 llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
index ad80fdf2498b..5ba79f62b106 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
@@ -2,7 +2,7 @@
 // RUN:   --iree-codegen-reorder-workgroups-strategy=transpose \
 // RUN:   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-OUT
 
-// Check that applying `reorder_workgroups` enables or disables workgroup reordering.
+// Check that applying `workgroup_reorder` enables or disables workgroup reordering.
 
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --iree-codegen-llvmgpu-use-vector-distribution \
 // RUN:   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-IN
@@ -16,6 +16,11 @@
 // OPT-IN:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
 // OPT-IN-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
 // OPT-IN-SAME:    no_reduce_shared_memory_bank_conflicts
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8,
+storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F16_16x16x16_F32>, <MFMA_F16_32x32x8_F32>],
+subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>,
+chip = <wgp_count = 304, chiplet_count = 8>>, ukernels = "none"}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -23,7 +28,7 @@
   ]>
 ]>
 hal.executable public @main_0_dispatch_0 {
-  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+  hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
     hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout)
     attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
     ^bb0(%arg0: !hal.device):
@@ -83,15 +88,21 @@ hal.executable public @main_0_dispatch_0 {
 
 // -----
 
-// Check that applying the `reorder_workgroups = transpose` unit attribute enables workgroup reordering.
+// Check that applying the `workgroup_reorder = transpose` unit attribute enables workgroup reordering.
 
 // OPT-OUT:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
 // OPT-OUT-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
-// OPT-OUT-SAME:    reorder_workgroups = "transpose"
+// OPT-OUT-SAME:    workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = transpose>
 
 // OPT-IN:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
 // OPT-IN-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
-// OPT-IN-SAME:    reorder_workgroups = "transpose"
+// OPT-IN-SAME:    workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = transpose>
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8,
+storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F16_16x16x16_F32>, <MFMA_F16_32x32x8_F32>],
+subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>,
+chip = <wgp_count = 304, chiplet_count = 8>>, ukernels = "none"}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -99,7 +110,7 @@ hal.executable public @main_0_dispatch_0 {
   ]>
 ]>
 hal.executable public @main_0_dispatch_0 {
-  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+  hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
     hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout)
     attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
     ^bb0(%arg0: !hal.device):
@@ -127,7 +138,7 @@ hal.executable public @main_0_dispatch_0 {
       func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
         attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
           mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2>,
-          reorder_workgroups = "transpose"  // enable the 'reorderWorkgroups' pass.
+          workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option = "transpose">  // enable the 'reorderWorkgroups' pass.
         }>} {
         %cst = arith.constant 0.000000e+00 : f16
         %c0 = arith.constant 0 : index
@@ -159,11 +170,16 @@ hal.executable public @main_0_dispatch_0 {
 }
 
 // -----
-// Check that applying the `reorder_workgroups = none` unit attribute disables workgroup reordering.
+// Check that applying the `workgroup_reorder = none` unit attribute disables workgroup reordering.
 
 // OPT-OUT:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
 // OPT-OUT-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>,
-// OPT-OUT-SAME:    reorder_workgroups = "none"
+// OPT-OUT-SAME:    workgroup_reorder = #iree_gpu.workgroup_reorder<>
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+{iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8,
+storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F16_16x16x16_F32>, <MFMA_F16_32x32x8_F32>],
+subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>,
+chip = <wgp_count = 304, chiplet_count = 8>>, ukernels = "none"}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -171,7 +187,7 @@ hal.executable public @main_0_dispatch_0 {
   ]>
 ]>
 hal.executable public @main_0_dispatch_0 {
-  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+  hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
     hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout)
     attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} {
     ^bb0(%arg0: !hal.device):
@@ -188,7 +204,7 @@ hal.executable public @main_0_dispatch_0 {
       func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
         attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
           mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F16_16x16x16_F32>, subgroup_m_count = 2, subgroup_n_count = 2>,
-          reorder_workgroups = "none"  // Disable the 'reorderWorkgroups' pass.
+          workgroup_reorder = #iree_gpu.workgroup_reorder<reorder_option="none">  // Disable the 'reorderWorkgroups' pass.
         }>} {
         %cst = arith.constant 0.000000e+00 : f16
         %c0 = arith.constant 0 : index