iree-org · bangtianliu · Jun 26, 2024 · Jul 4, 2024 · Jul 9, 2024 · Jul 9, 2024
@@ -109,7 +109,7 @@ createConvertVectorReductionToGPUPass(
     bool expandSubgroupReduction = true,
     std::function<int(mlir::FunctionOpInterface)> getWarpSize = nullptr);
 
-enum class ReorderWorkgroupsStrategy { None, Swizzle, Transpose };
+enum class ReorderWorkgroupsStrategy { None, ChipletGroup, Swizzle, Transpose };
 
 /// Reorders workgroup IDs.
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>

@@ -199,10 +199,11 @@ def ReorderWorkgroupsPass :
   let dependentDialects = ["::mlir::affine::AffineDialect"];
   let options = [
     Option<"strategy", "strategy", "std::string", /*default=*/"",
-           "Workgroup reordering strategy, one of: '' (none),  'transpose', 'swizzle'">,
+           "Workgroup reordering strategy, one of: '' (none),  'transpose', 'swizzle', 'chipletgroup'">,
     Option<"logTile", "logTile", "unsigned",
             /*default=*/"0",
-           "The log2 of the tile size used for swizzling. (0: disabled, non-0: swizzling enabled)">,
+           "The log2 of the tile size used for swizzling and chipletgroup. "
+           "(0: disabled, non-0: swizzling/chipletgroup enabled)">,
   ];
 }
 

@@ -68,6 +68,95 @@ makeSwizzledIds(Location loc, OpBuilder b, Value workgroupIdX,
   return {swizzledIdX, swizzledIdY};
 }
 
+// Reordering to make workgroup IDs move slowly between chiplet groups: IDs with equal (id % numChipletsPerGroup) become contiguous.
+static Value chipletAwareWorkgroupReordering(Location loc, OpBuilder b,
+                                             Value linearizedId,
+                                             Value workgroupCountX,
+                                             Value workgroupCountY,
+                                             int64_t numChipletsPerGroup) {
+  Value numChipletsVal =
+      b.createOrFold<arith::ConstantIndexOp>(loc, numChipletsPerGroup);
+  Value workgroupCount =
+      b.create<arith::MulIOp>(loc, workgroupCountX, workgroupCountY);
+  Value workgroupCountPerChiplet =
+      b.create<arith::DivUIOp>(loc, workgroupCount, numChipletsVal);
+  Value chipletId = b.create<arith::RemUIOp>(loc, linearizedId, numChipletsVal);
+  Value wgIdWithinChiplet =
+      b.create<arith::DivUIOp>(loc, linearizedId, numChipletsVal);
+  Value reorderedId = b.create<arith::AddIOp>(
+      loc, wgIdWithinChiplet,
+      b.create<arith::MulIOp>(loc, chipletId, workgroupCountPerChiplet));
+
+  // Handle the remainder: IDs past the last full group of numChipletsPerGroup stay unchanged.
+
+  Value constOne = b.createOrFold<arith::ConstantIndexOp>(loc, 1);
+  Value lastWorkgroupId =
+      b.create<arith::SubIOp>(loc, workgroupCount, constOne);
+  Value modulatedLastWorkgroupId = b.create<arith::SubIOp>(
+      loc, lastWorkgroupId,
+      b.create<arith::RemUIOp>(loc, workgroupCount, numChipletsVal));
+  Value isGreaterThanFinalWorkgroupId = b.create<arith::CmpIOp>(
+      loc, arith::CmpIPredicate::ugt, linearizedId, modulatedLastWorkgroupId);
+  linearizedId = b.create<arith::SelectOp>(loc, isGreaterThanFinalWorkgroupId,
+                                           linearizedId, reorderedId);
+
+  return linearizedId;
+}
+
+// Chiplet-aware workgroup reordering strategy: reordering + super-grouping.
+// Step 1: Reorder the workgroup grid to move slowly between
+// chiplet groups (Function: chipletAwareWorkgroupReordering).
+// Step 2: Implement 'super-grouping' of workgroups before switching to the next
+// column. Returns the remapped {x, y} workgroup IDs; chipletGroupTile is the row-group size.
+static std::pair<Value, Value>
+makeChipletGroupedIds(Location loc, OpBuilder b, Value workgroupIdX,
+                      Value workgroupIdY, Value workgroupCountX,
+                      Value workgroupCountY, unsigned chipletGroupTile) {
+  // Create one dimension ID for workgroup
-  // Create one dimension ID for workgroup
+  // Create one dimension ID for workgroup.
-  // Create one dimension ID for workgroup
+  // Create one dimension ID for workgroup.
+  Value linearized =
+      b.create<arith::MulIOp>(loc, workgroupIdY, workgroupCountX);
+  linearized = b.create<arith::AddIOp>(loc, linearized, workgroupIdX);
+
+  // This value is hardcoded for CDNA3 (MI300X).
+  int64_t numXCDs = 8;
+  // Map chiplets to perform a spatially local tile operation.
+  // Reorder the linearized ID such that every consecutive group of chiplets
+  // is the slowest-changing dimension in the grid.
+  // Empirically found that grouping chiplets gives better locality; numXCDs / 2 is passed as the
+  // group size. NOTE(review): the original note said "two chiplets as a group" - confirm intent.
+  linearized = chipletAwareWorkgroupReordering(
+      loc, b, linearized, workgroupCountX, workgroupCountY, numXCDs / 2);
+
+  // Detailed explanation of the idea behind the implementation below:
+  // the L2 Cache Optimizations subsection in
+  // https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html#
+  // Empirically, rowGroupSize=16 was found to achieve good performance on MI300X.
+  unsigned rowGroupSize = chipletGroupTile;
+  Value rowGroupSizeVal =
+      b.createOrFold<arith::ConstantIndexOp>(loc, rowGroupSize);
+  // Group every `rowGroupSize` workgroups along the Y dimension.
+  // Number of workgroups in one such group (rowGroupSize * workgroupCountX).
+  Value numWorkGroupsPerRowBlock =
+      b.create<arith::MulIOp>(loc, rowGroupSizeVal, workgroupCountX);
+
+  Value groupId =
+      b.create<arith::DivUIOp>(loc, linearized, numWorkGroupsPerRowBlock);
+  Value firstRowID = b.create<arith::MulIOp>(loc, groupId, rowGroupSizeVal);
+
+  Value currentRowGroupSize = b.create<arith::MinUIOp>(
+      loc, b.create<arith::SubIOp>(loc, workgroupCountY, firstRowID),
+      rowGroupSizeVal);
+
+  Value newY = b.create<arith::AddIOp>(
+      loc, firstRowID,
+      b.create<arith::RemUIOp>(loc, linearized, currentRowGroupSize));
+
+  Value newX = b.create<arith::DivUIOp>(
+      loc, b.create<arith::RemUIOp>(loc, linearized, numWorkGroupsPerRowBlock),
+      currentRowGroupSize);
+  return {newX, newY};
+}
+
 /// Transpose IDs, i.e., changes the traversal order from left -> right then
 /// top -> bottom to top -> bottom then left -> right.
 static std::pair<Value, Value> makeTransposedIds(Location loc, OpBuilder b,
@@ -112,11 +201,11 @@ getWorkgroupCountsXY(OpBuilder &builder, FunctionOpInterface funcOp) {
 
 static LogicalResult reorderWorkgroupsInFunc(FunctionOpInterface funcOp,
                                              ReorderWorkgroupsStrategy strategy,
-                                             unsigned swizzleLogTile) {
+                                             unsigned logTile) {
   assert(strategy != ReorderWorkgroupsStrategy::None &&
          "Expected a concrete strategy");
 
-  unsigned swizzleTile = 1u << swizzleLogTile;
+  unsigned reorderWgTileSize = 1u << logTile;
   IREE::HAL::InterfaceWorkgroupIDOp oldXId;
   IREE::HAL::InterfaceWorkgroupIDOp oldYId;
   unsigned numXIdOps = 0;
@@ -153,7 +242,11 @@ static LogicalResult reorderWorkgroupsInFunc(FunctionOpInterface funcOp,
   if (strategy == ReorderWorkgroupsStrategy::Swizzle) {
     std::tie(newWorkgroupIdX, newWorkgroupIdY) =
         makeSwizzledIds(funcOp.getLoc(), builder, workgroupIdX, workgroupIdY,
-                        workgroupCntX, workgroupCntY, swizzleTile);
+                        workgroupCntX, workgroupCntY, reorderWgTileSize);
+  } else if (strategy == ReorderWorkgroupsStrategy::ChipletGroup) {
+    std::tie(newWorkgroupIdX, newWorkgroupIdY) = makeChipletGroupedIds(
+        funcOp.getLoc(), builder, workgroupIdX, workgroupIdY, workgroupCntX,
+        workgroupCntY, reorderWgTileSize);
   } else {
     assert(strategy == ReorderWorkgroupsStrategy::Transpose &&
            "Unhandled strategy");
@@ -186,9 +279,9 @@ namespace {
 struct ReorderWorkgroupsPass final
     : impl::ReorderWorkgroupsPassBase<ReorderWorkgroupsPass> {
   ReorderWorkgroupsPass(
-      ReorderWorkgroupsStrategy strategy, unsigned logSwizzleTile,
+      ReorderWorkgroupsStrategy strategy, unsigned logTile,
       std::function<LogicalResult(mlir::FunctionOpInterface)> filterFn)
-      : reorderingStrategy(strategy), logSwizzleTile(logSwizzleTile),
+      : reorderingStrategy(strategy), reorderWgLogTileSize(logTile),
         filterFn(std::move(filterFn)) {}
 
   LogicalResult initializeOptions(
@@ -197,10 +290,11 @@ struct ReorderWorkgroupsPass final
     if (failed(Pass::initializeOptions(options, errorHandler))) {
       return failure();
     }
-    logSwizzleTile = logTile;
+    reorderWgLogTileSize = logTile;
     auto selectedStrategy =
         llvm::StringSwitch<FailureOr<ReorderWorkgroupsStrategy>>(strategy)
             .Case("", ReorderWorkgroupsStrategy::None)
+            .Case("chipletgroup", ReorderWorkgroupsStrategy::ChipletGroup)
             .Case("swizzle", ReorderWorkgroupsStrategy::Swizzle)
             .Case("transpose", ReorderWorkgroupsStrategy::Transpose)
             .Default(failure());
@@ -216,7 +310,11 @@ struct ReorderWorkgroupsPass final
       return;
 
     if (reorderingStrategy == ReorderWorkgroupsStrategy::Swizzle &&
-        logSwizzleTile == 0)
+        reorderWgLogTileSize == 0)
+      return;
+
+    if (reorderingStrategy == ReorderWorkgroupsStrategy::ChipletGroup &&
+        reorderWgLogTileSize == 0)
       return;
 
     FunctionOpInterface funcOp = getOperation();
@@ -229,7 +327,8 @@ struct ReorderWorkgroupsPass final
       llvm::dbgs() << "\n\n";
     });
 
-    if (failed(reorderWorkgroupsInFunc(funcOp, reorderingStrategy, logTile))) {
+    if (failed(reorderWorkgroupsInFunc(funcOp, reorderingStrategy,
+                                       reorderWgLogTileSize))) {
       LLVM_DEBUG(llvm::dbgs() << "Failed to reorder workgroups\n");
       return;
     }
@@ -244,7 +343,7 @@ struct ReorderWorkgroupsPass final
 private:
   ReorderWorkgroupsStrategy reorderingStrategy =
       ReorderWorkgroupsStrategy::None;
-  unsigned logSwizzleTile = 0;
+  unsigned reorderWgLogTileSize = 0;
   std::function<LogicalResult(mlir::FunctionOpInterface)> filterFn;
 };
 } // namespace

@@ -4,6 +4,9 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=transpose}))" \
 // RUN:   --split-input-file %s | FileCheck --check-prefix=TRANSPOSE %s
 
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=chipletgroup logTile=3}))" \
+// RUN:   --split-input-file %s | FileCheck --check-prefix=CHIPLETGROUP %s
+
 func.func @matmul() {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
@@ -55,6 +58,41 @@ func.func @matmul() {
 // SWIZZLE:         %[[S13:.*]] = arith.select %[[S12]], %[[WG_X]], %[[S6]] : index
 // SWIZZLE:         %[[S14:.*]] = arith.select %[[S12]], %[[WG_Y]], %[[S7]] : index
 
+// CHIPLETGROUP-LABEL: func.func @matmul
+// CHIPLETGROUP:         %[[WG_X:.*]] = hal.interface.workgroup.id[0] : index
+// CHIPLETGROUP:         %[[WG_Y:.*]] = hal.interface.workgroup.id[1] : index
+// CHIPLETGROUP:         %[[WG_CNT_X:.*]] = hal.interface.workgroup.count[0] : index
+// CHIPLETGROUP:         %[[WG_CNT_Y:.*]] = hal.interface.workgroup.count[1] : index
+// CHIPLETGROUP:         %[[S0:.*]] = arith.muli %[[WG_Y]], %[[WG_CNT_X]] : index
+// CHIPLETGROUP:         %[[S1:.*]] = arith.addi %[[S0]], %[[WG_X]] : index
+// CHIPLETGROUP:         %[[CST4:.*]] = arith.constant 4 : index
+// CHIPLETGROUP:         %[[WG_CNT:.*]] = arith.muli %[[WG_CNT_X]], %[[WG_CNT_Y]] : index
+// CHIPLETGROUP:         %[[S3:.*]] = arith.divui %[[WG_CNT]], %[[CST4]] : index
+// CHIPLETGROUP:         %[[S4:.*]] = arith.remui %[[S1]], %[[CST4]] : index
+// CHIPLETGROUP:         %[[S5:.*]] = arith.divui %[[S1]], %[[CST4]] : index
+// CHIPLETGROUP:         %[[S6:.*]] = arith.muli %[[S4]], %[[S3]] : index
+// CHIPLETGROUP:         %[[S7:.*]] = arith.addi %[[S5]], %[[S6]] : index
+// CHIPLETGROUP:         %[[CST1:.*]] = arith.constant 1 : index
+// CHIPLETGROUP:         %[[S8:.*]] = arith.subi %[[WG_CNT]], %[[CST1]] : index
+// CHIPLETGROUP:         %[[S9:.*]] = arith.remui %[[WG_CNT]], %[[CST4]] : index
+// CHIPLETGROUP:         %[[S10:.*]] = arith.subi %[[S8]], %[[S9]] : index
+// CHIPLETGROUP:         %[[S11:.*]] = arith.cmpi ugt, %[[S1]], %[[S10]] : index
+// CHIPLETGROUP:         %[[S12:.*]] = arith.select %[[S11]], %[[S1]], %[[S7]] : index
+// CHIPLETGROUP:         %[[CST8:.*]] = arith.constant 8 : index
+// CHIPLETGROUP:         %[[S13:.*]] = arith.muli %[[CST8]], %[[WG_CNT_X]] : index
+// CHIPLETGROUP:         %[[S14:.*]] = arith.divui %[[S12]], %[[S13]] : index
+// CHIPLETGROUP:         %[[S15:.*]] = arith.muli %[[S14]], %[[CST8]] : index
+// CHIPLETGROUP:         %[[S16:.*]] = arith.subi %[[WG_CNT_Y]], %[[S15]] : index
+// CHIPLETGROUP:         %[[S17:.*]] = arith.minui %[[S16]], %[[CST8]] : index
+// CHIPLETGROUP:         %[[S18:.*]] = arith.remui %[[S12]], %[[S17]] : index
+// CHIPLETGROUP:         %[[S19:.*]] = arith.addi %[[S15]], %[[S18]] : index
+// CHIPLETGROUP:         %[[S20:.*]] = arith.remui %[[S12]], %[[S13]] : index
+// CHIPLETGROUP:         %[[S21:.*]] = arith.divui %[[S20]], %[[S17]] : index
+// CHIPLETGROUP:         %{{.+}} = affine.apply #{{.+}}()[%[[S19]]]
+// CHIPLETGROUP:         %{{.+}} = affine.apply #{{.+}}()[%workgroup_count_y{{.*}}]
+// CHIPLETGROUP:         %{{.+}} = affine.apply #{{.+}}()[%[[S21]]]
+// CHIPLETGROUP:         %{{.+}} = affine.apply #{{.+}}()[%workgroup_count_x{{.*}}]
+
 // TRANSPOSE-LABEL: func.func @matmul
 // TRANSPOSE:         %[[WG_X:.*]] = hal.interface.workgroup.id[0] : index
 // TRANSPOSE:         %[[WG_Y:.*]] = hal.interface.workgroup.id[1] : index

@@ -4,6 +4,9 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=transpose})))))" \
 // RUN:   %s | FileCheck --check-prefix=TRANSPOSE %s
 
+// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=chipletgroup logTile=3})))))" \
+// RUN:   %s | FileCheck --check-prefix=CHIPLETGROUP %s
+
 // Make sure we use static workgroup counts instead of introducing
 // `hal.interface.workgroup.count` ops. These are currently not supported on ROCm.
 
@@ -18,6 +21,25 @@
 // SWIZZLE-DAG:               affine.apply #{{.+}}()[%[[SEL_Y]]]
 // SWIZZLE:                   return
 
+// CHIPLETGROUP-LABEL: hal.executable private @main_dispatch_0 {
+// CHIPLETGROUP-LABEL: func.func @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16
+// CHIPLETGROUP-DAG:               %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
+// CHIPLETGROUP-DAG:               %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
+// CHIPLETGROUP-NOT:               hal.interface.workgroup.count
+// CHIPLETGROUP-DAG:               %[[C250:.+]] = arith.constant 250 : index
+// CHIPLETGROUP-DAG:               %[[C500:.+]] = arith.constant 500 : index
+// CHIPLETGROUP:                   %[[MUL:.+]] = arith.muli %[[WG_Y]], %[[C250]] : index
+// CHIPLETGROUP:                   %[[ADD:.+]] = arith.addi %[[MUL]], %[[WG_X]] : index
+// CHIPLETGROUP:                   %[[CMP:.+]] = arith.cmpi ugt, %[[ADD]], %{{.+}} : index
+// CHIPLETGROUP:                   %[[SELECT:.+]] = arith.select %[[CMP]], %[[ADD]], %{{.+}} : index
+// CHIPLETGROUP:                   %[[REM:.+]] = arith.remui %[[SELECT]], %{{.+}} : index
+// CHIPLETGROUP:                   %[[ADDI:.+]] = arith.addi %{{.+}}, %[[REM]] : index
+// CHIPLETGROUP:                   %[[REMI:.+]] = arith.remui %[[SELECT]], %{{.+}} : index
+// CHIPLETGROUP:                   %[[DIV:.+]] = arith.divui %[[REMI]], %{{.+}} : index
+// CHIPLETGROUP-DAG:               affine.apply #{{.+}}()[%[[ADDI]]]
+// CHIPLETGROUP-DAG:               affine.apply #{{.+}}()[%[[DIV]]]
+// CHIPLETGROUP:                   return
+
 // TRANSPOSE-LABEL: hal.executable private @main_dispatch_0 {
 // TRANSPOSE-LABEL: func.func @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16
 // TRANSPOSE-DAG:               %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index

@@ -269,6 +269,27 @@ def IREEGPU_MmaScheduleAttr : AttrDef<IREEGPU_Dialect, "MMASchedule"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// Workgroup Reordering Attr: a ReorderWorkgroupEnum strategy plus an optional tile size.
+
+def IREEGPU_WorkGroupReorderAttr: AttrDef<IREEGPU_Dialect, "WorkgroupReorderOptions">{
+  let mnemonic = "reorder_workgroups";
+  let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
+
+  string description = [{
+    options for workgroup reordering strategies to improve L2 cache hit rate
+    and thus provide performance improvement.
+  }];
+  // NOTE(review): getPipelineOptions stores $tileSize into reorderWgLogTileSize (a log2 value) - confirm the "tile size" wording.
+  let parameters = (ins
+    "::mlir::iree_compiler::IREE::GPU::ReorderWorkgroupEnum":$reorder_option,
+    OptionalParameter<"std::optional<int64_t>", "the tile size to use">:$tileSize
+  );
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+
 //===----------------------------------------------------------------------===//
 // Workgroup processor level description
 

@@ -148,4 +148,20 @@ def IREEGPU_TilingLevel : IREEGPU_I32MmaEnumAttr<"TilingLevel",
       Lane
     ]>;
 
+//===----------------------------------------------------------------------===//
+// Workgroup reordering strategies. The case symbols are the C++ enumerator names (e.g. ReorderWorkgroupEnum::none).
+
+def None : I32EnumAttrCase<"none", 0>;
+def Transpose :I32EnumAttrCase<"transpose", 1>;
+def Swizzle : I32EnumAttrCase<"swizzle", 2>;
+def Chipletgroup : I32EnumAttrCase<"chipletgroup", 3>;
+
+def IREEGPU_ReorderWorkgroupEnum : IREEGPU_I32MmaEnumAttr<"ReorderWorkgroupEnum",
+    "Descriptor for strategies of reordering workgroups on GPUs", [
+      None,
+      Transpose,
+      Swizzle,
+      Chipletgroup
+    ]>;
+
 #endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IREEGPUENUMS
@@ -92,21 +92,31 @@ getPipelineOptions(FunctionOpInterface funcOp,
       // Get the workgroups reorder config and enable the workgroup reordering.
       Attribute reorderWorkgroupOption =
           config.get(LLVMGPUAttrNames::kReorderWorkgroups);
-      if (!isa<StringAttr>(reorderWorkgroupOption))
-        funcOp.emitOpError() << "'" << LLVMGPUAttrNames::kReorderWorkgroups
-                             << "' is expected to be a string attribute";
-      StringRef reorderStr = llvm::cast<StringAttr>(reorderWorkgroupOption);
-      if (reorderStr == "transpose") {
-        pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Transpose;
-      } else if (reorderStr == "swizzle") {
-        pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Swizzle;
-      } else {
-        if (reorderStr != "none")
-          funcOp.emitOpError()
-              << "Unknown " << LLVMGPUAttrNames::kReorderWorkgroups
-              << "value: " << reorderWorkgroupOption;
-        else
+      if (llvm::isa<IREE::GPU::WorkgroupReorderOptionsAttr>(
+              reorderWorkgroupOption)) {
+        IREE::GPU::WorkgroupReorderOptionsAttr ReorderOption =
+            llvm::dyn_cast<IREE::GPU::WorkgroupReorderOptionsAttr>(
+                reorderWorkgroupOption);
+        pipelineOptions.reorderWgLogTileSize = ReorderOption.getTileSize();
+        switch (ReorderOption.getReorderOption()) {
+        case IREE::GPU::ReorderWorkgroupEnum::none:
           pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::None;
+          break;
+        case IREE::GPU::ReorderWorkgroupEnum::transpose:
+          pipelineOptions.reorderStrategy =
+              ReorderWorkgroupsStrategy::Transpose;
+          break;
+        case IREE::GPU::ReorderWorkgroupEnum::swizzle:
+          pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Swizzle;
+          break;
+        case IREE::GPU::ReorderWorkgroupEnum::chipletgroup:
+          pipelineOptions.reorderStrategy =
+              ReorderWorkgroupsStrategy::ChipletGroup;
+          break;
+        default:
+          funcOp.emitOpError(
+              "unsupported workgroup reordering option on GPU target.");
+        }
       }
     }
   }