diff --git a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir index 656b5206595b..4a9949dadb9d 100644 --- a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir +++ b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir @@ -541,7 +541,8 @@ module attributes { transform.with_named_sequence } { {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1> - , llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> + , workgroup_reorder = #iree_gpu.workgroup_reorder, + llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> > -> !transform.any_param transform.yield %matmul, %config : !transform.any_op, !transform.any_param } @@ -559,7 +560,8 @@ module attributes { transform.with_named_sequence } { {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 2, subgroup_n_count = 1> - , llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> + , workgroup_reorder = #iree_gpu.workgroup_reorder, + llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> > -> !transform.any_param transform.yield %matmul, %config : !transform.any_op, !transform.any_param } @@ -637,7 +639,7 @@ module attributes { transform.with_named_sequence } { {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 5> - , reorder_workgroups = "transpose"}> + , workgroup_reorder = #iree_gpu.workgroup_reorder}> > -> !transform.any_param transform.yield %conv, %config : !transform.any_op, !transform.any_param } @@ -657,7 +659,7 @@ module attributes { transform.with_named_sequence } { {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 4> - , reorder_workgroups = "transpose", llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> + , workgroup_reorder = #iree_gpu.workgroup_reorder}> > -> !transform.any_param transform.yield %conv, %config : !transform.any_op, !transform.any_param } @@ -677,7 +679,7 @@ module attributes { transform.with_named_sequence } { {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 5> - , reorder_workgroups = "transpose"}> + , workgroup_reorder = #iree_gpu.workgroup_reorder}> > -> !transform.any_param transform.yield %conv, %config : !transform.any_op, !transform.any_param } @@ -697,7 +699,7 @@ module attributes { transform.with_named_sequence } { {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 5> - , reorder_workgroups = "transpose"}> + , workgroup_reorder = #iree_gpu.workgroup_reorder}> > -> !transform.any_param transform.yield %conv, %config : !transform.any_op, !transform.any_param } @@ -717,7 +719,7 @@ module attributes { transform.with_named_sequence } { {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 4, subgroup_n_count = 2> - , reorder_workgroups = "transpose", llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}> + , workgroup_reorder = #iree_gpu.workgroup_reorder}> > -> !transform.any_param transform.yield %conv, %config : !transform.any_op, !transform.any_param } @@ -737,7 +739,7 @@ module attributes { transform.with_named_sequence } { {mma_schedule = #iree_gpu.mma_schedule< intrinsic = #iree_gpu.mma_layout, subgroup_m_count = 1, subgroup_n_count = 5> - , reorder_workgroups = "transpose"}> + , workgroup_reorder = #iree_gpu.workgroup_reorder}> > -> !transform.any_param transform.yield %conv, %config : !transform.any_op, !transform.any_param } diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir index 0f427c52fd00..f9b16a5728d8 100644 --- a/compiler/plugins/target/ROCM/test/target_device_features.mlir +++ b/compiler/plugins/target/ROCM/test/target_device_features.mlir @@ -9,7 +9,7 @@ // GFX942-SAME: mma = [, ], // GFX942-SAME: subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], // GFX942-SAME: max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>, -// GFX942-SAME: chip = > +// GFX942-SAME: chip = > // GFX940: target = #iree_gpu.target, ] diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h index f985c1234892..1c343151c2e6 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h @@ -109,7 +109,7 @@ createConvertVectorReductionToGPUPass( bool expandSubgroupReduction = true, std::function getWarpSize = nullptr); -enum class ReorderWorkgroupsStrategy { None, Swizzle, Transpose }; +enum class ReorderWorkgroupsStrategy { None, ChipletGroup, Swizzle, Transpose }; /// Reorders workgroup IDs. std::unique_ptr> diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td index 5bb44cd631f1..7ecfae72a118 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td @@ -199,10 +199,15 @@ def ReorderWorkgroupsPass : let dependentDialects = ["::mlir::affine::AffineDialect"]; let options = [ Option<"strategy", "strategy", "std::string", /*default=*/"", - "Workgroup reordering strategy, one of: '' (none), 'transpose', 'swizzle'">, - Option<"logTile", "logTile", "unsigned", + "Workgroup reordering strategy, one of: '' (none), 'transpose', 'swizzle', 'chipletgroup'">, + Option<"logSwizzleTile", "logSwizzleTile", "unsigned", /*default=*/"0", - "The log2 of the tile size used for swizzling. (0: disabled, non-0: swizzling enabled)">, + "The log2 of the tile size used for swizzling. " + "(0: swizzling disabled, non-0: swizzling enabled)">, + Option<"logChipletgroupTile", "logChipletgroupTile", "unsigned", + /*default=*/"0", + "The log2 of the tile size used for chipletgroup. " + "(0: chipletgroup disabled, non-0: chipletgroup enabled)">, ]; } diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupReordering.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupReordering.cpp index fd03448c33db..d9a5f961566b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupReordering.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/WorkgroupReordering.cpp @@ -7,6 +7,7 @@ #include #include "iree/compiler/Codegen/Common/GPU/Passes.h" +#include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Codegen/Utils/Utils.h" #include "iree/compiler/Dialect/HAL/IR/HALOps.h" #include "llvm/ADT/STLExtras.h" @@ -68,6 +69,132 @@ makeSwizzledIds(Location loc, OpBuilder b, Value workgroupIdX, return {swizzledIdX, swizzledIdY}; } +// Reordering to make workgroup ids move slowly between chiplet groups. + +// Example: +// Currently, the GPU launches workgroups in a round-robin fashion across +// each XCD partition on the GPU. +// Assume we have 16 workgroups and XCDPartitionsOnGPU is 4. +// The default GPU schedule will launch workgroups {0, 1, 2, 3, ..., 15} in +// the following order: +// Partition 0: {0, 4, 8, 12} +// Partition 1: {1, 5, 9, 13} +// Partition 2: {2, 6, 10, 14} +// Partition 3: {3, 7, 11, 15} + +// After reordering, the workgroup IDs are {0, 4, 8, 12, 1, ..., 15}, +// resulting in the launch order: +// Partition 0: {0, 1, 2, 3} +// Partition 1: {4, 5, 6, 7} +// Partition 2: {8, 9, 10, 11} +// Partition 3: {12, 13, 14, 15} + +// Returns permuted workgroup id (linearized ID). +// In the above example: +// linearizedId 0's permuted Id is still 0. +// linearizedId 1's permuted Id is 4. +static Value chipletAwareWorkgroupReordering(Location loc, OpBuilder b, + Value linearizedId, + Value workgroupCountX, + Value workgroupCountY, + int64_t XCDParitionsOnGPU) { + // Given: + // Id = linearizedId + // x_dim = workgroupCountX + // y_dim = workgroupCountY + // xcd_count = XCDParitionsOnGPU + + // The new workgroup ID is computed as follows: + // wgp_count = x_dim * y_dim + // reordered_id = (Id / xcd_count) + (Id % xcd_count) * (wgp_count /xcd_count) + // final_id = (Id >= (wgp_count - 1 - (wgp_count % xcd_count))) ? Id : + // reordered_id + + Value numChipletsVal = + b.createOrFold(loc, XCDParitionsOnGPU); + Value workgroupCount = + b.create(loc, workgroupCountX, workgroupCountY); + Value workgroupCountPerChiplet = + b.create(loc, workgroupCount, numChipletsVal); + Value chipletId = b.create(loc, linearizedId, numChipletsVal); + Value wgIdWithinChiplet = + b.create(loc, linearizedId, numChipletsVal); + Value reorderedId = b.create( + loc, wgIdWithinChiplet, + b.create(loc, chipletId, workgroupCountPerChiplet)); + + // Handle the remainder part. + Value constOne = b.createOrFold(loc, 1); + Value lastWorkgroupId = + b.create(loc, workgroupCount, constOne); + Value modulatedLastWorkgroupId = b.create( + loc, lastWorkgroupId, + b.create(loc, workgroupCount, numChipletsVal)); + Value isGreaterThanFinalWorkgroupId = b.create( + loc, arith::CmpIPredicate::ugt, linearizedId, modulatedLastWorkgroupId); + Value finalId = b.create(loc, isGreaterThanFinalWorkgroupId, + linearizedId, reorderedId); + + return finalId; +} + +// Chiplet-aware workgroup reordering strategy: reordering + super-grouping. +// Step 1: Reorder the workgroup grid to move slowly between +// chiplet groups (Function: chipletAwareWorkgroupReordering). +// Step 2: Implement 'super-grouping' of workgroups before switching to the next +// column. +// Returns the permuted workgroup IDs (along X and Y dimension). +static std::pair +makeChipletGroupedIds(Location loc, OpBuilder b, Value workgroupIdX, + Value workgroupIdY, Value workgroupCountX, + Value workgroupCountY, unsigned chipletGroupTile, + unsigned numXCDs) { + // Create one dimension ID for workgroup. + Value linearized = + b.create(loc, workgroupIdY, workgroupCountX); + linearized = b.create(loc, linearized, workgroupIdX); + + assert(numXCDs > 1 && "expected more than one XCD for chiplet reordering"); + // Map chiplets to perform a spatially local tile operation. + // Reorder the linearized ID such that every consecutive group of chiplets + // is the slowest-changing dimension in the grid. + // Empirically found that two chiplets as a group has better locality + // throughout. + linearized = chipletAwareWorkgroupReordering( + loc, b, linearized, workgroupCountX, workgroupCountY, numXCDs / 2); + + // Detailed explanation about the idea behind the below implementation: + // the L2 Cache Optimizations subsection in + // https://triton-lang.org/main/getting-started/tutorials/03-matrix-multiplication.html# + unsigned rowGroupSize = chipletGroupTile; + Value rowGroupSizeVal = + b.createOrFold(loc, rowGroupSize); + + // Empirically, found rowGroupSize=16 for MI300X achieves good performance + // group every 16 workgroups along Y dimension. + + // Number of workgroups in the group. + Value numWorkGroupsPerRowBlock = + b.create(loc, rowGroupSizeVal, workgroupCountX); + + Value groupId = + b.create(loc, linearized, numWorkGroupsPerRowBlock); + Value firstRowID = b.create(loc, groupId, rowGroupSizeVal); + + Value currentRowGroupSize = b.create( + loc, b.create(loc, workgroupCountY, firstRowID), + rowGroupSizeVal); + + Value newY = b.create( + loc, firstRowID, + b.create(loc, linearized, currentRowGroupSize)); + + Value newX = b.create( + loc, b.create(loc, linearized, numWorkGroupsPerRowBlock), + currentRowGroupSize); + return {newX, newY}; +} + /// Transpose IDs, i.e., changes the traversal order from left -> right then /// top -> bottom to top -> bottom then left -> right. static std::pair makeTransposedIds(Location loc, OpBuilder b, @@ -112,11 +239,12 @@ getWorkgroupCountsXY(OpBuilder &builder, FunctionOpInterface funcOp) { static LogicalResult reorderWorkgroupsInFunc(FunctionOpInterface funcOp, ReorderWorkgroupsStrategy strategy, - unsigned swizzleLogTile) { + unsigned logTile, + unsigned numXCDs = 2) { assert(strategy != ReorderWorkgroupsStrategy::None && "Expected a concrete strategy"); - unsigned swizzleTile = 1u << swizzleLogTile; + unsigned reorderWgTileSize = 1u << logTile; IREE::HAL::InterfaceWorkgroupIDOp oldXId; IREE::HAL::InterfaceWorkgroupIDOp oldYId; unsigned numXIdOps = 0; @@ -153,7 +281,13 @@ static LogicalResult reorderWorkgroupsInFunc(FunctionOpInterface funcOp, if (strategy == ReorderWorkgroupsStrategy::Swizzle) { std::tie(newWorkgroupIdX, newWorkgroupIdY) = makeSwizzledIds(funcOp.getLoc(), builder, workgroupIdX, workgroupIdY, - workgroupCntX, workgroupCntY, swizzleTile); + workgroupCntX, workgroupCntY, reorderWgTileSize); + } else if (strategy == ReorderWorkgroupsStrategy::ChipletGroup) { + if (numXCDs <= 1) + return failure(); + std::tie(newWorkgroupIdX, newWorkgroupIdY) = makeChipletGroupedIds( + funcOp.getLoc(), builder, workgroupIdX, workgroupIdY, workgroupCntX, + workgroupCntY, reorderWgTileSize, numXCDs); } else { assert(strategy == ReorderWorkgroupsStrategy::Transpose && "Unhandled strategy"); @@ -186,9 +320,9 @@ namespace { struct ReorderWorkgroupsPass final : impl::ReorderWorkgroupsPassBase { ReorderWorkgroupsPass( - ReorderWorkgroupsStrategy strategy, unsigned logSwizzleTile, + ReorderWorkgroupsStrategy strategy, unsigned logTile, std::function filterFn) - : reorderingStrategy(strategy), logSwizzleTile(logSwizzleTile), + : reorderingStrategy(strategy), reorderWgLogTileSize(logTile), filterFn(std::move(filterFn)) {} LogicalResult initializeOptions( @@ -197,10 +331,11 @@ struct ReorderWorkgroupsPass final if (failed(Pass::initializeOptions(options, errorHandler))) { return failure(); } - logSwizzleTile = logTile; + auto selectedStrategy = llvm::StringSwitch>(strategy) .Case("", ReorderWorkgroupsStrategy::None) + .Case("chipletgroup", ReorderWorkgroupsStrategy::ChipletGroup) .Case("swizzle", ReorderWorkgroupsStrategy::Swizzle) .Case("transpose", ReorderWorkgroupsStrategy::Transpose) .Default(failure()); @@ -208,6 +343,13 @@ struct ReorderWorkgroupsPass final return failure(); reorderingStrategy = *selectedStrategy; + if (reorderingStrategy == ReorderWorkgroupsStrategy::Swizzle && + reorderWgLogTileSize == 0) + reorderWgLogTileSize = logSwizzleTile; + else if (reorderingStrategy == ReorderWorkgroupsStrategy::ChipletGroup && + reorderWgLogTileSize == 0) + reorderWgLogTileSize = logChipletgroupTile; + return success(); } @@ -216,7 +358,11 @@ struct ReorderWorkgroupsPass final return; if (reorderingStrategy == ReorderWorkgroupsStrategy::Swizzle && - logSwizzleTile == 0) + reorderWgLogTileSize == 0) + return; + + if (reorderingStrategy == ReorderWorkgroupsStrategy::ChipletGroup && + reorderWgLogTileSize == 0) return; FunctionOpInterface funcOp = getOperation(); @@ -229,7 +375,20 @@ struct ReorderWorkgroupsPass final llvm::dbgs() << "\n\n"; }); - if (failed(reorderWorkgroupsInFunc(funcOp, reorderingStrategy, logTile))) { + uint32_t numXCDs = 1; + if (IREE::GPU::TargetAttr attr = getGPUTargetAttr(funcOp)) { + if (IREE::GPU::TargetChipAttr chipAttr = attr.getChip()) { + numXCDs = chipAttr.getChipletCount(); + } + } + + LLVM_DEBUG(llvm::dbgs() << "Number of XCDs = " << numXCDs << "\n"); + if (numXCDs == 1 && + reorderingStrategy == ReorderWorkgroupsStrategy::ChipletGroup) + return; + + if (failed(reorderWorkgroupsInFunc(funcOp, reorderingStrategy, + reorderWgLogTileSize, numXCDs))) { LLVM_DEBUG(llvm::dbgs() << "Failed to reorder workgroups\n"); return; } @@ -244,16 +403,16 @@ struct ReorderWorkgroupsPass final private: ReorderWorkgroupsStrategy reorderingStrategy = ReorderWorkgroupsStrategy::None; - unsigned logSwizzleTile = 0; + unsigned reorderWgLogTileSize = 0; std::function filterFn; }; } // namespace std::unique_ptr> createReorderWorkgroups( - ReorderWorkgroupsStrategy strategy, unsigned swizzleLogTile, + ReorderWorkgroupsStrategy strategy, unsigned reorderWgLogTile, std::function filterFn) { - return std::make_unique(strategy, swizzleLogTile, + return std::make_unique(strategy, reorderWgLogTile, filterFn); } diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir index 906ed23f162b..ab4b4c49a2d8 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups.mlir @@ -1,10 +1,17 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=swizzle logTile=3}))" \ +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=swizzle logSwizzleTile=3}))" \ // RUN: --split-input-file %s | FileCheck --check-prefix=SWIZZLE %s // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=transpose}))" \ // RUN: --split-input-file %s | FileCheck --check-prefix=TRANSPOSE %s -func.func @matmul() { +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=chipletgroup logChipletgroupTile=3}))" \ +// RUN: --split-input-file %s | FileCheck --check-prefix=CHIPLETGROUP %s +#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", +{iree.gpu.target = #iree_gpu.target, ], +subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>, +chip = >, ukernels = "none"}> +func.func @matmul() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} { %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c96 = arith.constant 96 : index @@ -55,6 +62,41 @@ func.func @matmul() { // SWIZZLE: %[[S13:.*]] = arith.select %[[S12]], %[[WG_X]], %[[S6]] : index // SWIZZLE: %[[S14:.*]] = arith.select %[[S12]], %[[WG_Y]], %[[S7]] : index +// CHIPLETGROUP-LABEL: func.func @matmul +// CHIPLETGROUP: %[[WG_X:.*]] = hal.interface.workgroup.id[0] : index +// CHIPLETGROUP: %[[WG_Y:.*]] = hal.interface.workgroup.id[1] : index +// CHIPLETGROUP: %[[WG_CNT_X:.*]] = hal.interface.workgroup.count[0] : index +// CHIPLETGROUP: %[[WG_CNT_Y:.*]] = hal.interface.workgroup.count[1] : index +// CHIPLETGROUP: %[[S0:.*]] = arith.muli %[[WG_Y]], %[[WG_CNT_X]] : index +// CHIPLETGROUP: %[[S1:.*]] = arith.addi %[[S0]], %[[WG_X]] : index +// CHIPLETGROUP: %[[CST4:.*]] = arith.constant 4 : index +// CHIPLETGROUP: %[[WG_CNT:.*]] = arith.muli %[[WG_CNT_X]], %[[WG_CNT_Y]] : index +// CHIPLETGROUP: %[[S3:.*]] = arith.divui %[[WG_CNT]], %[[CST4]] : index +// CHIPLETGROUP: %[[S4:.*]] = arith.remui %[[S1]], %[[CST4]] : index +// CHIPLETGROUP: %[[S5:.*]] = arith.divui %[[S1]], %[[CST4]] : index +// CHIPLETGROUP: %[[S6:.*]] = arith.muli %[[S4]], %[[S3]] : index +// CHIPLETGROUP: %[[S7:.*]] = arith.addi %[[S5]], %[[S6]] : index +// CHIPLETGROUP: %[[CST1:.*]] = arith.constant 1 : index +// CHIPLETGROUP: %[[S8:.*]] = arith.subi %[[WG_CNT]], %[[CST1]] : index +// CHIPLETGROUP: %[[S9:.*]] = arith.remui %[[WG_CNT]], %[[CST4]] : index +// CHIPLETGROUP: %[[S10:.*]] = arith.subi %[[S8]], %[[S9]] : index +// CHIPLETGROUP: %[[S11:.*]] = arith.cmpi ugt, %[[S1]], %[[S10]] : index +// CHIPLETGROUP: %[[S12:.*]] = arith.select %[[S11]], %[[S1]], %[[S7]] : index +// CHIPLETGROUP: %[[CST8:.*]] = arith.constant 8 : index +// CHIPLETGROUP: %[[S13:.*]] = arith.muli %[[CST8]], %[[WG_CNT_X]] : index +// CHIPLETGROUP: %[[S14:.*]] = arith.divui %[[S12]], %[[S13]] : index +// CHIPLETGROUP: %[[S15:.*]] = arith.muli %[[S14]], %[[CST8]] : index +// CHIPLETGROUP: %[[S16:.*]] = arith.subi %[[WG_CNT_Y]], %[[S15]] : index +// CHIPLETGROUP: %[[S17:.*]] = arith.minui %[[S16]], %[[CST8]] : index +// CHIPLETGROUP: %[[S18:.*]] = arith.remui %[[S12]], %[[S17]] : index +// CHIPLETGROUP: %[[S19:.*]] = arith.addi %[[S15]], %[[S18]] : index +// CHIPLETGROUP: %[[S20:.*]] = arith.remui %[[S12]], %[[S13]] : index +// CHIPLETGROUP: %[[S21:.*]] = arith.divui %[[S20]], %[[S17]] : index +// CHIPLETGROUP: %26 = affine.apply #map()[%[[S19]]] +// CHIPLETGROUP: %27 = affine.apply #map()[%workgroup_count_y_1] +// CHIPLETGROUP: %28 = affine.apply #map()[%[[S21]]] +// CHIPLETGROUP: %29 = affine.apply #map()[%workgroup_count_x_0] + // TRANSPOSE-LABEL: func.func @matmul // TRANSPOSE: %[[WG_X:.*]] = hal.interface.workgroup.id[0] : index // TRANSPOSE: %[[WG_Y:.*]] = hal.interface.workgroup.id[1] : index diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir index 640158da79e2..9563a4aa2a6f 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir @@ -1,9 +1,12 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=swizzle logTile=3})))))" \ +// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=swizzle logSwizzleTile=3})))))" \ // RUN: %s | FileCheck --check-prefix=SWIZZLE %s // RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=transpose})))))" \ // RUN: %s | FileCheck --check-prefix=TRANSPOSE %s +// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-codegen-reorder-workgroups{strategy=chipletgroup logChipletgroupTile=3})))))" \ +// RUN: %s | FileCheck --check-prefix=CHIPLETGROUP %s + // Make sure we use static workgroup counts instead of introducting // `hal.interface.workgroup.count` ops. These are currently not supported on ROCm. @@ -18,6 +21,25 @@ // SWIZZLE-DAG: affine.apply #{{.+}}()[%[[SEL_Y]]] // SWIZZLE: return +// CHIPLETGROUP-LABEL: hal.executable private @main_dispatch_0 { +// CHIPLETGROUP-LABEL: func.func @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 +// CHIPLETGROUP-DAG: %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index +// CHIPLETGROUP-DAG: %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index +// CHIPLETGROUP-NOT: hal.interface.workgroup.count +// CHIPLETGROUP-DAG: %[[C250:.+]] = arith.constant 250 : index +// CHIPLETGROUP-DAG: %[[C500:.+]] = arith.constant 500 : index +// CHIPLETGROUP: %[[MUL:.+]] = arith.muli %[[WG_Y]], %[[C250]] : index +// CHIPLETGROUP: %[[ADD:.+]] = arith.addi %[[MUL]], %[[WG_X]] : index +// CHIPLETGROUP: %[[CMP:.+]] = arith.cmpi ugt, %[[ADD]], %{{.+}} : index +// CHIPLETGROUP: %[[SELECT:.+]] = arith.select %[[CMP]], %[[ADD]], %{{.+}} : index +// CHIPLETGROUP: %[[REM:.+]] = arith.remui %[[SELECT]], %{{.+}} : index +// CHIPLETGROUP: %[[ADDI:.+]] = arith.addi %{{.+}}, %[[REM]] : index +// CHIPLETGROUP: %[[REMI:.+]] = arith.remui %[[SELECT]], %{{.+}} : index +// CHIPLETGROUP: %[[DIV:.+]] = arith.divui %[[REMI]], %{{.+}} : index +// CHIPLETGROUP-DAG: affine.apply #{{.+}}()[%[[ADDI]]] +// CHIPLETGROUP-DAG: affine.apply #{{.+}}()[%[[DIV]]] +// CHIPLETGROUP: return + // TRANSPOSE-LABEL: hal.executable private @main_dispatch_0 { // TRANSPOSE-LABEL: func.func @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 // TRANSPOSE-DAG: %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index @@ -33,8 +55,13 @@ // TRANSPOSE-DAG: affine.apply #{{.+}}()[%[[REM]]] // TRANSPOSE: return +#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", +{iree.gpu.target = #iree_gpu.target, ], +subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>, +chip = >, ukernels = "none"}> hal.executable private @main_dispatch_0 { -hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { +hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info, workgroup_size = [64 : index, 16 : index, 1 : index]} { ^bb0(%arg0: !hal.device): %c250 = arith.constant 250 : index diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td index 5a974d5100c9..4633f173b630 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td @@ -269,6 +269,26 @@ def IREEGPU_MmaScheduleAttr : AttrDef { }]; } +//===----------------------------------------------------------------------===// +// Workgroup Reordering Attr + +def IREEGPU_WorkgroupReorderAttr: AttrDef{ + let mnemonic = "workgroup_reorder"; + let cppNamespace = "::mlir::iree_compiler::IREE::GPU"; + + string description = [{ + Options for workgroup reordering strategies to improve L2 cache hit rate. + }]; + + let parameters = (ins + OptionalParameter<"::mlir::iree_compiler::IREE::GPU::ReorderWorkgroupEnum">:$reorder_option, + OptionalParameter<"std::optional", "the tile size to use in log2">:$log_tile_size + ); + + let assemblyFormat = "`<` struct(params) `>`"; +} + + //===----------------------------------------------------------------------===// // Workgroup processor level description @@ -335,7 +355,7 @@ def IREEGPU_TargetChipAttr : AttrDef { let parameters = (ins "uint32_t":$wgp_count, - + "uint32_t":$chiplet_count, // An optional extra dict // This field allows to inject more features/limits not supported in the // above list for better flexibility. diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td index a7abbb65ceb6..cd1da79cfe2f 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td @@ -148,4 +148,20 @@ def IREEGPU_TilingLevel : IREEGPU_I32MmaEnumAttr<"TilingLevel", Lane ]>; +//===----------------------------------------------------------------------===// +// Workgroup reordering strategies + +def None : I32EnumAttrCase<"none", 0>; +def Transpose :I32EnumAttrCase<"transpose", 1>; +def Swizzle : I32EnumAttrCase<"swizzle", 2>; +def Chipletgroup : I32EnumAttrCase<"chipletgroup", 3>; + +def IREEGPU_ReorderWorkgroupEnum : IREEGPU_I32MmaEnumAttr<"ReorderWorkgroupEnum", + "Descriptor for strategies of reordering workgroups on GPUs", [ + None, + Transpose, + Swizzle, + Chipletgroup + ]>; + #endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IREEGPUENUMS diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir index baa47b2be12e..25bf9f0b59cf 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/target_attrs.mlir @@ -44,9 +44,10 @@ func.func @test_target_wgp_none() attributes { // CHECK-LABEL: func.func @test_target_chip() func.func @test_target_chip() attributes { // CHECK: #iree_gpu.target_chip< - // CHECK-SAME: wgp_count = 304> + // CHECK-SAME: wgp_count = 304, chiplet_count = 8> chip = #iree_gpu.target_chip< - wgp_count = 304 + wgp_count = 304, + chiplet_count = 8 > } { return } @@ -68,6 +69,6 @@ func.func @test_target() attributes { max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>, - chip = + chip = > } { return } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp index b6f0aebbd395..7a036fb1b35a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp @@ -50,6 +50,7 @@ struct WgpDetails { // Chip level feature/limit details struct ChipDetails { uint32_t wgpCount; + uint32_t chipletCount; }; // Full target details @@ -111,7 +112,8 @@ TargetAttr createTargetAttr(const TargetDetails &details, StringRef arch, TargetChipAttr targetChip; if (details.chip) targetChip = - TargetChipAttr::get(context, details.chip->wgpCount, DictionaryAttr{}); + TargetChipAttr::get(context, details.chip->wgpCount, + details.chip->chipletCount, DictionaryAttr{}); return TargetAttr::get(context, arch, features, targetWgp, targetChip); } @@ -198,32 +200,31 @@ std::optional getAMDGPUTargetDetails(StringRef target) { // "AMD Instinct MI300 Series Product Offerings" in Page 23 of // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf - static const ChipDetails mi300xChip = {304}; - static const ChipDetails mi300aChip = {228}; + static const ChipDetails mi300xChip = {304, 8}; + static const ChipDetails mi300aChip = {228, 6}; // "AMD Instinct MI200 Series Accelerator Product Offerings" in Page 14 of // https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf - static const ChipDetails mi250xChip = {220}; - static const ChipDetails mi250Chip = {208}; - static const ChipDetails mi210Chip = {104}; + static const ChipDetails mi250xChip = {220, 2}; + static const ChipDetails mi250Chip = {208, 2}; + static const ChipDetails mi210Chip = {104, 1}; // "AMD CDNA Architecture Compute Units" in Page 5 of // https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf - static const ChipDetails mi100Chip = {120}; + static const ChipDetails mi100Chip = {120, 1}; - static const ChipDetails rx7900xtxChip = {96}; - static const ChipDetails rx7900xtChip = {84}; - static const ChipDetails rx7800xtChip = {60}; - static const ChipDetails rx7700xtChip = {54}; + static const ChipDetails rx7900xtxChip = {96, 1}; + static const ChipDetails rx7900xtChip = {84, 1}; + static const ChipDetails rx7800xtChip = {60, 1}; + static const ChipDetails rx7700xtChip = {54, 1}; // See https://llvm.org/docs/AMDGPUUsage.html#processors for gfxN to // cdnaN/rdnaN mapping. return llvm::StringSwitch>(target.lower()) - .Case("mi300x", TargetDetails{cdna3Wgp, &mi300xChip}) + .Cases("mi300x", "gfx942", TargetDetails{cdna3Wgp, &mi300xChip}) .Case("mi300a", TargetDetails{cdna3Wgp, &mi300aChip}) - .Cases("cdna3", "gfx940", "gfx941", "gfx942", - TargetDetails{cdna3Wgp, nullptr}) + .Cases("cdna3", "gfx940", "gfx941", TargetDetails{cdna3Wgp, nullptr}) .Case("mi250x", TargetDetails{cdna2Wgp, &mi250xChip}) .Case("mi250", TargetDetails{cdna2Wgp, &mi250Chip}) .Case("mi210", TargetDetails{cdna2Wgp, &mi210Chip}) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp index 2fb329579986..0038da555f07 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp @@ -92,21 +92,30 @@ getPipelineOptions(FunctionOpInterface funcOp, // Get the workgroups reorder config and enable the workgroup reordering. Attribute reorderWorkgroupOption = config.get(LLVMGPUAttrNames::kReorderWorkgroups); - if (!isa(reorderWorkgroupOption)) - funcOp.emitOpError() << "'" << LLVMGPUAttrNames::kReorderWorkgroups - << "' is expected to be a string attribute"; - StringRef reorderStr = llvm::cast(reorderWorkgroupOption); - if (reorderStr == "transpose") { - pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Transpose; - } else if (reorderStr == "swizzle") { - pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Swizzle; - } else { - if (reorderStr != "none") - funcOp.emitOpError() - << "Unknown " << LLVMGPUAttrNames::kReorderWorkgroups - << "value: " << reorderWorkgroupOption; - else + if (isa(reorderWorkgroupOption)) { + IREE::GPU::WorkgroupReorderOptionsAttr ReorderOption = + llvm::dyn_cast( + reorderWorkgroupOption); + pipelineOptions.reorderWgLogTileSize = ReorderOption.getLogTileSize(); + switch (ReorderOption.getReorderOption()) { + case IREE::GPU::ReorderWorkgroupEnum::none: pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::None; + break; + case IREE::GPU::ReorderWorkgroupEnum::transpose: + pipelineOptions.reorderStrategy = + ReorderWorkgroupsStrategy::Transpose; + break; + case IREE::GPU::ReorderWorkgroupEnum::swizzle: + pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Swizzle; + break; + case IREE::GPU::ReorderWorkgroupEnum::chipletgroup: + pipelineOptions.reorderStrategy = + ReorderWorkgroupsStrategy::ChipletGroup; + break; + default: + funcOp.emitOpError( + "unsupported workgroup reordering option on GPU target."); + } } } } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index a417e310a909..30f44d2065ea 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -7,6 +7,7 @@ #include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h" #include +#include #include "iree-dialects/Dialect/LinalgTransform/Passes.h" #include "iree/compiler/Codegen/Common/GPU/Passes.h" @@ -57,12 +58,14 @@ static llvm::cl::opt clReorderWorkgroupsStrategy( "No workgroup reordering"), clEnumValN(ReorderWorkgroupsStrategy::Swizzle, "swizzle", "Swizzle"), + clEnumValN(ReorderWorkgroupsStrategy::ChipletGroup, + "chipletgroup", "ChipletGroup"), clEnumValN(ReorderWorkgroupsStrategy::Transpose, "transpose", "Transpose")), llvm::cl::init(ReorderWorkgroupsStrategy::None)); -static llvm::cl::opt clReorderWorkgroupsLogSwizzleTile( - "iree-codegen-reorder-workgroups-log-swizzle-tile", +static llvm::cl::opt clReorderWorkgroupsLogTile( + "iree-codegen-reorder-workgroups-log-tile", llvm::cl::desc("Reorder workgroups: log tile size to use"), llvm::cl::init(3)); @@ -85,6 +88,9 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, reorderStr = "transpose"; } else if (options.reorderStrategy == ReorderWorkgroupsStrategy::Swizzle) { reorderStr = "swizzle"; + } else if (options.reorderStrategy == + ReorderWorkgroupsStrategy::ChipletGroup) { + reorderStr = "chilpletgroup"; } else { assert(options.reorderStrategy == ReorderWorkgroupsStrategy::None && "Unhandled reorder option"); @@ -95,6 +101,7 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, return os << "{" << "enableReduceSharedMemoryBankConflicts = " << options.enableReduceSharedMemoryBankConflicts << ", reorderWorkgroupsStrategy = " << reorderStr + << ", reorderWorkgroupsTileSize = " << options.reorderWgLogTileSize << ", enableUkernels = " << options.enableUkernels << "}"; } @@ -206,6 +213,13 @@ static ReorderWorkgroupsStrategy getReorderWorkgroupsStrategy( return option.value_or(clReorderWorkgroupsStrategy); } +// Reconciles log2 of the workgroup reordering tile size based on the pipeline +// `option` and the CLI flag. +static unsigned getReorderWorkgroupsLogTileSize(std::optional option) { + int64_t logTile = option.value_or(clReorderWorkgroupsLogTile); + assert(logTile >= 0); + return static_cast(logTile); +} //===----------------------------------------------------------------------===// // Common Pass Recipes //===----------------------------------------------------------------------===// @@ -449,9 +463,10 @@ void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager, ReorderWorkgroupsStrategy reorderStrategy = getReorderWorkgroupsStrategy(options.reorderStrategy); + unsigned reorderWgLogTileSize = + getReorderWorkgroupsLogTileSize(options.reorderWgLogTileSize); funcPassManager.addPass(createReorderWorkgroups( - reorderStrategy, clReorderWorkgroupsLogSwizzleTile, - canReorderWorkgroups)); + reorderStrategy, reorderWgLogTileSize, canReorderWorkgroups)); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); @@ -498,9 +513,10 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager, ReorderWorkgroupsStrategy reorderStrategy = getReorderWorkgroupsStrategy(options.reorderStrategy); + unsigned reorderWgLogTileSize = + getReorderWorkgroupsLogTileSize(options.reorderWgLogTileSize); funcPassManager.addPass(createReorderWorkgroups( - reorderStrategy, clReorderWorkgroupsLogSwizzleTile, - canReorderWorkgroups)); + reorderStrategy, reorderWgLogTileSize, canReorderWorkgroups)); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); @@ -566,9 +582,10 @@ void addGPUMatmulTensorCoreMmaSyncPassPipeline( ReorderWorkgroupsStrategy reorderStrategy = getReorderWorkgroupsStrategy(options.reorderStrategy); + unsigned reorderWgLogTileSize = + getReorderWorkgroupsLogTileSize(options.reorderWgLogTileSize); funcPassManager.addPass(createReorderWorkgroups( - reorderStrategy, clReorderWorkgroupsLogSwizzleTile, - canReorderWorkgroups)); + reorderStrategy, reorderWgLogTileSize, canReorderWorkgroups)); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); @@ -725,9 +742,10 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager, ReorderWorkgroupsStrategy reorderStrategy = getReorderWorkgroupsStrategy(options.reorderStrategy); + unsigned reorderWgLogTileSize = + getReorderWorkgroupsLogTileSize(options.reorderWgLogTileSize); funcPassManager.addPass(createReorderWorkgroups( - reorderStrategy, clReorderWorkgroupsLogSwizzleTile, - canReorderWorkgroups)); + reorderStrategy, reorderWgLogTileSize, canReorderWorkgroups)); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h index 0492d49f6dee..17802197fd0b 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h @@ -26,7 +26,7 @@ namespace mlir::iree_compiler { /// attribute. These are used to override default pass heuristics at the /// function granularity. namespace LLVMGPUAttrNames { -inline constexpr StringLiteral kReorderWorkgroups = "reorder_workgroups"; +inline constexpr StringLiteral kReorderWorkgroups = "workgroup_reorder"; inline constexpr StringLiteral kNoReduceSharedMemoryBankConflicts = "no_reduce_shared_memory_bank_conflicts"; } // namespace LLVMGPUAttrNames @@ -35,6 +35,7 @@ struct LLVMGPUPipelineOptions { bool enableReduceSharedMemoryBankConflicts = true; bool enableUkernels = false; std::optional reorderStrategy; + std::optional reorderWgLogTileSize; }; llvm::raw_ostream &operator<<(llvm::raw_ostream &os, diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir index ad80fdf2498b..5ba79f62b106 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir @@ -2,7 +2,7 @@ // RUN: --iree-codegen-reorder-workgroups-strategy=transpose \ // RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-OUT -// Check that applying `reorder_workgroups` enables or disables workgroup reordering. +// Check that applying `workgroup_reorder` enables or disables workgroup reordering. // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --iree-codegen-llvmgpu-use-vector-distribution \ // RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-IN @@ -16,6 +16,11 @@ // OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, // OPT-IN-SAME: no_reduce_shared_memory_bank_conflicts +#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", +{iree.gpu.target = #iree_gpu.target, ], +subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>, +chip = >, ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, @@ -23,7 +28,7 @@ ]> ]> hal.executable public @main_0_dispatch_0 { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device): @@ -83,15 +88,21 @@ hal.executable public @main_0_dispatch_0 { // ----- -// Check that applying the `reorder_workgroups = transpose` unit attribute enables workgroup reordering. +// Check that applying the `workgroup_reorder = transpose` unit attribute enables workgroup reordering. // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, -// OPT-OUT-SAME: reorder_workgroups = "transpose" +// OPT-OUT-SAME: workgroup_reorder = #iree_gpu.workgroup_reorder // OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, -// OPT-IN-SAME: reorder_workgroups = "transpose" +// OPT-IN-SAME: workgroup_reorder = #iree_gpu.workgroup_reorder + +#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", +{iree.gpu.target = #iree_gpu.target, ], +subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>, +chip = >, ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, @@ -99,7 +110,7 @@ hal.executable public @main_0_dispatch_0 { ]> ]> hal.executable public @main_0_dispatch_0 { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device): @@ -127,7 +138,7 @@ hal.executable public @main_0_dispatch_0 { func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, - reorder_workgroups = "transpose" // enable the 'reorderWorkgroups' pass. + workgroup_reorder = #iree_gpu.workgroup_reorder // enable the 'reorderWorkgroups' pass. }>} { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index @@ -159,11 +170,16 @@ hal.executable public @main_0_dispatch_0 { } // ----- -// Check that applying the `reorder_workgroups = none` unit attribute disables workgroup reordering. +// Check that applying the `workgroup_reorder = none` unit attribute disables workgroup reordering. // OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info, -// OPT-OUT-SAME: reorder_workgroups = "none" +// OPT-OUT-SAME: workgroup_reorder = #iree_gpu.workgroup_reorder<> +#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", +{iree.gpu.target = #iree_gpu.target, ], +subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536>, +chip = >, ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, @@ -171,7 +187,7 @@ hal.executable public @main_0_dispatch_0 { ]> ]> hal.executable public @main_0_dispatch_0 { - hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) { hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device): @@ -188,7 +204,7 @@ hal.executable public @main_0_dispatch_0 { func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32() attributes {translation_info = #iree_codegen.translation_info, subgroup_m_count = 2, subgroup_n_count = 2>, - reorder_workgroups = "none" // Disable the 'reorderWorkgroups' pass. + workgroup_reorder = #iree_gpu.workgroup_reorder // Disable the 'reorderWorkgroups' pass. }>} { %cst = arith.constant 0.000000e+00 : f16 %c0 = arith.constant 0 : index