[LLVMGPU] Fix linear dim selection in GPUApplyTilingLevel (#17611)
`GPUApplyTilingLevel` skips over dimensions with no tiling (tile size of
zero), but the linear dim mapping of the generated forall loop also
skipped the indices of those dimensions, leaving gaps in the mapping
IDs. This PR fixes the mapping so that the linear dims always increment
by 1, starting from 0.

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
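
For illustration, here is a minimal standalone sketch of the indexing
change (plain C++, no MLIR dependencies; the tile sizes and printed
names are illustrative only, not the actual pass code):

// Before: enumerate-style indexing counts every dimension, so a
// skipped (zero-size) dimension leaves a gap in the linear dim IDs.
// After: a counter that increments only for tiled dimensions keeps
// the IDs contiguous, starting from 0.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> tileSizes = {0, 16}; // dim 0 untiled, dim 1 tiled by 16

  std::printf("before:");
  for (std::size_t i = 0; i < tileSizes.size(); ++i) {
    if (tileSizes[i] != 0)
      std::printf(" linear_dim_%zu", i); // prints " linear_dim_1"
  }

  std::printf("\nafter:");
  int idx = 0;
  for (int size : tileSizes) {
    if (size != 0)
      std::printf(" linear_dim_%d", idx++); // prints " linear_dim_0"
  }
  std::printf("\n");
  return 0;
}

With tile sizes [0, 16], the single mapped loop now receives
#gpu.thread<linear_dim_0> rather than linear_dim_1, which is exactly
what the new test below checks.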
Max191 committed Jun 7, 2024
1 parent 2baf6c3 commit f062b19
Showing 2 changed files with 39 additions and 2 deletions.
@@ -107,10 +107,11 @@ applyTileAndFuseToEachRoot(RewriterBase &rewriter,
     // TODO: Add some helpers to construct this based on the enum type rather
     // than doing it here.
     SmallVector<DeviceMappingAttrInterface> mapping;
-    for (auto [idx, size] : llvm::enumerate(tileSizes)) {
+    int idx = 0;
+    for (auto size : tileSizes) {
       if (!isConstantIntValue(size, 0)) {
         unsigned mappingId =
-            static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx;
+            static_cast<unsigned>(gpu::MappingId::LinearDim0) + idx++;
         mapping.push_back(gpu::GPUThreadMappingAttr::get(
             context, static_cast<gpu::MappingId>(mappingId)));
       }
@@ -37,6 +37,42 @@ module {
 
 // -----
 
+#config = #iree_gpu.lowering_config<{thread = [0, 16]}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @sequential_forall_mappings() {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
+    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<4x256xf32>>
+    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<4x256xf32>>
+    %3 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %4 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %5 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<4x256xf32>> -> tensor<4x256xf32>
+    %6 = linalg.generic {
+      indexing_maps = [#map, #map, #map],
+      iterator_types = ["parallel", "parallel"]
+    } ins(%3, %4 : tensor<4x256xf32>, tensor<4x256xf32>) outs(%5 : tensor<4x256xf32>) attrs = {lowering_config = #config} {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %7 = arith.addf %in, %in_0 : f32
+      linalg.yield %7 : f32
+    } -> tensor<4x256xf32>
+    flow.dispatch.tensor.store %6, %2, offsets = [%c0, %c0], sizes = [4, 256], strides = [1, 1] : tensor<4x256xf32> -> !flow.dispatch.tensor<writeonly:tensor<4x256xf32>>
+    return
+  }
+}
+
+// Verify that no loops are generated without a reduction configuration.
+// CHECK-LABEL: func.func @sequential_forall_mappings
+// CHECK-NOT: scf.for
+
+// THREAD-LABEL: func.func @sequential_forall_mappings
+// THREAD: scf.forall ({{.*}}) = (0) to (256) step (16)
+// THREAD: linalg.generic {{.*}} ins(%{{.*}}: tensor<4x16xf32>, tensor<4x16xf32>)
+// THREAD: scf.forall.in_parallel
+// THREAD: mapping = [#gpu.thread<linear_dim_0>]
+
+// -----
+
 module {
   func.func @matmul_transpose_b() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorize workgroup_size = [128, 2, 1] subgroup_size = 64>} {
     %c4 = arith.constant 4 : index
