Error when vectorising linalg.generic #11779

Closed
banach-space opened this issue Jan 10, 2023 · 2 comments
Comments

@banach-space (Collaborator)

Hi 👋🏻!

Here's the assertion that I am hitting:

/iree/third_party/llvm-project/mlir/include/mlir/IR/BlockAndValueMapping.h:82: T mlir::BlockAndValueMapping::lookupImpl(T) const [T = mlir::Value]: Assertion `result && "expected 'from' to be contained within the map"' failed.

It originates from reduceIfNeeded.

INPUT MLIR:

module {
  func.func @pipeline_dispatch_1_generic_1080x1920() {
    %c1 = arith.constant 1 : index
    %c4 = arith.constant 4 : index
    %c120 = arith.constant 120 : index
    %c64 = arith.constant 64 : index
    %c1080 = arith.constant 1080 : index
    %c1920 = arith.constant 1920 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 5.400000e+02 : f32
    %cst_0 = arith.constant 9.600000e+02 : f32
    %cst_1 = arith.constant -2.700000e+02 : f32
    %cst_2 = arith.constant -4.800000e+02 : f32
    %cst_3 = arith.constant 1.080000e+03 : f32
    %cst_4 = arith.constant 1.920000e+03 : f32
    %c1_i32 = arith.constant 1 : i32
    %c539_i32 = arith.constant 539 : i32
    %c959_i32 = arith.constant 959 : i32
    %cst_5 = arith.constant 1.000000e+00 : f32
    %cst_6 = arith.constant 4.000000e+00 : f32
    %c0_i32 = arith.constant 0 : i32
    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<readonly:tensor<1x540x960x1xf32>>
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<1080x1920xf32>>
    %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 540, 960, 1], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x540x960x1xf32>> -> tensor<1x540x960x1xf32>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_count_y = hal.interface.workgroup.count[1] : index
    %c120_7 = arith.constant 120 : index
    %3 = arith.muli %workgroup_id_y, %c120_7 : index
    %c120_8 = arith.constant 120 : index
    %4 = arith.muli %workgroup_count_y, %c120_8 : index
    %c64_9 = arith.constant 64 : index
    %5 = arith.muli %workgroup_id_x, %c64_9 : index
    %c64_10 = arith.constant 64 : index
    %6 = arith.muli %workgroup_count_x, %c64_10 : index
    scf.for %arg0 = %3 to %c1080 step %4 {
      scf.for %arg1 = %5 to %c1920 step %6 {
        %7 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [120, 64], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1080x1920xf32>> -> tensor<120x64xf32>
        %8 = scf.for %arg2 = %c0 to %c120 step %c1 iter_args(%arg3 = %7) -> (tensor<120x64xf32>) {
          %9 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %arg3) -> (tensor<120x64xf32>) {
            %extracted_slice = tensor.extract_slice %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<120x64xf32> to tensor<1x4xf32>
            %10 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst_6 : f32) outs(%extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32>
            %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%10 : tensor<1x4xf32>) attrs =  {__internal_linalg_transform__ = "1", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[120, 64], [1, 4], [0, 0]]>} {
            ^bb0(%out: f32):
              %12 = linalg.index 0 : index
              %13 = arith.addi %arg0, %12 : index
              %14 = arith.addi %13, %arg2 : index
              %15 = linalg.index 1 : index
              %16 = arith.addi %arg1, %15 : index
              %17 = arith.addi %16, %arg4 : index
              %18 = arith.index_cast %14 : index to i32
              %19 = arith.index_cast %17 : index to i32
              %20 = arith.uitofp %18 : i32 to f32
              %21 = arith.uitofp %19 : i32 to f32
              %22 = arith.mulf %20, %cst : f32
              %23 = arith.mulf %21, %cst_0 : f32
              %24 = arith.addf %22, %cst_1 : f32
              %25 = arith.addf %23, %cst_2 : f32
              %26 = arith.divf %24, %cst_3 : f32
              %27 = arith.divf %25, %cst_4 : f32
              %28 = math.floor %26 : f32
              %29 = math.floor %27 : f32
              %30 = arith.subf %26, %28 : f32
              %31 = arith.subf %27, %29 : f32
              %32 = arith.fptosi %28 : f32 to i32
              %33 = arith.fptosi %29 : f32 to i32
              %34 = arith.addi %32, %c1_i32 : i32
              %35 = arith.addi %33, %c1_i32 : i32
              %36 = arith.cmpi slt, %32, %c0_i32 : i32
              %37 = arith.select %36, %c0_i32, %32 : i32
              %38 = arith.cmpi sgt, %32, %c539_i32 : i32
              %39 = arith.select %38, %c539_i32, %37 : i32
              %40 = arith.cmpi slt, %34, %c0_i32 : i32
              %41 = arith.select %40, %c0_i32, %34 : i32
              %42 = arith.cmpi sgt, %34, %c539_i32 : i32
              %43 = arith.select %42, %c539_i32, %41 : i32
              %44 = arith.cmpi slt, %33, %c0_i32 : i32
              %45 = arith.select %44, %c0_i32, %33 : i32
              %46 = arith.cmpi sgt, %33, %c959_i32 : i32
              %47 = arith.select %46, %c959_i32, %45 : i32
              %48 = arith.cmpi slt, %35, %c0_i32 : i32
              %49 = arith.select %48, %c0_i32, %35 : i32
              %50 = arith.cmpi sgt, %35, %c959_i32 : i32
              %51 = arith.select %50, %c959_i32, %49 : i32
              %52 = arith.index_cast %39 : i32 to index
              %53 = arith.index_cast %43 : i32 to index
              %54 = arith.index_cast %47 : i32 to index
              %55 = arith.index_cast %51 : i32 to index
              %56 = arith.subf %cst_5, %31 : f32
              %57 = arith.mulf %28, %56 : f32
              %58 = arith.mulf %56, %31 : f32
              %59 = arith.addf %57, %58 : f32
              %60 = arith.mulf %56, %56 : f32
              %61 = arith.mulf %56, %31 : f32
              %62 = arith.addf %60, %61 : f32
              %63 = arith.subf %cst_5, %30 : f32
              %64 = arith.mulf %59, %63 : f32
              %65 = arith.mulf %62, %30 : f32
              %66 = arith.addf %64, %65 : f32
              %67 = arith.mulf %out, %66 : f32
              linalg.yield %67 : f32
            } -> tensor<1x4xf32>
            %inserted_slice = tensor.insert_slice %11 into %arg5[%arg2, %arg4] [1, 4] [1, 1] : tensor<1x4xf32> into tensor<120x64xf32>
            scf.yield %inserted_slice : tensor<120x64xf32>
          }
          scf.yield %9 : tensor<120x64xf32>
        }
        flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [120, 64], strides = [1, 1] : tensor<120x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<1080x1920xf32>>
      }
    }
    return
  }

  transform.sequence failures(propagate) {
  ^bb0(%arg0: !pdl.operation):
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0
    %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
    %2 = transform.structured.vectorize %1
  }
}

TO REPRODUCE

$ tools/iree-opt  --iree-transform-dialect-interpreter repro.mlir

CONTEXT
The MLIR above was generated from the following TOSA snippet. It's a "trimmed" dump taken just before LinalgStrategyVectorizePass. I only removed the tensor.extract ops, which are currently not vectorised (and for this crash to happen, I had to make sure that my linalg.generic op is actually vectorised).

  func.func @pipeline(%arg0: tensor<540x960x1xf32>) -> tensor<1080x1920x1xf32> attributes {llvm.emit_c_interface} {
    %cst = arith.constant 4.000000e+00 : f32
    %0 = tensor.empty() : tensor<1080x1920x1xf32>
    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1080x1920x1xf32>) -> tensor<1080x1920x1xf32>
    %2 = "tosa.reshape"(%arg0) {new_shape = [1, 540, 960, 1]} : (tensor<540x960x1xf32>) -> tensor<1x540x960x1xf32>
    %3 = "tosa.resize"(%2) {border = [0, 0], mode = "BILINEAR", offset = [-270, -480], scale = [1080, 540, 1920, 960]} : (tensor<1x540x960x1xf32>) -> tensor<1x1080x1920x1xf32>
    %4 = "tosa.reshape"(%3) {new_shape = [1080, 1920, 1]} : (tensor<1x1080x1920x1xf32>) -> tensor<1080x1920x1xf32>
    %5 = "tosa.mul"(%1, %4) {shift = 0 : i32} : (tensor<1080x1920x1xf32>, tensor<1080x1920x1xf32>) -> tensor<1080x1920x1xf32>
    return %5 : tensor<1080x1920x1xf32>
  }

OBSERVATIONS
I can see that the Linalg vectorizer looks for reductions, identifies one, and then things go wrong. It's not clear to me why it thinks there is a reduction here: the generic has only parallel iterators.

Without tosa.mul in my original TOSA example, the vectorizer no longer detects any reductions and everything works fine.

Apologies for the lengthy reproducer. It took me a while to get here, and I wanted to share this before doing more investigation.

Happy to reduce more. Your pointers are much appreciated :)

-Andrzej

@banach-space banach-space added the codegen Shared code generation infrastructure and dialects label Jan 10, 2023
@dcaballe (Contributor)

There is a bug that hopefully I fixed here: https://reviews.llvm.org/D141413

Basically, %13 = arith.addi %arg0, %12 : index is identified as a reduction because %arg0 is a block argument that makes it through the reduction-detection logic. We just have to make sure that %arg0 is a block argument of the linalg.generic op's own body block (see the fix).
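The real detection logic lives in MLIR's C++ vectorizer; the following Python snippet is only a conceptual model of the predicate described above, with all names invented for illustration. It shows why an scf.for induction variable slipped through the check, and what the fix changes:

```python
# Conceptual model (NOT the actual MLIR C++ code) of the reduction-detection
# bug: when walking the use-def chain of a combiner like
# `arith.addi %arg0, %12`, hitting a block argument ends the walk and that
# argument is treated as the reduction's carried value. The bug was accepting
# *any* block argument; the fix additionally requires that the argument
# belongs to the linalg.generic's own body block.

class BlockArgument:
    """A value that is an argument of some block."""
    def __init__(self, owner_block):
        self.owner_block = owner_block

def is_reduction_carrier(value, generic_body_block):
    """Return True if `value` may carry a reduction for this generic op."""
    if not isinstance(value, BlockArgument):
        return False
    # Buggy version effectively returned True here unconditionally.
    # Fixed version: the argument must belong to the linalg.generic body
    # block, not to an enclosing scf.for (or any other outer region).
    return value.owner_block is generic_body_block

generic_block = object()  # stands in for the linalg.generic body block
loop_block = object()     # stands in for the enclosing scf.for body block

out_arg = BlockArgument(generic_block)  # like %out: a genuine carrier
loop_iv = BlockArgument(loop_block)     # like %arg0 from the outer scf.for

print(is_reduction_carrier(out_arg, generic_block))  # True
print(is_reduction_carrier(loop_iv, generic_block))  # False (was True pre-fix)
```

In the reproducer, %arg0 is exactly the second case: a block argument of the surrounding scf.for, so the pre-fix logic misclassified `arith.addi %arg0, %12` as a reduction.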

Could you please provide a small test that we can use for the patch? I tried to simplify your example but I couldn't make it fail.

@dcaballe dcaballe self-assigned this Jan 10, 2023
@banach-space (Collaborator, Author) commented Jan 10, 2023

Could you please provide a small test that we can use for the patch? I tried to simplify your example but I couldn't make it fail.

Much reduced linalg.generic that exhibits similar behavior:

module {
  func.func @pipeline_dispatch_1_generic_1080x1920() {
    %c1 = arith.constant 1 : index
    %c4 = arith.constant 4 : index
    %c120 = arith.constant 120 : index
    %c64 = arith.constant 64 : index
    %c1080 = arith.constant 1080 : index
    %c1920 = arith.constant 1920 : index
    %c0 = arith.constant 0 : index
    %cst_6 = arith.constant 4.000000e+00 : f32
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c0) alignment(64) : !flow.dispatch.tensor<writeonly:tensor<1080x1920xf32>>
    %workgroup_id_x = hal.interface.workgroup.id[0] : index
    %workgroup_count_x = hal.interface.workgroup.count[0] : index
    %workgroup_id_y = hal.interface.workgroup.id[1] : index
    %workgroup_count_y = hal.interface.workgroup.count[1] : index
    %c120_7 = arith.constant 120 : index
    %3 = arith.muli %workgroup_id_y, %c120_7 : index
    %c120_8 = arith.constant 120 : index
    %4 = arith.muli %workgroup_count_y, %c120_8 : index
    %c64_9 = arith.constant 64 : index
    %5 = arith.muli %workgroup_id_x, %c64_9 : index
    %c64_10 = arith.constant 64 : index
    %6 = arith.muli %workgroup_count_x, %c64_10 : index
    scf.for %arg0 = %3 to %c1080 step %4 {
      scf.for %arg1 = %5 to %c1920 step %6 {
        %7 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [120, 64], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1080x1920xf32>> -> tensor<120x64xf32>
        %8 = scf.for %arg2 = %c0 to %c120 step %c1 iter_args(%arg3 = %7) -> (tensor<120x64xf32>) {
          %9 = scf.for %arg4 = %c0 to %c64 step %c4 iter_args(%arg5 = %arg3) -> (tensor<120x64xf32>) {
            %extracted_slice = tensor.extract_slice %arg5[%c0, %arg4] [1, 4] [1, 1] : tensor<120x64xf32> to tensor<1x4xf32>
            %10 = linalg.fill {__internal_linalg_transform__ = "1"} ins(%cst_6 : f32) outs(%extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32>
            %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} outs(%10 : tensor<1x4xf32>) {
            ^bb0(%out: f32):
              %12 = linalg.index 0 : index
              %13 = arith.addi %arg4, %12 : index
              %18 = arith.index_cast %13 : index to i32
              %20 = arith.uitofp %18 : i32 to f32
              %67 = arith.mulf %out, %20 : f32
              linalg.yield %67 : f32
            } -> tensor<1x4xf32>
            %inserted_slice = tensor.insert_slice %11 into %arg5[%c0, %arg4] [1, 4] [1, 1] : tensor<1x4xf32> into tensor<120x64xf32>
            scf.yield %inserted_slice : tensor<120x64xf32>
          }
          scf.yield %9 : tensor<120x64xf32>
        }
        flow.dispatch.tensor.store %8, %1, offsets = [%arg0, %arg1], sizes = [120, 64], strides = [1, 1] : tensor<120x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<1080x1920xf32>>
      }
    }
    return
  }

  transform.sequence failures(propagate) {
  ^bb0(%arg0: !pdl.operation):
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0
    %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
    %2 = transform.structured.vectorize %1
  }
}

CarlosAlbertoEnciso pushed a commit to SNSystems/llvm-debuginfo-analyzer that referenced this issue Jan 13, 2023
When detecting reductions, make sure the block argument is from the linalg generic op.
This fixes iree-org/iree#11779.

Co-authored-by: Andrzej Warzynski <andrzej.warzynski@arm.com>

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D141413