diff --git a/tests/transform_dialect/cpu/BUILD.bazel b/tests/transform_dialect/cpu/BUILD.bazel index 669933c9f2af..fc81794397de 100644 --- a/tests/transform_dialect/cpu/BUILD.bazel +++ b/tests/transform_dialect/cpu/BUILD.bazel @@ -14,20 +14,12 @@ package( iree_lit_test_suite( name = "lit", srcs = [ - "contraction-packing.mlir", - "contraction-packing-and-dispatch.mlir", - # DISABLED: incorrectly assuming default flag values. - # "eltwise_reduction_eltwise.mlir", - "fold_tensor_slice_into_transfer.mlir", - # DISABLED: incorrectly assuming default flag values. - # "matmul.mlir", "matmul_library_call.mlir", ], cfg = "//tests:lit.cfg.py", # transform dialect spec files are MLIR files that specify a transformation, # they need to be included as data. data = [ - "matmul_codegen_default_spec.mlir", "transform_library.mlir", ], tags = [ diff --git a/tests/transform_dialect/cpu/CMakeLists.txt b/tests/transform_dialect/cpu/CMakeLists.txt index 9f24b4534c50..f1b9544294d7 100644 --- a/tests/transform_dialect/cpu/CMakeLists.txt +++ b/tests/transform_dialect/cpu/CMakeLists.txt @@ -14,9 +14,6 @@ iree_lit_test_suite( NAME lit SRCS - "contraction-packing-and-dispatch.mlir" - "contraction-packing.mlir" - "fold_tensor_slice_into_transfer.mlir" "matmul_library_call.mlir" TOOLS ${IREE_LLD_TARGET} @@ -26,7 +23,6 @@ iree_lit_test_suite( iree-opt iree-run-module DATA - matmul_codegen_default_spec.mlir transform_library.mlir LABELS "hostonly" diff --git a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir deleted file mode 100644 index bb0c56a24213..000000000000 --- a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir +++ /dev/null @@ -1,59 +0,0 @@ - -// Preprocessing with generalized packing. -// -// RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \ -// RUN: iree-compile - --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-opt-data-tiling=false \ -// RUN: --compile-to=executable-configurations | \ -// RUN: FileCheck %s - -!a_tensor_t = tensor<1234x567xf32> -!b_tensor_t = tensor<567x890xf32> -!c_tensor_t = tensor<1234x890xf32> - -// Note: the normalization in these maps is gone due to InterchangeGenericOps. -// When using generalized packing, it would be better to drop that pass. 
- -// CHECK-LABEL: func.func @matmul_dispatch_0 -// CHECK: tensor.empty() : tensor<155x18x8x32xf32> -// CHECK: tensor.pack - -// CHECK-LABEL: func.func @matmul_dispatch_1 -// CHECK: tensor.empty() : tensor<18x56x16x32xf32> -// CHECK: tensor.pack - -// CHECK-LABEL: func.func @matmul_dispatch_2 -// CHECK: tensor.empty() : tensor<155x56x8x16xf32> -// CHECK: tensor.pack - -// CHECK-LABEL: func.func @matmul_dispatch_3 -func.func public @matmul(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t { - %rhs = arith.constant dense<0.1> : !b_tensor_t - %c0 = util.optimization_barrier %rhs : !b_tensor_t - // CHECK-NOT: pack - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<155x56x8x16xf32>) - - %0 = linalg.matmul - ins(%arg0, %c0: !a_tensor_t, !b_tensor_t) - outs(%arg2: !c_tensor_t) -> !c_tensor_t - return %0 : !c_tensor_t -} - -// CHECK-LABEL: func.func @matmul_dispatch_4 -// CHECK: tensor.unpack -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %matmul = transform.structured.match interface{LinalgOp} in %module_op - : (!transform.any_op) -> (!transform.any_op) - - transform.structured.pack_greedily %matmul - matmul_packed_sizes = [8, 16, 32] - matmul_inner_dims_order = [0, 1, 2] - : (!transform.any_op) -> !transform.op<"linalg.generic"> - transform.yield - } -} // module diff --git a/tests/transform_dialect/cpu/contraction-packing.mlir b/tests/transform_dialect/cpu/contraction-packing.mlir deleted file mode 100644 index 8c67a38295e9..000000000000 --- a/tests/transform_dialect/cpu/contraction-packing.mlir +++ /dev/null @@ -1,153 +0,0 @@ - -// Preprocessing with generalized packing. 
-// -// RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \ -// RUN: FileCheck %s - -!a_tensor_t = tensor<1234x567xf32> -!at_tensor_t = tensor<567x1234xf32> -!b_tensor_t = tensor<567x890xf32> -!bt_tensor_t = tensor<890x567xf32> -!c_tensor_t = tensor<1234x890xf32> -!ct_tensor_t = tensor<890x1234xf32> - -// CHECK-DAG: #[[$map_lhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> -// CHECK-DAG: #[[$map_rhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d4, d5)> -// CHECK-DAG: #[[$map_res:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> -// CHECK-DAG: #[[$map_tlhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> -// CHECK-DAG: #[[$map_trhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)> -// CHECK-DAG: #[[$map_tres:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> - -// CHECK-LABEL: func.func @matmul_nnn -func.func @matmul_nnn(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t { - %c0 = arith.constant dense<0.1> : !b_tensor_t - - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_res]]] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<155x56x8x16xf32>) - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - %0 = linalg.matmul - ins(%arg0, %c0: !a_tensor_t, !b_tensor_t) - outs(%arg2: !c_tensor_t) -> !c_tensor_t - return %0 : !c_tensor_t -} - -#matmul_tnn_trait = { - indexing_maps = [ - affine_map<(m, n, k) -> (k, m)>, - affine_map<(m, n, k) -> (k, n)>, - affine_map<(m, n, k) -> (m, n)> - ], - iterator_types = ["parallel", "parallel", "reduction"] -} - -// CHECK-LABEL: func.func @matmul_tnn -func.func @matmul_tnn(%arg0: !at_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t { - %c0 = arith.constant dense<0.1> : !b_tensor_t - - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [#[[$map_tlhs]], #[[$map_rhs]], #[[$map_res]]] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<18x155x8x32xf32>, tensor<18x56x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<155x56x8x16xf32>) - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - %0 = linalg.generic #matmul_tnn_trait - ins(%arg0, %c0: !at_tensor_t, !b_tensor_t) - outs(%arg2: !c_tensor_t) { - ^bb(%a: f32, %b: f32, %c: f32) : - %d = arith.mulf %a, %b: f32 - %e = arith.addf %c, %d: f32 - linalg.yield %e : f32 - } -> !c_tensor_t - return %0 : !c_tensor_t -} - -#matmul_ntn_trait = { - indexing_maps = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> - ], - iterator_types = ["parallel", "parallel", "reduction"] -} - -// CHECK-LABEL: func.func @matmul_ntn -func.func @matmul_ntn(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t { - %c0 = 
arith.constant dense<0.1> : !bt_tensor_t - - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [#[[$map_lhs]], #[[$map_trhs]], #[[$map_res]]] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<56x18x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<155x56x8x16xf32>) - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - %0 = linalg.generic #matmul_ntn_trait - ins(%arg0, %c0: !a_tensor_t, !bt_tensor_t) - outs(%arg2: !c_tensor_t) { - ^bb(%a: f32, %b: f32, %c: f32) : - %d = arith.mulf %a, %b: f32 - %e = arith.addf %c, %d: f32 - linalg.yield %e : f32 - } -> !c_tensor_t - return %0 : !c_tensor_t -} - -#matmul_nnt_trait = { - indexing_maps = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (k, n)>, - affine_map<(m, n, k) -> (n, m)> - ], - iterator_types = ["parallel", "parallel", "reduction"] -} - -// CHECK-LABEL: func.func @matmul_nnt -func.func @matmul_nnt(%arg0: !a_tensor_t, %arg2: !ct_tensor_t) -> !ct_tensor_t { - %c0 = arith.constant dense<0.1> : !b_tensor_t - - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 16] - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_tres]]] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<56x155x8x16xf32>) - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 16] - %0 = linalg.generic #matmul_nnt_trait - ins(%arg0, %c0: !a_tensor_t, !b_tensor_t) - outs(%arg2: !ct_tensor_t) { - ^bb(%a: f32, %b: f32, %c: f32) : - %d = arith.mulf %a, %b: f32 - %e = arith.addf %c, %d: f32 - linalg.yield %e : f32 - } -> !ct_tensor_t - return %0 : !ct_tensor_t -} - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %matmul = transform.structured.match interface{LinalgOp} in %module_op - : (!transform.any_op) -> (!transform.any_op) - - // Generalized packing rewrite extracts a gemm from any linalg op that contains - // one. This acts as a powerful normalization step: after this point, we have a - // gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor - // dimensions. 
- transform.structured.pack_greedily %matmul - matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2] - : (!transform.any_op) -> !transform.op<"linalg.generic"> - transform.yield - } -} // module diff --git a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir deleted file mode 100644 index 0e61ca71cf31..000000000000 --- a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir +++ /dev/null @@ -1,71 +0,0 @@ -!in_tensor_t = tensor<32x256xf32> -!out_tensor_t = tensor<32xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = tensor.empty() : !in_tensor_t - %3 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%arg : !in_tensor_t) outs(%2 : !in_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg3 : f32 - %5 = arith.addf %4, %4 : f32 - linalg.yield %5 : f32 - } -> !in_tensor_t - - %6 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%3 : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg4 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - %7 = tensor.empty() : !out_tensor_t - %8 = linalg.generic { - indexing_maps = [affine_map<(d0) -> (d0)>, - affine_map<(d0) -> (d0)>], - iterator_types = ["parallel"]} - ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = math.sqrt %arg3 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - - return %8 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-opt-data-tiling=false \ -// RUN: --compile-to=executable-configurations | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs,iree-llvmcpu-select-lowering-strategy,iree-llvmcpu-lower-executable-target)))' \ -// RUN: --iree-llvmcpu-enable-transform-dialect-jit | \ -// RUN: FileCheck %s - -// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-opt-data-tiling=false \ -// RUN: --iree-llvmcpu-enable-transform-dialect-jit | \ -// RUN: iree-run-module --module=- --function=reduce --device=local-task --input="32x256xf32=1" | \ -// RUN: FileCheck %s --check-prefix=EXEC - -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} -> (vector<8xf32>) { -// CHECK: arith.addf %{{.*}} : vector<8x16xf32> -// CHECK-COUNT-16: vector.extract %{{.*}} : vector<8xf32> from vector<16x8xf32>{{[[:space:]].*}}arith.addf %{{.*}} : vector<8xf32> -// CHECK: scf.yield %{{.*}} : vector<8xf32> -// CHECK: } -// CHECK: math.sqrt %{{.*}} : vector<8xf32> -// CHECK: vector.store %{{.*}} : memref<8xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<8xf32> - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 32xf32=32 32 32 32 32 32 32 32 diff --git a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir b/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir deleted file mode 100644 index 
435c25f434f1..000000000000 --- a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir +++ /dev/null @@ -1,111 +0,0 @@ -// RUN: iree-opt --iree-transform-dialect-interpreter %s | FileCheck %s - -// This transform was removed from MLIR by https://reviews.llvm.org/D154932 and -// added to IREE in https://github.com/iree-org/iree/pull/14373, as a workaround -// for other patterns being sensitive to these exact transforms. - -// CHECK-LABEL: func @transfer_read_of_extract_slice( -// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index -// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index -// CHECK: %[[add:.*]] = arith.addi %[[s1]], %[[c4]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> -// CHECK: return %[[r]] -func.func @transfer_read_of_extract_slice(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.0 : f32 - %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<10x?xf32>, vector<5x6xf32> - return %1 : vector<5x6xf32> -} - -// CHECK-LABEL: func @transfer_read_of_extract_slice_1d( -// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index -// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index -// CHECK: %[[add:.*]] = arith.addi %[[s1]], %[[c4]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true]} : tensor, vector<6xf32> -// CHECK: return %[[r]] -func.func @transfer_read_of_extract_slice_1d(%t : tensor, %s1 : index, %s2 : index) -> vector<6xf32> { - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.0 : f32 - %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true]} : tensor<10x?xf32>, vector<6xf32> - return %1 : vector<6xf32> -} - -// CHECK-LABEL: func @transfer_read_of_extract_slice_rank_reducing( -// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index -// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[c5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index -// CHECK: %[[add:.*]] = arith.addi %[[s1]], %[[c3]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> -// CHECK: return %[[r]] -func.func @transfer_read_of_extract_slice_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.0 : f32 - %0 = tensor.extract_slice %t[5, %s1, 6] [1, %s2, 12] [1, 1, 1] : tensor to tensor - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor, vector<5x6xf32> - return %1 : vector<5x6xf32> -} - -// CHECK-LABEL: func @transfer_read_of_extract_slice_illegal_rank_reducing( -// CHECK: extract_slice -// CHECK: vector.transfer_read -func.func @transfer_read_of_extract_slice_illegal_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.0 : f32 - %0 = tensor.extract_slice %t[5, %s1, 6] [%s2, 1, 12] [1, 1, 
1] : tensor to tensor - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor, vector<5x6xf32> - return %1 : vector<5x6xf32> -} - -// CHECK-LABEL: func @insert_slice_of_transfer_write( -// CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index -// CHECK: %[[c3:.*]] = arith.constant 3 : index -// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor -// CHECK: return %[[r]] -func.func @insert_slice_of_transfer_write(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { - %c0 = arith.constant 0 : index - %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> - %1 = tensor.insert_slice %0 into %t1[3, %s] [5, 6] [1, 1] : tensor<5x6xf32> into tensor - return %1 : tensor -} - -// CHECK-LABEL: func @insert_slice_of_transfer_write_illegal_rank_extending( -// CHECK: vector.transfer_write -// CHECK: insert_slice -func.func @insert_slice_of_transfer_write_illegal_rank_extending(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { - %c0 = arith.constant 0 : index - %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> - %1 = tensor.insert_slice %0 into %t1[4, 3, %s] [5, 1, 6] [1, 1, 1] : tensor<5x6xf32> into tensor - return %1 : tensor -} - -// CHECK-LABEL: func @insert_slice_of_transfer_write_rank_extending( -// CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index -// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor -// CHECK: return %[[r]] -func.func @insert_slice_of_transfer_write_rank_extending(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { - %c0 = arith.constant 0 : index - %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> - %1 = tensor.insert_slice %0 into %t1[4, 3, %s] [1, 5, 6] [1, 1, 1] : tensor<5x6xf32> into tensor - return %1 : tensor -} - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %module_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %top_level_func { - transform.apply_patterns.iree.fold_tensor_slice_into_transfer - } : !transform.any_op - transform.yield - } -} diff --git a/tests/transform_dialect/cpu/matmul.mlir b/tests/transform_dialect/cpu/matmul.mlir deleted file mode 100644 index 39a2074f6043..000000000000 --- a/tests/transform_dialect/cpu/matmul.mlir +++ /dev/null @@ -1,21 +0,0 @@ -!A_size = tensor<3x5xf32> -!B_size = tensor<5x3xf32> -!C_size = tensor<3x3xf32> - -func.func @matmul_static( - %A : !A_size, %B : !B_size, %C : !C_size) -> !C_size { - %0 = linalg.matmul ins(%A, %B : !A_size, !B_size) - outs(%C : !C_size) -> !C_size - return %0 : !C_size -} - -// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-opt-data-tiling=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=matmul_static \ -// RUN: --input="3x5xf32=1" \ -// RUN: --input="5x3xf32=2" \ -// RUN: 
--input="3x3xf32=42" | \ -// RUN: FileCheck %s --check-prefixes=EXEC - -// EXEC: 3x3xf32=[52 52 52][52 52 52][52 52 52] diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir deleted file mode 100644 index 1fa31050e506..000000000000 --- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 1. Tile to forall with tile_sizes [2]. - // =================================================== - %tiled_generic, %forall = - transform.structured.tile_using_forall %matmul tile_sizes [2] - ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall - : (!transform.any_op) -> () - - // Step 2. Bufferize and drop HAL decriptor from memref ops. - // ========================================================= - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 3. Post-bufferization mapping workgroup. - // ========================================================= - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/BUILD.bazel b/tests/transform_dialect/cuda/BUILD.bazel deleted file mode 100644 index a35e0dbe609f..000000000000 --- a/tests/transform_dialect/cuda/BUILD.bazel +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2022 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content") -load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite") - -package( - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_cmake_extra_content( - content = """ -if(NOT IREE_HAL_DRIVER_CUDA) - return() -endif() - -if(NOT IREE_HOST_BIN_DIR AND NOT IREE_TARGET_BACKEND_CUDA) - return() -endif() -""", - inline = True, -) - -iree_lit_test_suite( - name = "lit", - srcs = [ - # TODO(#15892): reductions have flakes and need to be triaged. - # "reduction.mlir", - # "reduction_eltwise.mlir", - # "reduction_v2.mlir", - # "reduction_v2_uneven.mlir", - # "softmax.mlir", - # "softmax_v2.mlir", - # First few ops of softmax only, acts as a proxy example. - # "softmax_partial.mlir", - ], - cfg = "//tests:lit.cfg.py", - # transform dialect spec files are MLIR files that specify a transformation, - # they need to be included as data. 
- data = [ - "mma_reduction_layout_analysis_codegen_spec.mlir", - "mma_reduction_layout_analysis_dispatch_spec.mlir", - "mma_using_layout_analysis_codegen_spec.mlir", - "reduction_codegen_spec.mlir", - "reduction_eltwise_codegen_spec.mlir", - "reduction_v2_codegen_spec.mlir", - "softmax_codegen_spec.mlir", - "softmax_v2_codegen_spec.mlir", - # - # FIXME: This must be used with the custom dispatch region formation - # because IREE's does not fuse the 6 ops softmax version even with - # --iree-flow-fuse-multi-use. - # - "softmax_dispatch_spec.mlir", - # First few ops of softmax only, acts as a proxy example. - "softmax_partial_codegen_spec.mlir", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-nvidia", - "driver=cuda", - ], - tools = [ - "//tools:iree-compile", - "//tools:iree-opt", - "//tools:iree-run-module", - "@llvm-project//llvm:FileCheck", - ], -) - -iree_lit_test_suite( - name = "sm80_lit", - srcs = [ - "double_mma_layout_analysis.mlir", - "mma_elemwise_layout_analysis.mlir", - "mma_reduction_layout_analysis.mlir", - "mma_using_layout_analysis.mlir", - ], - cfg = "//tests:lit.cfg.py", - # transform dialect spec files are MLIR files that specify a transformation, - # they need to be included as data. - data = [ - "double_mma_layout_analysis_codegen_spec.mlir", - "double_mma_layout_analysis_dispatch_spec.mlir", - "mma_elemwise_layout_analysis_codegen_spec.mlir", - "mma_reduction_layout_analysis_codegen_spec.mlir", - "mma_reduction_layout_analysis_dispatch_spec.mlir", - "mma_using_layout_analysis_codegen_spec.mlir", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-sm80", - "driver=cuda", - ], - tools = [ - "//tools:iree-compile", - "//tools:iree-opt", - "//tools:iree-run-module", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tests/transform_dialect/cuda/CMakeLists.txt b/tests/transform_dialect/cuda/CMakeLists.txt deleted file mode 100644 index 7534bc8bfd46..000000000000 --- a/tests/transform_dialect/cuda/CMakeLists.txt +++ /dev/null @@ -1,78 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# tests/transform_dialect/cuda/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. 
# -################################################################################ - -iree_add_all_subdirs() - -if(NOT IREE_HAL_DRIVER_CUDA) - return() -endif() - -if(NOT IREE_HOST_BIN_DIR AND NOT IREE_TARGET_BACKEND_CUDA) - return() -endif() - -iree_lit_test_suite( - NAME - lit - TOOLS - FileCheck - iree-compile - iree-opt - iree-run-module - DATA - mma_reduction_layout_analysis_codegen_spec.mlir - mma_reduction_layout_analysis_dispatch_spec.mlir - mma_using_layout_analysis_codegen_spec.mlir - reduction_codegen_spec.mlir - reduction_eltwise_codegen_spec.mlir - reduction_v2_codegen_spec.mlir - softmax_codegen_spec.mlir - softmax_dispatch_spec.mlir - softmax_partial_codegen_spec.mlir - softmax_v2_codegen_spec.mlir - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-nvidia" - "driver=cuda" -) - -iree_lit_test_suite( - NAME - sm80_lit - SRCS - "double_mma_layout_analysis.mlir" - "mma_elemwise_layout_analysis.mlir" - "mma_reduction_layout_analysis.mlir" - "mma_using_layout_analysis.mlir" - TOOLS - FileCheck - iree-compile - iree-opt - iree-run-module - DATA - double_mma_layout_analysis_codegen_spec.mlir - double_mma_layout_analysis_dispatch_spec.mlir - mma_elemwise_layout_analysis_codegen_spec.mlir - mma_reduction_layout_analysis_codegen_spec.mlir - mma_reduction_layout_analysis_dispatch_spec.mlir - mma_using_layout_analysis_codegen_spec.mlir - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" - "driver=cuda" -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir deleted file mode 100644 index 9293fd4ff0ab..000000000000 --- a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir +++ /dev/null @@ -1,26 +0,0 @@ -func.func @double_matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<16x16xf16>, %second : tensor<16x8xf16>) -> tensor<16x8xf16> { - %c0 = arith.constant 0.0 : f16 - %0 = tensor.empty() : tensor<16x16xf16> - %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x16xf16>) -> tensor<16x16xf16> - %2 = linalg.matmul ins(%lhs, %rhs : tensor<16x16xf16>, tensor<16x16xf16>) - outs(%1 : tensor<16x16xf16>) -> tensor<16x16xf16> - %3 = tensor.empty() : tensor<16x8xf16> - %4 = linalg.fill ins(%c0 : f16) outs(%3 : tensor<16x8xf16>) -> tensor<16x8xf16> - %5 = linalg.matmul ins(%2, %second : tensor<16x16xf16>, tensor<16x8xf16>) - outs(%4 : tensor<16x8xf16>) -> tensor<16x8xf16> - return %5 : tensor<16x8xf16> -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-flow-dispatch-use-transform-dialect=%p/double_mma_layout_analysis_dispatch_spec.mlir \ -// RUN: --iree-codegen-transform-dialect-library=%p/double_mma_layout_analysis_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=double_matmul --device=cuda \ -// RUN: 
--input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.049987
79296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \ -// RUN: --input="16x16xf16=[[0.175048828125,0.07501220703125,0.199951171875,0.0,0.175048828125,0.125,0.199951171875,0.04998779296875,0.0999755859375,0.175048828125,0.07501220703125,0.04998779296875,0.125,0.125,0.07501220703125,0.2249755859375],[0.024993896484375,0.199951171875,0.0,0.1500244140625,0.175048828125,0.0999755859375,0.175048828125,0.1500244140625,0.2249755859375,0.07501220703125,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0999755859375,0.0999755859375],[0.2249755859375,0.2249755859375,0.125,0.175048828125,0.0,0.07501220703125,0.04998779296875,0.0,0.199951171875,0.1500244140625,0.024993896484375,0.2249755859375,0.024993896484375,0.1500244140625,0.2249755859375,0.199951171875],[0.1500244140625,0.125,0.024993896484375,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.175048828125,0.125,0.175048828125,0.175048828125,0.07501220703125,0.024993896484375,0.125],[0.2249755859375,0.125,0.2249755859375,0.1500244140625,0.0,0.0,0.1500244140625,0.125,0.024993896484375,0.125,0.0,0.024993896484375,0.175048828125,0.175048828125,0.024993896484375,0.125],[0.2249755859375,0.024993896484375,0.04998779296875,0.0,0.0,0.1500244140625,0.07501220703125,0.2249755859375,0.1500244140625,0.024993896484375,0.0,0.0999755859375,0.125,0.1500244140625,0.2249755859375,0.0],[0.125,0.0999755859375,0.0,0.0999755859375,0.199951171875,0.125,0.175048828125,0.175048828125,0.1500244140625,0.2249755859375,0.04998779296875,0.125,0.1500244140625,0.0,0.0,0.0999755859375],[0.125,0.07501220703125,0.175048828125,0.1500244140625,0.175048828125,0.0,0.04998779296875,0.125,0.125,0.024993896484375,0.0999755859375,0.175048828125,0.024993896484375,0.0,0.024993896484375,0.0],[0.2249755859375,0.024993896484375,0.0999755859375,0.04998779296875,0.125,0.07501220703125,0.0999755859375,0.024993896484375,0.125,0.125,0.125,0.024993896484375,0.125,0.04998779296875,0.0999755859375,0.07501220703125],[0.0999755859375,0.175048828125,0.199951171875,0.0999755859375,0.175048828125,0.07501220703125,0.024993896484375,0.125,0.07501220703125,0.0,0.125,0.07501220703125,0.07501220703125,0.0,0.199951171875,0.175048828125],[0.07501220703125,0.0999755859375,0.175048828125,0.07501220703125,0.125,0.1500244140625,0.0,0.0999755859375,0.2249755859375,0.199951171875,0.04998779296875,0.0,0.0,0.1500244140625,0.199951171875,0.2249755859375],[0.024993896484375,0.2249755859375,0.04998779296875,0.1500244140625,0.2249755859375,0.2249755859375,0.175048828125,0.0999755859375,0.024993896484375,0.199951171875,0.125,0.199951171875,0.175048828125,0.2249755859375,0.175048828125,0.0999755859375],[0.125,0.0999755859375,0.04998779296875,0.125,0.199951171875,0.07501220703125,0.199951171875,0.0,0.024993896484375,0.04998779296875,0.0,0.04998779296875,0.04998779296875,0.199951171875,0.1500244140625,0.0999755859375],[0.199951171875,0.0,0.125,0.04998779296875,0.07501220703125,0.175048828125,0.0999755859375,0.175048828125,0.024993896484375,0.07501220703125,0.0,0.1500244140625,0.07501220703125,0.024993896484375,0.07501220703125,0.175048828125],[0.1500244140625,0.125,0.0999755859375,0.175048828125,0.04998779296875,0.0,0.04998779296875,0.1500244140625,0.024993896484375,0.125,0.125,0.175048828125,0.125,0.0999755859375,0.175048828125,0.1500244140625],[0.07501220703125,0.199951171875,0.024993896484375,0.0999755859375,0.175048828125,0.07501220703125,0.1500244140625,0.04998779296875,0.0,0.024993896484375,0.07501220703125,0.07501220703125,0.1500244140625,0.0
4998779296875,0.2249755859375,0.1500244140625]]" \ -// RUN: --input="16x8xf16=[[0.1500244140625,0.07501220703125,0.1500244140625,0.0,0.199951171875,0.125,0.0,0.175048828125],[0.04998779296875,0.07501220703125,0.04998779296875,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.2249755859375],[0.0,0.07501220703125,0.04998779296875,0.175048828125,0.0999755859375,0.1500244140625,0.04998779296875,0.199951171875],[0.125,0.175048828125,0.04998779296875,0.07501220703125,0.199951171875,0.07501220703125,0.024993896484375,0.1500244140625],[0.175048828125,0.0,0.0,0.0999755859375,0.0999755859375,0.1500244140625,0.07501220703125,0.024993896484375],[0.1500244140625,0.199951171875,0.0999755859375,0.0999755859375,0.125,0.175048828125,0.199951171875,0.0],[0.175048828125,0.0999755859375,0.024993896484375,0.175048828125,0.125,0.07501220703125,0.175048828125,0.175048828125],[0.175048828125,0.175048828125,0.2249755859375,0.125,0.175048828125,0.0,0.04998779296875,0.175048828125],[0.175048828125,0.024993896484375,0.125,0.1500244140625,0.1500244140625,0.07501220703125,0.0,0.04998779296875],[0.125,0.0999755859375,0.024993896484375,0.199951171875,0.175048828125,0.0999755859375,0.04998779296875,0.125],[0.199951171875,0.04998779296875,0.1500244140625,0.0999755859375,0.04998779296875,0.07501220703125,0.199951171875,0.125],[0.1500244140625,0.0,0.125,0.175048828125,0.024993896484375,0.07501220703125,0.199951171875,0.0999755859375],[0.175048828125,0.04998779296875,0.07501220703125,0.125,0.024993896484375,0.2249755859375,0.0,0.0],[0.024993896484375,0.0999755859375,0.1500244140625,0.07501220703125,0.125,0.2249755859375,0.0,0.0],[0.04998779296875,0.125,0.175048828125,0.04998779296875,0.125,0.0999755859375,0.0999755859375,0.04998779296875],[0.125,0.175048828125,0.0,0.2249755859375,0.199951171875,0.175048828125,0.1500244140625,0.1500244140625]]" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 16x8xf16=[0.465332 0.345703 0.341064 0.440674 0.495605 0.424805 0.296875 0.393555][0.420166 0.324707 0.314941 0.414551 0.470215 0.408447 0.268555 0.361084][0.404785 0.305176 0.304688 0.390381 0.438232 0.378174 0.262207 0.349609][0.33252 0.25708 0.235596 0.327393 0.364258 0.320312 0.222168 0.287109][0.326172 0.256592 0.235107 0.332031 0.377686 0.316895 0.204224 0.300537][0.484131 0.361328 0.346436 0.46875 0.525391 0.442871 0.303467 0.425293][0.422852 0.324219 0.311035 0.41626 0.472412 0.398926 0.26709 0.374023][0.447754 0.33252 0.314697 0.439209 0.487305 0.414551 0.285156 0.395508][0.30127 0.23938 0.229736 0.297363 0.343994 0.293701 0.192749 0.272705][0.355225 0.271729 0.270752 0.34668 0.391846 0.334717 0.227539 0.305664][0.375 0.286133 0.267822 0.369141 0.416992 0.348877 0.237549 0.334473][0.454834 0.349365 0.334717 0.44165 0.502441 0.426758 0.292725 0.390381][0.404541 0.302002 0.296631 0.390137 0.437012 0.379639 0.260742 0.34668][0.348877 0.269043 0.263428 0.335205 0.386719 0.33252 0.22522 0.300293][0.297363 0.216431 0.211182 0.283203 0.311523 0.266113 0.194946 0.25708][0.320068 0.242188 0.235107 0.30835 0.349365 0.299072 0.203125 0.275879] diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir deleted file mode 100644 index 49f07d2c963a..000000000000 --- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir +++ /dev/null @@ -1,69 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - 
%variant_op: !transform.any_op) { - - // Step 1. Find the fill and matmul ops - // =========================================================================== - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %fill0, %fill1 = transform.split_handle %fill : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul0, %matmul1 = transform.split_handle %matmul : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 2. Tile the matmul and fuse the fill - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %matmul1 tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - - transform.structured.fuse_into_containing_op %fill1 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %matmul0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Vectorize - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. Bufferize - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op) - - // Step 5. Pre-process the contract and transfer ops to put it in the right form. - // =========================================================================== - transform.apply_patterns to %memref_func { - transform.apply_patterns.iree.prepare_vector_to_mma - } : !transform.any_op - - // Step 6. 
Post-bufferization vector distribution - // =========================================================================== - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () - - // Step 7. Do layout analysis and lower to mma - // =========================================================================== - %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op) - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir deleted file mode 100644 index 8fad32f799b6..000000000000 --- a/tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%func: !transform.any_op) { - %ops = transform.structured.match ops{["linalg.fill", "linalg.matmul"]} - in %func : (!transform.any_op) -> !transform.any_op - - %fill0, %matmul0, %fill1, %matmul1 = - transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op) - - %region_op = transform.iree.wrap_in_dispatch_region %matmul1 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op - - %non_matmul1 = transform.merge_handles %fill0, %matmul0, %fill1 : !transform.any_op - %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_matmul1 into %region_op : (!transform.any_op, !transform.any_op) -> !transform.any_op - - %empty = transform.structured.match ops{["tensor.empty"]} in %func : (!transform.any_op) -> !transform.any_op - %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2 : (!transform.any_op, !transform.any_op) -> !transform.any_op - transform.iree.region_to_workgroups %region_op_3 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} diff --git a/tests/transform_dialect/cuda/eltwise_reduction.mlir b/tests/transform_dialect/cuda/eltwise_reduction.mlir deleted file mode 100644 index eabf1a0cf739..000000000000 --- a/tests/transform_dialect/cuda/eltwise_reduction.mlir +++ /dev/null @@ -1,39 +0,0 @@ -!in_tensor_t = tensor<8x64xf32> -!out_tensor_t = tensor<8xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = tensor.empty() : !in_tensor_t - %3 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%arg : !in_tensor_t) outs(%2 : !in_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg3 : f32 - %5 = arith.addf %4, %4 : f32 - linalg.yield %5 : f32 - } -> !in_tensor_t - - %6 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%3 : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg4 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - return %6 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: 
iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 8xf32=256 256 256 256 256 256 256 256 diff --git a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir deleted file mode 100644 index 70aa3322dfd3..000000000000 --- a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir +++ /dev/null @@ -1,51 +0,0 @@ -!in_tensor_t = tensor<8x64xf32> -!out_tensor_t = tensor<8xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = tensor.empty() : !in_tensor_t - %3 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%arg : !in_tensor_t) outs(%2 : !in_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg3 : f32 - %5 = arith.addf %4, %4 : f32 - linalg.yield %5 : f32 - } -> !in_tensor_t - - %6 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%3 : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg4 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - %7 = tensor.empty() : !out_tensor_t - %8 = linalg.generic { - indexing_maps = [affine_map<(d0) -> (d0)>, - affine_map<(d0) -> (d0)>], - iterator_types = ["parallel"]} - ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = math.sqrt %arg3 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - - return %8 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 8xf32=16 16 16 16 16 16 16 16 diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir deleted file mode 100644 index c1b1c2904c44..000000000000 --- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir +++ /dev/null @@ -1,29 +0,0 @@ -#map = affine_map<(d0, d1) -> (d0, d1)> -func.func @matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<8x16xf16>, %bias : tensor<16x8xf16>) -> tensor<16x8xf16> { - %c0 = arith.constant 0.0 : f16 - %0 = tensor.empty() : tensor<16x8xf16> - %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x8xf16>) -> tensor<16x8xf16> - %2 = linalg.matmul_transpose_b ins(%lhs, %rhs : tensor<16x16xf16>, tensor<8x16xf16>) - outs(%1 : tensor<16x8xf16>) -> tensor<16x8xf16> - %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types=["parallel", "parallel"]} - ins(%2, %bias : tensor<16x8xf16>, tensor<16x8xf16>) outs(%0 : tensor<16x8xf16>) { - ^bb0(%arg0: f16, %arg1: f16, %arg2: f16): - %10 = arith.subf %arg0, %arg1 : f16 - %11 = math.exp %10 : f16 - linalg.yield %11 : f16 - } -> tensor<16x8xf16> - return %3 : tensor<16x8xf16> -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: 
--iree-codegen-transform-dialect-library=%p/mma_elemwise_layout_analysis_codegen_spec.mlir@__transform_main | \ -// RUN: iree-run-module --module=- --function=matmul --device=cuda \ -// RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.0499877
9296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \ -// RUN: --input="8x16xf16=[[0.175049 0.0999756 0.0249939 0.224976 0.224976 0.199951 0.150024 0.0499878 0.224976 0.0249939 0.224976 0.150024 0.125 0.150024 0.125 0.125][0.0750122 0.175049 0.199951 0.0750122 0.224976 0.150024 0.125 0.175049 0.125 0.125 0.0249939 0.0249939 0.0999756 0.224976 0.0750122 0.0249939][0.199951 0.0750122 0 0.199951 0.125 0.0249939 0.0249939 0.125 0.224976 0 0.0499878 0 0 0.0499878 0.175049 0.0999756][0 0.0499878 0.150024 0.0999756 0.175049 0.224976 0.0750122 0.175049 0.150024 0.0249939 0 0.0999756 0.0999756 0.125 0.150024 0.175049][0.175049 0.125 0.175049 0.0999756 0 0.0249939 0.125 0.175049 0 0.175049 0 0.125 0.199951 0.150024 0.175049 0.0249939][0.125 0.125 0.0999756 0.224976 0.0750122 0.150024 0.125 0.0750122 0 0.175049 0.150024 0.150024 0.125 0 0 0][0.199951 0.0750122 0.175049 0.0999756 0.0499878 0.224976 0.0750122 0.0249939 0.150024 0.0249939 0.0750122 0.224976 0.175049 0 0.0499878 0.0249939][0.0499878 0.224976 0.150024 0.0999756 0 0.199951 0.150024 0.125 0.125 0.125 0.224976 0 0.175049 0.0999756 0.125 0]]" \ -// RUN: --input="16x8xf16=[[0.0,-0.03173828125,-0.1318359375,-0.044189453125,-0.0655517578125,-0.126220703125,-0.076171875,-0.041259765625],[0.0,-0.0855712890625,-0.157470703125,-0.09619140625,-0.1124267578125,-0.0718994140625,-0.04052734375,-0.0531005859375],[0.0,-0.065673828125,-0.118896484375,-0.0438232421875,-0.1031494140625,-0.1051025390625,-0.06884765625,-0.0750732421875],[0.0,-0.0911865234375,-0.11810302734375,-0.09375,-0.0711669921875,-0.06494140625,-0.083740234375,-0.0755615234375],[0.0,-0.0150146484375,-0.125,-0.064453125,-0.0462646484375,-0.065673828125,-0.064453125,-0.0325927734375],[0.0,-0.026123046875,-0.1287841796875,-0.078125,-0.1043701171875,-0.125,-0.1368408203125,-0.06005859375],[0.0,-0.036865234375,-0.1300048828125,-0.0699462890625,-0.078125,-0.11376953125,-0.088134765625,-0.03369140625],[0.0,-0.06201171875,-0.0894775390625,-0.0594482421875,-0.078857421875,-0.10693359375,-0.0982666015625,-0.0863037109375],[-0.021240234375,0.0,-0.15185546875,-0.036865234375,-0.0380859375,-0.0587158203125,-0.0343017578125,-0.045654296875],[0.0,-0.052490234375,-0.1268310546875,-0.04248046875,-0.0955810546875,-0.0399169921875,-0.029296875,-0.060546875],[0.0,-0.0162353515625,-0.1187744140625,-0.043701171875,-0.079345703125,-0.0924072265625,-0.1112060546875,-0.0599365234375],[0.0,-0.0706787109375,-0.175048828125,-0.054931640625,-0.1119384765625,-0.1356201171875,-0.076904296875,-0.06005859375],[0.0,-0.0662841796875,-0.105712890625,-0.0782470703125,-0.0838623046875,-0.078857421875,-0.061279296875,-0.0494384765625],[0.0,-0.0374755859375,-0.138671875,-0.0374755859375,-0.086181640625,-0.09619140625,-0.05615234375,-0.0318603515625],[0.0,-0.0455322265625,-0.0450439453125,-0.023681640625,-0.0343017578125,-0.07745361328125,-0.09375,-0.05126953125],[0.0,-0.041259765625,-0.11566162109375,-0.0462646484375,-0.061279296875,-0.06689453125,-0.0706787109375,-0.0325927734375]]" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 16x8xf16=[1.34863 1.34863 1.34863 1.34863 1.34863 1.34863 1.34863 1.34863][1.34766 1.34766 1.34766 1.34766 1.34668 1.34766 1.34668 1.34766][1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715][1.27246 1.27246 
1.27246 1.27246 1.27246 1.27246 1.27246 1.27246][1.25586 1.25586 1.25586 1.25684 1.25586 1.25586 1.25586 1.25684][1.37598 1.37598 1.37598 1.37598 1.37598 1.37598 1.37598 1.37598][1.33984 1.33984 1.33984 1.33984 1.33984 1.33984 1.33984 1.33984][1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715][1.24023 1.24023 1.24023 1.24023 1.24023 1.24023 1.23926 1.24023][1.26855 1.26855 1.26855 1.26855 1.26855 1.26855 1.26855 1.26855][1.28516 1.28516 1.28516 1.28516 1.28516 1.28516 1.28516 1.28516][1.36523 1.36523 1.36523 1.36523 1.36523 1.36523 1.36523 1.36523][1.31445 1.31445 1.31445 1.31445 1.31445 1.31445 1.31445 1.31445][1.28027 1.28027 1.28027 1.28027 1.28027 1.28027 1.28027 1.28027][1.21387 1.21387 1.21387 1.21387 1.21387 1.21387 1.21387 1.21387][1.24902 1.24902 1.24902 1.24902 1.24902 1.24902 1.24902 1.24902] diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir deleted file mode 100644 index 138fc5899ae8..000000000000 --- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir +++ /dev/null @@ -1,65 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main( - %variant_op: !transform.any_op) { - // Step 1. Find the fill, matmul and generic ops - // =========================================================================== - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %matmul = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %generic = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 2. Tile the generic and fuse the fill and matmul - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - - transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Vectorize - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. 
Bufferize - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op) - - // Step 6. Post-bufferization vector distribution - // =========================================================================== - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %memref_func - workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () - - // Step 7. Do layout analysis and lower to mma - // =========================================================================== - %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op) - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir deleted file mode 100644 index 198b513aeeda..000000000000 --- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir +++ /dev/null @@ -1,38 +0,0 @@ -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d0)> -func.func @matmul_reduction(%lhs : tensor<16x16xf16>, %rhs : tensor<16x16xf16>) -> tensor<16x16xf16> { - %c0 = arith.constant 0.0 : f16 - %c1 = arith.constant -1.0e+04 : f16 - %acc = tensor.empty() : tensor<16xf16> - %init = linalg.fill ins(%c1 : f16) outs(%acc : tensor<16xf16>) -> tensor<16xf16> - %0 = tensor.empty() : tensor<16x16xf16> - %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x16xf16>) -> tensor<16x16xf16> - %2 = linalg.matmul_transpose_b ins(%lhs, %rhs : tensor<16x16xf16>, tensor<16x16xf16>) - outs(%1 : tensor<16x16xf16>) -> tensor<16x16xf16> - %6 = linalg.generic {indexing_maps - = [#map, #map1], iterator_types = ["parallel", "reduction"]} - ins(%2 : tensor<16x16xf16>) outs(%init : tensor<16xf16>) { - ^bb0(%in: f16, %out: f16): - %20 = arith.maximumf %in, %out : f16 - linalg.yield %20 : f16 - } -> tensor<16xf16> - %8 = linalg.generic {indexing_maps = [#map1, #map], iterator_types=["parallel", "parallel"]} - ins(%6 : tensor<16xf16>) outs(%0 : tensor<16x16xf16>) { - ^bb0(%in: f16, %out: f16): - linalg.yield %in : f16 - } -> tensor<16x16xf16> - return %8 : tensor<16x16xf16> -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-flow-dispatch-use-transform-dialect=%p/mma_reduction_layout_analysis_dispatch_spec.mlir \ -// RUN: --iree-codegen-transform-dialect-library=%p/mma_reduction_layout_analysis_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=matmul_reduction --device=cuda \ -// RUN: 
--input="16x16xf16=[[3.0,2.0,2.5,4.5,1.5,4.0,2.0,2.5,4.0,4.0,1.5,0.5,2.0,3.0,0.5,2.0],[2.5,2.5,0.5,3.5,0.0,2.5,3.5,1.0,0.5,0.0,3.0,4.5,0.5,0.5,0.0,3.5],[4.5,3.0,4.0,2.5,1.0,0.5,0.0,4.5,0.0,2.5,3.5,0.0,2.0,4.5,1.5,4.5],[0.0,2.0,1.5,0.0,2.0,1.5,3.0,2.0,2.0,4.0,4.0,2.5,0.0,3.0,2.0,0.5],[0.5,3.5,3.0,2.5,0.0,2.5,3.0,3.0,4.5,2.0,2.0,1.0,2.0,1.0,3.5,2.0],[0.0,4.5,2.0,4.0,2.5,2.5,1.5,1.5,1.5,3.0,3.0,0.0,2.5,0.5,2.0,2.0],[3.5,4.0,3.5,1.5,2.0,0.5,1.0,2.5,4.0,3.5,0.0,3.0,0.0,1.5,4.5,0.0],[4.5,3.5,1.0,4.5,0.5,0.0,1.5,4.5,1.5,3.5,3.0,2.5,0.0,0.5,0.0,4.0],[2.0,3.0,0.5,2.0,1.5,0.5,2.0,2.5,2.5,4.0,2.0,4.5,4.0,0.0,2.0,3.0],[2.5,4.0,4.0,3.0,2.0,2.0,4.5,0.5,4.5,1.0,2.0,0.0,4.5,1.0,3.0,0.5],[4.0,1.5,3.5,3.0,2.5,4.5,1.0,3.5,3.0,2.5,2.5,2.0,2.0,4.5,1.5,2.5],[3.0,3.0,0.0,2.5,1.0,3.0,0.0,1.5,1.5,2.5,0.5,1.0,3.0,3.5,1.5,1.5],[0.0,4.5,0.5,1.5,0.5,4.0,3.5,4.0,4.0,0.0,0.5,1.0,4.5,1.5,0.0,3.5],[2.5,2.0,2.5,1.5,3.0,0.0,2.0,1.0,2.5,4.0,0.0,4.0,4.0,1.5,3.0,2.5],[3.0,0.0,4.0,4.0,2.0,0.5,1.0,3.5,4.0,2.5,4.0,4.5,0.0,3.0,1.5,2.5],[0.5,0.5,2.5,4.0,1.0,2.5,0.5,4.5,2.0,3.0,1.5,4.5,1.5,4.5,0.5,1.5]]" \ -// RUN: --input="16x16xf16=[[3.5,3.0,4.5,3.0,3.0,0.0,2.0,2.5,2.0,0.0,4.5,2.5,0.5,0.0,4.0,3.5],[0.0,0.5,2.0,4.5,0.0,4.0,1.5,3.5,0.5,2.5,3.5,1.5,3.5,4.5,4.0,3.0],[3.0,3.5,2.5,1.5,1.5,1.5,0.5,4.5,0.0,3.5,4.0,0.0,0.0,2.0,0.5,1.0],[1.5,4.0,3.5,3.5,0.0,0.0,0.0,2.0,3.0,1.5,0.0,3.0,0.0,2.5,2.0,3.0],[3.5,4.0,2.5,1.5,3.0,2.0,3.0,4.5,1.5,3.0,2.0,3.5,2.5,4.5,0.5,3.5],[0.0,0.0,0.0,0.5,1.0,2.5,1.5,1.0,2.5,1.5,0.0,1.5,1.5,2.0,4.5,2.5],[4.0,1.5,3.0,2.5,2.5,3.5,2.0,4.0,1.5,2.5,0.5,4.0,1.0,4.5,3.5,0.0],[1.0,2.0,4.0,4.5,4.5,3.5,0.0,1.0,4.5,3.5,2.0,3.0,0.5,4.0,3.5,1.5],[1.0,0.0,2.5,4.5,0.0,2.0,0.0,2.5,3.0,4.0,2.5,0.5,3.5,0.0,3.5,1.0],[0.0,3.5,4.0,0.0,0.0,4.5,1.0,3.5,1.5,3.0,2.0,1.0,0.5,0.5,2.0,0.0],[1.5,0.0,4.5,2.0,4.5,4.5,3.5,3.0,2.5,4.5,0.5,0.5,0.0,4.5,0.0,4.0],[4.5,3.5,4.0,4.0,1.5,4.0,1.0,4.0,2.5,0.5,4.5,3.5,3.5,0.5,4.5,3.0],[0.0,3.0,2.5,1.0,1.5,2.0,1.0,1.5,4.0,2.5,3.5,1.0,3.5,2.5,3.5,4.5],[1.5,4.5,2.0,2.0,2.0,0.5,4.0,2.0,4.0,3.5,4.0,1.0,1.5,2.5,1.0,0.0],[0.0,0.0,1.0,2.5,3.5,2.5,4.0,0.0,2.0,2.0,4.5,0.5,1.0,3.5,3.0,2.5],[2.0,2.0,0.5,2.0,4.5,2.5,3.0,1.5,4.5,2.0,3.5,3.0,1.0,2.0,1.5,2.0]]" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 16x16xf16=[116 116 116 116 116 116 116 116 116 116 116 116 116 116 116 116][96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5][124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75][86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75][115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5][103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75][109 109 109 109 109 109 109 109 109 109 109 109 109 109 109 109][114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75][110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75][122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75][136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5][87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75][102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 
102.75][102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75][126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25][106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106] diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir deleted file mode 100644 index 746625f798f8..000000000000 --- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir +++ /dev/null @@ -1,68 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op) { - - // Step 1. Find the fill, matmul and generic ops - // =========================================================================== - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %matmul = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %reduce = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %broadcast = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 2. Tile the matmul and fuse the fill - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %broadcast tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - transform.structured.fuse_into_containing_op %reduce into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Vectorize - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. 
Bufferize - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op) - - // Step 6. Post-bufferization vector distribution - // =========================================================================== - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () - - // Step 7. Do layout analysis and lower to mma - // =========================================================================== - %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op) - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir deleted file mode 100644 index 998146768b09..000000000000 --- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%func: !transform.any_op) { - %ops = transform.structured.match ops{["linalg.fill", "linalg.matmul_transpose_b", "linalg.generic"]} - in %func : (!transform.any_op) -> !transform.any_op - - %fill0, %fill1, %matmul, %reduce, %broadcast = - transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op) - - %region_op = transform.iree.wrap_in_dispatch_region %broadcast { generateWorkload = false } : (!transform.any_op) -> !transform.any_op - - %non_broadcast = transform.merge_handles %fill0, %fill1, %matmul, %reduce : !transform.any_op - %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_broadcast into %region_op : (!transform.any_op, !transform.any_op) -> !transform.any_op - - %empty = transform.structured.match ops{["tensor.empty"]} in %func : (!transform.any_op) -> !transform.any_op - %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2 : (!transform.any_op, !transform.any_op) -> !transform.any_op - transform.iree.region_to_workgroups %region_op_3 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir deleted file mode 100644 index 5c0e240db337..000000000000 --- a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir +++ /dev/null @@ -1,20 +0,0 @@ -func.func @matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<16x8xf16>) -> tensor<16x8xf16> { - %c0 = arith.constant 0.0 : f16 - %0 = 
tensor.empty() : tensor<16x8xf16> - %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x8xf16>) -> tensor<16x8xf16> - %2 = linalg.matmul ins(%lhs, %rhs : tensor<16x16xf16>, tensor<16x8xf16>) - outs(%1 : tensor<16x8xf16>) -> tensor<16x8xf16> - return %2 : tensor<16x8xf16> -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/mma_using_layout_analysis_codegen_spec.mlir@__transform_main | \ -// RUN: iree-run-module --module=- --function=matmul --device=cuda \ -// RUN: --input="16x16xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875,2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875,4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875,6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875,8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875,10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875,12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875,14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875,16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875],[17.0,17.125,17.25,17.375,17.5,17.625,17.75,17.875,18.0,18.125,18.25,18.375,18.5,18.625,18.75,18.875],[19.0,19.125,19.25,19.375,19.5,19.625,19.75,19.875,20.0,20.125,20.25,20.375,20.5,20.625,20.75,20.875],[21.0,21.125,21.25,21.375,21.5,21.625,21.75,21.875,22.0,22.125,22.25,22.375,22.5,22.625,22.75,22.875],[23.0,23.125,23.25,23.375,23.5,23.625,23.75,23.875,24.0,24.125,24.25,24.375,24.5,24.625,24.75,24.875],[25.0,25.125,25.25,25.375,25.5,25.625,25.75,25.875,26.0,26.125,26.25,26.375,26.5,26.625,26.75,26.875],[27.0,27.125,27.25,27.375,27.5,27.625,27.75,27.875,28.0,28.125,28.25,28.375,28.5,28.625,28.75,28.875],[29.0,29.125,29.25,29.375,29.5,29.625,29.75,29.875,30.0,30.125,30.25,30.375,30.5,30.625,30.75,30.875],[31.0,31.125,31.25,31.375,31.5,31.625,31.75,31.875,32.0,32.125,32.25,32.375,32.5,32.625,32.75,32.875]]" \ -// RUN: --input="16x8xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875],[2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875],[4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875],[6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875],[8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875],[10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875],[12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875],[14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875],[16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875]]" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 16x8xf16=[306 310 313.75 317.5 321.5 325.5 329.25 333][578 586 594 601.5 609.5 617.5 625 633][850 862 874 885.5 897.5 909.5 921 933][1122 1138 1154 1170 1186 1201 1217 1233][1394 1414 1434 1454 1474 1493 1513 1533][1666 1690 1714 1738 1762 1785 1809 1833][1938 1966 1994 2022 2050 2078 2106 2134][2210 2242 2274 2306 2338 2370 2402 2434][2482 2518 2554 2590 2626 2662 2698 2734][2754 2794 2834 2874 2914 2954 2994 3034][3026 3070 
3114 3158 3202 3246 3290 3334][3298 3346 3394 3442 3490 3538 3586 3634][3570 3622 3674 3726 3778 3830 3882 3934][3842 3898 3954 4010 4066 4120 4176 4232][4112 4172 4232 4292 4352 4412 4472 4532][4384 4448 4512 4576 4640 4704 4768 4832] diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir deleted file mode 100644 index ee3f7336258d..000000000000 --- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir +++ /dev/null @@ -1,72 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main( - %variant_op: !transform.any_op {transform.consumed}) { - // Step 1. Find the fill and matmul ops - // =========================================================================== - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 2. Tile the matmul and fuse the fill - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %matmul tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - - transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Promote operands in order to test loading from shared memory. - %matmul_2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %promoted_matmul, %alloc_0, %alloc_1 = - transform.iree.promote_operands %matmul_2 [0, 1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - - - // Step 3. Vectorize - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. 
Bufferize - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %memref_func = transform.iree.bufferize { target_gpu } %func_3: (!transform.any_op) -> (!transform.any_op) - - // Step 5. Pre-process the contract and transfer ops to put it in the right form. - // =========================================================================== - transform.apply_patterns to %memref_func { - transform.apply_patterns.iree.prepare_vector_to_mma - } : !transform.any_op - - // Step 6. Post-bufferization vector distribution - // =========================================================================== - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %memref_func - workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () - - // Step 7. Do layout analysis and lower to mma - // =========================================================================== - %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op) - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/reduction.mlir b/tests/transform_dialect/cuda/reduction.mlir deleted file mode 100644 index d506a1740cf1..000000000000 --- a/tests/transform_dialect/cuda/reduction.mlir +++ /dev/null @@ -1,33 +0,0 @@ -!in_tensor_t = tensor<8x64xf32> -!out_tensor_t = tensor<8xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %3 = arith.addf %arg3, %arg4 : f32 - linalg.yield %3 : f32 - } -> !out_tensor_t - return %2 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit only works for exactly this reduction atm. 
-// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 8xf32=64 64 64 64 64 64 64 64 diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir deleted file mode 100644 index c8b46b3756a6..000000000000 --- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir +++ /dev/null @@ -1,125 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism. - // =========================================================================== - %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op = - transform.structured.split_reduction %0 - { split_factor = 2, insert_split_dimension = 1 } - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - - // Step 2. First level of tiling + fusion parallelizes to blocks. - // =========================================================================== - %grid_combiner_op, %forall_grid = - transform.structured.tile_using_forall %combiner_op tile_sizes [1] - ( mapping = [#gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - %not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op : !transform.any_op - transform.structured.fuse_into_containing_op %not_combiner into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Second level of tiling + fusion parallelizes to threads. - // =========================================================================== - %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op - : (!transform.any_op) -> !transform.any_op - %block_combiner_op, %forall_block_combiner_op = - transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. 
- %func_op = transform.structured.match ops{["func.func"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op - : (!transform.any_op) -> !transform.any_op - %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op - : (!transform.any_op) -> !transform.any_op - %block_more_parallel_op, %forall_block_more_parallel_op = - transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 4. Rank-reduce and vectorize. - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 5. Bufferize and drop HAL decriptor from memref ops. - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - } : !transform.any_op - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - - // Step 6. Post-bufferization mapping to blocks and threads. - // =========================================================================== - %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_5 - workgroup_dims = [32, 2, 1] : (!transform.any_op) -> () - - // Step 7. Post-bufferization vector distribution with rank-reduction. 
- // =========================================================================== - transform.apply_patterns to %func_5 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0) - // at this point. - transform.sequence %variant_op_3 : !transform.any_op failures(suppress) { - ^bb0(%arg0: !transform.any_op): - transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op - } - transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> () - - - // Late Canonicalizations. - %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op_3 : !transform.any_op - transform.apply_cse to %func_op_3 : !transform.any_op - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/reduction_eltwise.mlir b/tests/transform_dialect/cuda/reduction_eltwise.mlir deleted file mode 100644 index 0bd49b5d4298..000000000000 --- a/tests/transform_dialect/cuda/reduction_eltwise.mlir +++ /dev/null @@ -1,45 +0,0 @@ -!in_tensor_t = tensor<8x64xf32> -!out_tensor_t = tensor<8xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %5 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg4 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - %6 = tensor.empty() : !out_tensor_t - %7 = linalg.generic { - indexing_maps = [affine_map<(d0) -> (d0)>, - affine_map<(d0) -> (d0)>], - iterator_types = ["parallel"]} - ins(%5 : !out_tensor_t) outs(%6 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = math.sqrt %arg3 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - return %7 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit -/// only works for 
exactly this reduction atm. -// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 8xf32=8 8 8 8 8 8 8 8 diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir deleted file mode 100644 index 6422201305af..000000000000 --- a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir +++ /dev/null @@ -1,164 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - - // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism. - // =========================================================================== - %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - %reduction, %eltwise = transform.split_handle %0 - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op = - transform.structured.split_reduction %reduction - { split_factor = 2, insert_split_dimension = 1 } - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - - // Canonicalizations. - %func_op = transform.structured.match ops{["func.func"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the - // trailing elementwise the same way we want to tile the reduction. - // =========================================================================== - %eltwise_grid_op, %grid_loop = transform.structured.tile_using_forall %eltwise - tile_sizes [1] (mapping = [#gpu.block]) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %grid_loop : (!transform.any_op) -> () - %not_eltwise = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op, %combiner_op - : !transform.any_op - transform.structured.fuse_into_containing_op %not_eltwise into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - // Step 3. Second level of tiling + fusion parallelizes to threads. 
- // =========================================================================== - %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op - : (!transform.any_op) -> !transform.any_op - %eltwise_block_op, %eltwise_block_loop = - transform.structured.tile_using_forall %eltwise_grid_op tile_sizes [1] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %block_combiner_op = transform.structured.match ops{["linalg.generic"]} - attributes {iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} in %variant_op - : (!transform.any_op) -> !transform.any_op - %combined_and_fill = transform.merge_handles %fill_1d, %block_combiner_op : !transform.any_op - transform.structured.fuse_into_containing_op %combined_and_fill into %eltwise_block_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op - : (!transform.any_op) -> !transform.any_op - %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op - : (!transform.any_op) -> !transform.any_op - %block_more_parallel_op, %forall_block_more_parallel_op = - transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - // Step 4. Rank-reduce and vectorize. - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 5. Bufferize and drop HAL decriptor from memref ops. 
- // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - } : !transform.any_op - transform.iree.eliminate_empty_tensors %variant_op: (!transform.any_op) -> () - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op) - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - - // Step 6. Post-bufferization mapping to blocks and threads. - // =========================================================================== - %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_5 - workgroup_dims = [32, 2, 1] : (!transform.any_op) -> () - - // Step 7. Post-bufferization vector distribution with rank-reduction. - // =========================================================================== - transform.apply_patterns to %func_5 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0) - // at this point. - transform.sequence %variant_op_3 : !transform.any_op failures(suppress) { - ^bb0(%arg0: !transform.any_op): - transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } - : (!transform.any_op) -> !transform.any_op - } - transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> () - - - // Late canonicalizations. - %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op_3 : !transform.any_op - transform.apply_cse to %func_op_3 : !transform.any_op - - // Annotate the exported function as already translated. 
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/reduction_v2.mlir b/tests/transform_dialect/cuda/reduction_v2.mlir deleted file mode 100644 index 6d367d218397..000000000000 --- a/tests/transform_dialect/cuda/reduction_v2.mlir +++ /dev/null @@ -1,33 +0,0 @@ -!in_tensor_t = tensor<33x1024xf32> -!out_tensor_t = tensor<33xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %3 = arith.addf %arg3, %arg4 : f32 - linalg.yield %3 : f32 - } -> !out_tensor_t - return %2 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// only checking the first 6 of 33 -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 33xf32=1024 1024 1024 1024 1024 1024 diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir deleted file mode 100644 index bb9ecbfc6429..000000000000 --- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir +++ /dev/null @@ -1,113 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 1. First level of tiling + fusion parallelizes to blocks. - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %reduction tile_sizes [1] - ( mapping = [#gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 2. Split the reduction to get meatier parallelism. 
- // =========================================================================== - %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall = - transform.structured.tile_reduction_using_for %grid_reduction by tile_sizes = [0, 128] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - %_1:2 = - transform.structured.tile_using_forall %block_more_parallel_op_2 num_threads [0, 32] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Second level of tiling parallelizes to threads. - // =========================================================================== - // 1st op is [parallel, parallel], map it to threadIdx.x by 4. - %_2:2 = - transform.structured.tile_using_forall %block_more_parallel_fill_op_2 tile_sizes [0, 4] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // 2nd op is [parallel, reduction] of 1x128, map the 1-dim to threadIdx.y to - // trigger mapping of the reduction to threadIdx.x via predication via `if (x==0)`. - %_3:2 = - transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 4. Rank-reduce and vectorize. - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 5. Bufferize and drop HAL decriptor from memref ops. - // =========================================================================== - // Canonicalization/CSE is needed before bufferization otherwise unnecessary - // allocs will be created. - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %func_5 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_5 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op) - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - - // Step 6. Post-bufferization mapping to blocks and threads. 
- // =========================================================================== - %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_7 - workgroup_dims = [32, 1, 1] : (!transform.any_op) -> () - - // Step 7. Post-bufferization vector distribution with rank-reduction. - // =========================================================================== - transform.apply_patterns to %func_7 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op - transform.iree.vector.warp_distribute %func_7 - : (!transform.any_op) -> () - - // Late canonicalizations to cleanup and pass the checks - transform.apply_patterns to %func_7 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_7 : !transform.any_op - transform.apply_cse to %func_7 : !transform.any_op - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir deleted file mode 100644 index 66b88cdf2232..000000000000 --- a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir +++ /dev/null @@ -1,29 +0,0 @@ -!in_tensor_t = tensor<33x34567xf32> -!out_tensor_t = tensor<33xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %3 = arith.addf %arg3, %arg4 : f32 - linalg.yield %3 : f32 - } -> !out_tensor_t - return %2 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x34567xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// only checking the first 6 of 33 -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 33xf32=34567 34567 34567 34567 34567 34567 diff --git a/tests/transform_dialect/cuda/softmax.mlir 
b/tests/transform_dialect/cuda/softmax.mlir deleted file mode 100644 index 91bbc07b7a28..000000000000 --- a/tests/transform_dialect/cuda/softmax.mlir +++ /dev/null @@ -1,84 +0,0 @@ -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=softmax --device=cuda | \ -// RUN: FileCheck %s - - -!tmp_tensor_t = tensor<16x128xf32> -!in_tensor_t = tensor<16x128x128xf32> -!out_tensor_t = tensor<16x128x128xf32> - -// Execution only checks that @softmax runs. -// CHECK: EXEC @softmax -// CHECK: 16x128x128xf32=[ -// CHECK-SAME: [0.0078125 0.0078125 0.0078125 0.0078125 - -func.func @softmax() -> !out_tensor_t { - %cst_0 = arith.constant 0.0 : f32 - %cst_1 = arith.constant 1.0 : f32 - %cst_min = arith.constant -3.40282347E+38 : f32 - %input = arith.constant dense<5.000000e+00> : !out_tensor_t - util.optimization_barrier %input : !in_tensor_t - - %input_max_empty = tensor.empty() : !tmp_tensor_t - %input_max_filled = linalg.fill ins(%cst_min : f32) - outs(%input_max_empty : !tmp_tensor_t) -> !tmp_tensor_t - %input_max = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%input : !in_tensor_t) - outs(%input_max_filled : !tmp_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32): - %max = arith.maximumf %arg0, %arg1 : f32 - linalg.yield %max : f32 - } -> !tmp_tensor_t - - // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm. 
- %exps_empty = tensor.empty() : !out_tensor_t - %exps_sum_empty = tensor.empty() : !tmp_tensor_t - %exps_sum_filled = linalg.fill ins(%cst_0 : f32) - outs(%exps_sum_empty : !tmp_tensor_t) -> !tmp_tensor_t - %exps = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%input, %input_max : !in_tensor_t, !tmp_tensor_t) - outs(%exps_empty : !out_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): - %sub = arith.subf %arg0, %arg1 : f32 - %exp = math.exp %sub : f32 - linalg.yield %exp: f32 - } -> (!out_tensor_t) - - %exps_sum = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%exps : !out_tensor_t) - outs(%exps_sum_filled : !tmp_tensor_t) { - ^bb0(%exp: f32, %acc: f32): - %add = arith.addf %exp, %acc : f32 - linalg.yield %add : f32 - } -> (!tmp_tensor_t) - - %res_empty = tensor.empty() : !out_tensor_t - %res = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%exps, %exps_sum : !out_tensor_t, !tmp_tensor_t) - outs(%res_empty : !out_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): - // %10 = arith.divf %cst_1, %arg1 : f32 - // %11 = arith.mulf %arg0, %10 : f32 - %div = arith.divf %arg0, %arg1 : f32 - linalg.yield %div : f32 - } -> !out_tensor_t - - return %res: !out_tensor_t -} diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir deleted file mode 100644 index 86f49091536d..000000000000 --- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir +++ /dev/null @@ -1,119 +0,0 @@ -// RUN: iree-opt %s - -// Codegen -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %input_max_fill, - %input_max, - %exps_sum_fill, - %exps, - %exps_sum, - %div = transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op, !transform.any_op) - - // Step 1. First level of tiling + fusion parallelizes to blocks. - // ============================================================== - %_, %forall = - transform.structured.tile_using_forall %div tile_sizes [1, 4] - ( mapping = [#gpu.block, #gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () - - // TODO: Merging and fusing merged handles does not work properly atm. 
- transform.structured.fuse_into_containing_op %exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %exps into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - // By default, fusion into scf.forall does not promote captured values - // to shared as this involves a cross-thread dependence analysis. - // Instead, we activate it explicitly post-hoc to promote all the extract_slice - // ops that we find and match the prerequisites - %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall"> - transform.iree.share_forall_operands %forall_with_type - : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall"> - transform.apply_patterns to %variant_op { - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %variant_op : !transform.any_op - - // Step 2. Second level of tiling + fusion parallelizes to threads. - // ================================================================ - %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %tiled_input_max_fill, - %tiled_input_max, - %tiled_exps_sum_fill, - %tiled_exp_and_exps_sum, - %tiled_exp_and_exps_sum_2, - %tiled_div = transform.split_handle %tiled_ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op, !transform.any_op) - // Leaving the reduction untiled on threadIdx.x makes it sequential on - // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is - // introduced and opportunities for distributing vector ops across warps - // appear. - %reduction_linalg_ops = transform.merge_handles %tiled_input_max, - %tiled_exp_and_exps_sum, - %tiled_exp_and_exps_sum_2 - : !transform.any_op - transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // Fully parallel ops are tiled and mapped. - %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill, - %tiled_exps_sum_fill, - %tiled_div - : !transform.any_op - transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32] - ( mapping = [#gpu.thread, #gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Rank-reduce and vectorize. 
- // ==================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
- // Step 4. Bufferize and drop HAL descriptor from memref ops.
- // =========================================================
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
- // Step 5. Post-bufferization mapping to blocks and threads.
- // =========================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
- // Step 6. Post-bufferization vector distribution with rank-reduction.
- // ===================================================================
- %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %end_func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.memref.fold_memref_alias_ops
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
- transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
-
- // Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param
- transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
-
- transform.yield
- }
-} // module
diff --git a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir b/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
deleted file mode 100644
index 09c2c07895ee..000000000000
--- a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: iree-opt %s
-
-// Dispatch softmax.
-module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%func: !transform.any_op) { - %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %func : (!transform.any_op) -> !transform.any_op - - %input_max_fill, %input_max, %exps_sum_fill, %exps, %exps_sum, %div = - transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op, !transform.any_op) - - /// This must be used with the custom dispatch region formation - /// because IREE's does not fuse the 6 ops softmax version even with - /// --iree-flow-fuse-multi-use. - %region_op = transform.iree.wrap_in_dispatch_region %div { generateWorkload = false } : (!transform.any_op) -> !transform.any_op - - %non_div = transform.merge_handles %input_max_fill, %input_max, %exps_sum_fill, %exps, %exps_sum - : !transform.any_op - %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_div into %region_op : (!transform.any_op, !transform.any_op) -> !transform.any_op - - %empty = transform.structured.match ops{["tensor.empty"]} in %func : (!transform.any_op) -> !transform.any_op - %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2 : (!transform.any_op, !transform.any_op) -> !transform.any_op - transform.iree.region_to_workgroups %region_op_3 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} diff --git a/tests/transform_dialect/cuda/softmax_partial.mlir b/tests/transform_dialect/cuda/softmax_partial.mlir deleted file mode 100644 index 018ad8c42ac8..000000000000 --- a/tests/transform_dialect/cuda/softmax_partial.mlir +++ /dev/null @@ -1,46 +0,0 @@ -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=softmax_partial --device=cuda | \ -// RUN: FileCheck %s - -!tmp_tensor_t = tensor<16x128xf32> -!out_tensor_t = tensor<16x128x128xf32> - -// Execution only checks that @softmax_partial runs. -// CHECK: EXEC @softmax_partial -// CHECK: 16x128x128xf32=[ -// CHECK-SAME: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - -func.func @softmax_partial() -> !out_tensor_t { - %cst = arith.constant -3.40282347E+38 : f32 - %cst_0 = arith.constant dense<1121212.000000e+00> : !out_tensor_t - %cst_1 = arith.constant dense<5.000000e+00> : !out_tensor_t - %0 = util.optimization_barrier %cst_1 : !out_tensor_t - - %1 = tensor.empty() : !tmp_tensor_t - %2 = linalg.fill ins(%cst : f32) outs(%1 : !tmp_tensor_t) -> !tmp_tensor_t - %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%0 : !out_tensor_t) outs(%2 : !tmp_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32): - %8 = arith.maximumf %arg0, %arg1 : f32 - linalg.yield %8 : f32 - } -> !tmp_tensor_t - - // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm. 
- %4 = tensor.empty() : !out_tensor_t - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%0, %3 : !out_tensor_t, !tmp_tensor_t) outs(%4 : !out_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): - %6 = arith.subf %arg0, %arg1 : f32 - %7 = math.exp %6 : f32 - linalg.yield %7 : f32 - } -> !out_tensor_t - - return %5: !out_tensor_t -} diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir deleted file mode 100644 index 65ea847af92c..000000000000 --- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir +++ /dev/null @@ -1,102 +0,0 @@ -// RUN: iree-opt %s - -// Codegen -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - // Step 1. First level of tiling + fusion parallelizes to blocks. - // ============================================================== - %root = transform.structured.match interface{LinalgOp} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %red = transform.structured.match interface{LinalgOp} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %not_root = transform.merge_handles %fill, %red : !transform.any_op - %tiled_generic, %forall = - transform.structured.tile_using_forall %root tile_sizes [1, 4] - ( mapping = [#gpu.block, #gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () - transform.structured.fuse_into_containing_op %not_root into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 2. Second level of tiling + fusion parallelizes to threads. - // ================================================================ - %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %reduction_linalg = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %parallel_linalg = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %tiled_reduction_generic, %forall_reduction = - transform.structured.tile_using_forall %reduction_linalg tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // TODO: this fusion currently does not happen properly, this is related to the clone - // behavior when fusing into scf.forall. - // Once fixed we'll be able to fuse. - // Fusion will save us one roundtrip to memory. 
- // transform.structured.fuse_into_containing_op %fill_linalg into %forall_reduction
- transform.structured.tile_using_forall %parallel_linalg num_threads [1, 4, 32]
- ( mapping = [#gpu.thread, #gpu.thread, #gpu.thread] )
- : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-
- // Inability to tile reductions to scf.forall has two implications:
- // 1. Since no scf.forall is present, no gpu.barrier is added.
- // This should be fixed independently: ops that are not nested in an scf.forall
- // should have a gpu.barrier. The latter needs to be complemented by a barrier
- // removal pass.
- // 2. Similarly, it needs to be predicated under an if (threadIdx.x == 0) to avoid
- // multiple threads updating the buffer in place once bufferized.
- //
- // Instead, we can vectorize and go to vector SSA values that sidestep these
- // issues.
- // All threads race to the write while still computing the same value.
- //
- // That is still not good enough because we need to predicate this in order
- // to enable the parallel reduction on warps.
-
- // Step 3. Rank-reduce and vectorize.
- // ==================================
- %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
- // Step 4. Bufferize and drop HAL descriptor from memref ops.
- // =========================================================
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
- // Step 5. Post-bufferization mapping to blocks and threads.
- // =========================================================
- %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
- // Step 6. Post-bufferization vector distribution with rank-reduction.
- // ===================================================================
- %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %end_func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.memref.fold_memref_alias_ops
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
- : (!transform.any_op) -> !transform.any_op
- transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
-
- // Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/softmax_v2.mlir b/tests/transform_dialect/cuda/softmax_v2.mlir deleted file mode 100644 index 5ef3a90347b9..000000000000 --- a/tests/transform_dialect/cuda/softmax_v2.mlir +++ /dev/null @@ -1,76 +0,0 @@ -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-flow-fuse-multi-use \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=softmax --device=cuda | \ -// RUN: FileCheck %s - -!tmp_tensor_t = tensor<16x128xf32> -!in_tensor_t = tensor<16x128x128xf32> -!out_tensor_t = tensor<16x128x128xf32> - -// Execution only checks that @softmax runs. -// CHECK: EXEC @softmax -// CHECK: 16x128x128xf32=[ -// CHECK-SAME: [0.0078125 0.0078125 0.0078125 0.0078125 - -func.func @softmax() -> !out_tensor_t { - %cst_0 = arith.constant 0.0 : f32 - %cst_1 = arith.constant 1.0 : f32 - %cst_min = arith.constant -3.40282347E+38 : f32 - %input = arith.constant dense<5.000000e+00> : !out_tensor_t - util.optimization_barrier %input : !in_tensor_t - - %softmax = flow.dispatch.region[] -> (!out_tensor_t) { - %input_max_empty = tensor.empty() : !tmp_tensor_t - %input_max_filled = linalg.fill ins(%cst_min : f32) - outs(%input_max_empty : !tmp_tensor_t) -> !tmp_tensor_t - %input_max = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%input : !in_tensor_t) - outs(%input_max_filled : !tmp_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32): - %max = arith.maximumf %arg0, %arg1 : f32 - linalg.yield %max : f32 - } -> !tmp_tensor_t - - // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm. 
- %exps_empty = tensor.empty() : !out_tensor_t - %exps_sum_empty = tensor.empty() : !tmp_tensor_t - %exps_sum_filled = linalg.fill ins(%cst_0 : f32) - outs(%exps_sum_empty : !tmp_tensor_t) -> !tmp_tensor_t - %exps, %exps_sum = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%input, %input_max : !in_tensor_t, !tmp_tensor_t) - outs(%exps_empty, %exps_sum_filled : !out_tensor_t, !tmp_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32): - %sub = arith.subf %arg0, %arg1 : f32 - %exp = math.exp %sub : f32 - %add = arith.addf %exp, %arg3 : f32 - linalg.yield %exp, %add : f32, f32 - } -> (!out_tensor_t, !tmp_tensor_t) - - %res_empty = tensor.empty() : !out_tensor_t - %res = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%exps, %exps_sum : !out_tensor_t, !tmp_tensor_t) - outs(%res_empty : !out_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): - // %10 = arith.divf %cst_1, %arg1 : f32 - // %11 = arith.mulf %arg0, %10 : f32 - %div = arith.divf %arg0, %arg1 : f32 - linalg.yield %div : f32 - } -> !out_tensor_t - flow.return %res : !out_tensor_t - } - return %softmax: !out_tensor_t -} diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir deleted file mode 100644 index c73cbe9e18d2..000000000000 --- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir +++ /dev/null @@ -1,148 +0,0 @@ -// RUN: iree-opt %s - -// Codegen -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %input_max_fill, - %input_max, - %exps_sum_fill, - %exp_and_exps_sum, - %div = transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op) - - // Step 1. First level of tiling + fusion parallelizes to blocks. - // ============================================================== - %_, %forall = - transform.structured.tile_using_forall %div tile_sizes [1, 4] - ( mapping = [#gpu.block, #gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () - - // TODO: Merging and fusing merged handles does not work properly atm. 
- transform.structured.fuse_into_containing_op %exp_and_exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - // By default, fusion into scf.forall does not promote captured values - // to shared as this involves a cross-thread dependence analysis. - // Instead, we activate it explicitly post-hoc to promote all the extract_slice - // ops that we find and match the prerequisites - %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall"> - transform.iree.share_forall_operands %forall_with_type - : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall"> - - // Canonicalizations. - %func_op = transform.structured.match ops{["func.func"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - - // Step 2. Second level of tiling + fusion parallelizes to threads. - // ================================================================ - %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %tiled_input_max_fill, - %tiled_input_max, - %tiled_exps_sum_fill, - %tiled_exp_and_exps_sum, - %tiled_div = transform.split_handle %tiled_ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op) - // Leaving the reduction untiled on threadIdx.x makes it sequential on - // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is - // introduced and opportunities for distributing vector ops across warps - // appear. - %reduction_linalg_ops = transform.merge_handles %tiled_input_max, - %tiled_exp_and_exps_sum - : !transform.any_op - transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // Fully parallel ops are tiled and mapped. - %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill, - %tiled_exps_sum_fill, - %tiled_div - : !transform.any_op - transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32] - ( mapping = [#gpu.thread, #gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. 
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op : !transform.any_op
- transform.apply_cse to %func_op : !transform.any_op
-
- // Step 3. Rank-reduce and vectorize.
- // ==================================
- transform.apply_patterns to %func_op {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- transform.structured.vectorize_children_and_apply_patterns %func_op : (!transform.any_op) -> !transform.any_op
-
- // Step 4. Bufferize and drop HAL descriptor from memref ops.
- // =========================================================
- transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
- %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
- %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
- // Step 5. Post-bufferization mapping to blocks and threads.
- // =========================================================
- transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
- transform.iree.map_nested_forall_to_gpu_threads %memref_func
- workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
- // Step 6. Post-bufferization vector distribution with rank-reduction.
- // ===================================================================
- transform.apply_patterns to %memref_func {
- transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
- transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
- transform.apply_patterns.memref.fold_memref_alias_ops
- transform.apply_patterns.vector.cast_away_vector_leading_one_dim
- } : !transform.any_op
- %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
- %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
- : (!transform.any_op) -> !transform.any_op
- transform.iree.vector.warp_distribute %memref_func
- : (!transform.any_op) -> ()
-
-
- // Late canonicalizations.
- %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
- : (!transform.any_op) -> !transform.any_op
- transform.apply_patterns to %func_op_3 {
- transform.apply_patterns.iree.fold_fill_into_pad
- transform.apply_patterns.linalg.tiling_canonicalization
- transform.apply_patterns.scf.for_loop_canonicalization
- transform.apply_patterns.canonicalization
- } : !transform.any_op
- transform.iree.apply_licm %func_op_3 : !transform.any_op
- transform.apply_cse to %func_op_3 : !transform.any_op
-
- // Annotate the exported function as already translated.
- %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
- %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param
- transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
-
- transform.yield
- }
-} // module
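
For reference, every deleted CUDA test above drives codegen through the same mechanism: a transform.named_sequence named @codegen in a spec file, selected at compile time via --iree-codegen-transform-dialect-library=<spec file>.mlir@codegen. The skeleton below is a minimal sketch of that shape, assembled only from ops that appear in the deleted specs; the file name minimal_codegen_spec.mlir and the trivial body are illustrative placeholders, not any file in the repository.

// Hypothetical minimal spec, invoked as:
//   --iree-codegen-transform-dialect-library=minimal_codegen_spec.mlir@codegen
module attributes { transform.with_named_sequence } {
  transform.named_sequence @codegen(
      %variant_op: !transform.any_op {transform.readonly}) {
    // Match the dispatch function and run generic cleanups; a real spec would
    // also tile, vectorize, bufferize and distribute, as the deleted files do.
    %func = transform.structured.match ops{["func.func"]} in %variant_op
      : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %func {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    transform.apply_cse to %func : !transform.any_op
    transform.yield
  }
} // module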