From 694dd3675fb38ef6f85bd7f4a09a0e4cdc402de1 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Wed, 10 Jul 2024 19:10:01 -0400 Subject: [PATCH] Drop unmaintained transform dialect tests The tests in `tests/transform_dialect` have been unmaintained (and mostly disabled) for a while. Most of the tests were misplaced anyway, mixing lit tests in with e2e tests, and much of the relevant functionality is now tested upstream or has evolved into different lit tests in Codegen. This drops the entire `cuda` directory, and drops all `cpu` tests except one which is testing a transform dialect library call, which is still a relevant e2e test. --- tests/transform_dialect/cpu/BUILD.bazel | 8 - .../cpu/contraction-packing-and-dispatch.mlir | 59 ------- .../cpu/contraction-packing.mlir | 153 ---------------- .../cpu/eltwise_reduction_eltwise.mlir | 71 -------- .../cpu/fold_tensor_slice_into_transfer.mlir | 111 ------------ tests/transform_dialect/cpu/matmul.mlir | 21 --- .../cpu/matmul_codegen_default_spec.mlir | 32 ---- tests/transform_dialect/cuda/BUILD.bazel | 113 ------------ tests/transform_dialect/cuda/CMakeLists.txt | 78 --------- .../cuda/double_mma_layout_analysis.mlir | 26 --- ...uble_mma_layout_analysis_codegen_spec.mlir | 69 -------- ...ble_mma_layout_analysis_dispatch_spec.mlir | 23 --- .../cuda/eltwise_reduction.mlir | 39 ----- .../cuda/eltwise_reduction_eltwise.mlir | 51 ------ .../cuda/mma_elemwise_layout_analysis.mlir | 29 ---- ...elemwise_layout_analysis_codegen_spec.mlir | 65 ------- .../cuda/mma_reduction_layout_analysis.mlir | 38 ---- ...eduction_layout_analysis_codegen_spec.mlir | 68 -------- ...duction_layout_analysis_dispatch_spec.mlir | 23 --- .../cuda/mma_using_layout_analysis.mlir | 20 --- ...ma_using_layout_analysis_codegen_spec.mlir | 72 -------- tests/transform_dialect/cuda/reduction.mlir | 33 ---- .../cuda/reduction_codegen_spec.mlir | 125 ------------- .../cuda/reduction_eltwise.mlir | 45 ----- .../cuda/reduction_eltwise_codegen_spec.mlir | 164 ------------------ .../transform_dialect/cuda/reduction_v2.mlir | 33 ---- .../cuda/reduction_v2_codegen_spec.mlir | 113 ------------ .../cuda/reduction_v2_uneven.mlir | 29 ---- tests/transform_dialect/cuda/softmax.mlir | 84 --------- .../cuda/softmax_codegen_spec.mlir | 119 ------------- .../cuda/softmax_dispatch_spec.mlir | 28 --- .../cuda/softmax_partial.mlir | 46 ----- .../cuda/softmax_partial_codegen_spec.mlir | 102 ----------- tests/transform_dialect/cuda/softmax_v2.mlir | 76 -------- .../cuda/softmax_v2_codegen_spec.mlir | 148 ---------------- 35 files changed, 2314 deletions(-) delete mode 100644 tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir delete mode 100644 tests/transform_dialect/cpu/contraction-packing.mlir delete mode 100644 tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir delete mode 100644 tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir delete mode 100644 tests/transform_dialect/cpu/matmul.mlir delete mode 100644 tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir delete mode 100644 tests/transform_dialect/cuda/BUILD.bazel delete mode 100644 tests/transform_dialect/cuda/CMakeLists.txt delete mode 100644 tests/transform_dialect/cuda/double_mma_layout_analysis.mlir delete mode 100644 tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir delete mode 100644 tests/transform_dialect/cuda/eltwise_reduction.mlir delete mode 100644 tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir delete mode 100644 tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir delete mode 100644 tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir delete mode 100644 tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir delete mode 100644 tests/transform_dialect/cuda/mma_using_layout_analysis.mlir delete mode 100644 tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/reduction.mlir delete mode 100644 tests/transform_dialect/cuda/reduction_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/reduction_eltwise.mlir delete mode 100644 tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/reduction_v2.mlir delete mode 100644 tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/reduction_v2_uneven.mlir delete mode 100644 tests/transform_dialect/cuda/softmax.mlir delete mode 100644 tests/transform_dialect/cuda/softmax_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/softmax_dispatch_spec.mlir delete mode 100644 tests/transform_dialect/cuda/softmax_partial.mlir delete mode 100644 tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir delete mode 100644 tests/transform_dialect/cuda/softmax_v2.mlir delete mode 100644 tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir diff --git a/tests/transform_dialect/cpu/BUILD.bazel b/tests/transform_dialect/cpu/BUILD.bazel index 669933c9f2af..fc81794397de 100644 --- a/tests/transform_dialect/cpu/BUILD.bazel +++ b/tests/transform_dialect/cpu/BUILD.bazel @@ -14,20 +14,12 @@ package( iree_lit_test_suite( name = "lit", srcs = [ - "contraction-packing.mlir", - "contraction-packing-and-dispatch.mlir", - # DISABLED: incorrectly assuming default flag values. - # "eltwise_reduction_eltwise.mlir", - "fold_tensor_slice_into_transfer.mlir", - # DISABLED: incorrectly assuming default flag values. - # "matmul.mlir", "matmul_library_call.mlir", ], cfg = "//tests:lit.cfg.py", # transform dialect spec files are MLIR files that specify a transformation, # they need to be included as data. data = [ - "matmul_codegen_default_spec.mlir", "transform_library.mlir", ], tags = [ diff --git a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir deleted file mode 100644 index bb0c56a24213..000000000000 --- a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir +++ /dev/null @@ -1,59 +0,0 @@ - -// Preprocessing with generalized packing. -// -// RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \ -// RUN: iree-compile - --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-opt-data-tiling=false \ -// RUN: --compile-to=executable-configurations | \ -// RUN: FileCheck %s - -!a_tensor_t = tensor<1234x567xf32> -!b_tensor_t = tensor<567x890xf32> -!c_tensor_t = tensor<1234x890xf32> - -// Note: the normalization in these maps is gone due to InterchangeGenericOps. -// When using generalized packing, it would be better to drop that pass. - -// CHECK-LABEL: func.func @matmul_dispatch_0 -// CHECK: tensor.empty() : tensor<155x18x8x32xf32> -// CHECK: tensor.pack - -// CHECK-LABEL: func.func @matmul_dispatch_1 -// CHECK: tensor.empty() : tensor<18x56x16x32xf32> -// CHECK: tensor.pack - -// CHECK-LABEL: func.func @matmul_dispatch_2 -// CHECK: tensor.empty() : tensor<155x56x8x16xf32> -// CHECK: tensor.pack - -// CHECK-LABEL: func.func @matmul_dispatch_3 -func.func public @matmul(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t { - %rhs = arith.constant dense<0.1> : !b_tensor_t - %c0 = util.optimization_barrier %rhs : !b_tensor_t - // CHECK-NOT: pack - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<155x56x8x16xf32>) - - %0 = linalg.matmul - ins(%arg0, %c0: !a_tensor_t, !b_tensor_t) - outs(%arg2: !c_tensor_t) -> !c_tensor_t - return %0 : !c_tensor_t -} - -// CHECK-LABEL: func.func @matmul_dispatch_4 -// CHECK: tensor.unpack -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %matmul = transform.structured.match interface{LinalgOp} in %module_op - : (!transform.any_op) -> (!transform.any_op) - - transform.structured.pack_greedily %matmul - matmul_packed_sizes = [8, 16, 32] - matmul_inner_dims_order = [0, 1, 2] - : (!transform.any_op) -> !transform.op<"linalg.generic"> - transform.yield - } -} // module diff --git a/tests/transform_dialect/cpu/contraction-packing.mlir b/tests/transform_dialect/cpu/contraction-packing.mlir deleted file mode 100644 index 8c67a38295e9..000000000000 --- a/tests/transform_dialect/cpu/contraction-packing.mlir +++ /dev/null @@ -1,153 +0,0 @@ - -// Preprocessing with generalized packing. -// -// RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \ -// RUN: FileCheck %s - -!a_tensor_t = tensor<1234x567xf32> -!at_tensor_t = tensor<567x1234xf32> -!b_tensor_t = tensor<567x890xf32> -!bt_tensor_t = tensor<890x567xf32> -!c_tensor_t = tensor<1234x890xf32> -!ct_tensor_t = tensor<890x1234xf32> - -// CHECK-DAG: #[[$map_lhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> -// CHECK-DAG: #[[$map_rhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d4, d5)> -// CHECK-DAG: #[[$map_res:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> -// CHECK-DAG: #[[$map_tlhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> -// CHECK-DAG: #[[$map_trhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)> -// CHECK-DAG: #[[$map_tres:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)> - -// CHECK-LABEL: func.func @matmul_nnn -func.func @matmul_nnn(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t { - %c0 = arith.constant dense<0.1> : !b_tensor_t - - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_res]]] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<155x56x8x16xf32>) - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - %0 = linalg.matmul - ins(%arg0, %c0: !a_tensor_t, !b_tensor_t) - outs(%arg2: !c_tensor_t) -> !c_tensor_t - return %0 : !c_tensor_t -} - -#matmul_tnn_trait = { - indexing_maps = [ - affine_map<(m, n, k) -> (k, m)>, - affine_map<(m, n, k) -> (k, n)>, - affine_map<(m, n, k) -> (m, n)> - ], - iterator_types = ["parallel", "parallel", "reduction"] -} - -// CHECK-LABEL: func.func @matmul_tnn -func.func @matmul_tnn(%arg0: !at_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t { - %c0 = arith.constant dense<0.1> : !b_tensor_t - - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [#[[$map_tlhs]], #[[$map_rhs]], #[[$map_res]]] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<18x155x8x32xf32>, tensor<18x56x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<155x56x8x16xf32>) - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - %0 = linalg.generic #matmul_tnn_trait - ins(%arg0, %c0: !at_tensor_t, !b_tensor_t) - outs(%arg2: !c_tensor_t) { - ^bb(%a: f32, %b: f32, %c: f32) : - %d = arith.mulf %a, %b: f32 - %e = arith.addf %c, %d: f32 - linalg.yield %e : f32 - } -> !c_tensor_t - return %0 : !c_tensor_t -} - -#matmul_ntn_trait = { - indexing_maps = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> - ], - iterator_types = ["parallel", "parallel", "reduction"] -} - -// CHECK-LABEL: func.func @matmul_ntn -func.func @matmul_ntn(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t { - %c0 = arith.constant dense<0.1> : !bt_tensor_t - - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [#[[$map_lhs]], #[[$map_trhs]], #[[$map_res]]] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<56x18x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<155x56x8x16xf32>) - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16] - %0 = linalg.generic #matmul_ntn_trait - ins(%arg0, %c0: !a_tensor_t, !bt_tensor_t) - outs(%arg2: !c_tensor_t) { - ^bb(%a: f32, %b: f32, %c: f32) : - %d = arith.mulf %a, %b: f32 - %e = arith.addf %c, %d: f32 - linalg.yield %e : f32 - } -> !c_tensor_t - return %0 : !c_tensor_t -} - -#matmul_nnt_trait = { - indexing_maps = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (k, n)>, - affine_map<(m, n, k) -> (n, m)> - ], - iterator_types = ["parallel", "parallel", "reduction"] -} - -// CHECK-LABEL: func.func @matmul_nnt -func.func @matmul_nnt(%arg0: !a_tensor_t, %arg2: !ct_tensor_t) -> !ct_tensor_t { - %c0 = arith.constant dense<0.1> : !b_tensor_t - - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32] - // CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 16] - // CHECK: linalg.generic - // CHECK-SAME: indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_tres]]] - // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} - // CHECK-SAME: ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>) - // CHECK-SAME: outs(%{{.*}} : tensor<56x155x8x16xf32>) - // CHECK: tensor.unpack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 16] - %0 = linalg.generic #matmul_nnt_trait - ins(%arg0, %c0: !a_tensor_t, !b_tensor_t) - outs(%arg2: !ct_tensor_t) { - ^bb(%a: f32, %b: f32, %c: f32) : - %d = arith.mulf %a, %b: f32 - %e = arith.addf %c, %d: f32 - linalg.yield %e : f32 - } -> !ct_tensor_t - return %0 : !ct_tensor_t -} - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %matmul = transform.structured.match interface{LinalgOp} in %module_op - : (!transform.any_op) -> (!transform.any_op) - - // Generalized packing rewrite extracts a gemm from any linalg op that contains - // one. This acts as a powerful normalization step: after this point, we have a - // gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor - // dimensions. - transform.structured.pack_greedily %matmul - matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2] - : (!transform.any_op) -> !transform.op<"linalg.generic"> - transform.yield - } -} // module diff --git a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir deleted file mode 100644 index 0e61ca71cf31..000000000000 --- a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir +++ /dev/null @@ -1,71 +0,0 @@ -!in_tensor_t = tensor<32x256xf32> -!out_tensor_t = tensor<32xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = tensor.empty() : !in_tensor_t - %3 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%arg : !in_tensor_t) outs(%2 : !in_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg3 : f32 - %5 = arith.addf %4, %4 : f32 - linalg.yield %5 : f32 - } -> !in_tensor_t - - %6 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%3 : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg4 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - %7 = tensor.empty() : !out_tensor_t - %8 = linalg.generic { - indexing_maps = [affine_map<(d0) -> (d0)>, - affine_map<(d0) -> (d0)>], - iterator_types = ["parallel"]} - ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = math.sqrt %arg3 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - - return %8 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-opt-data-tiling=false \ -// RUN: --compile-to=executable-configurations | \ -// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs,iree-llvmcpu-select-lowering-strategy,iree-llvmcpu-lower-executable-target)))' \ -// RUN: --iree-llvmcpu-enable-transform-dialect-jit | \ -// RUN: FileCheck %s - -// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-opt-data-tiling=false \ -// RUN: --iree-llvmcpu-enable-transform-dialect-jit | \ -// RUN: iree-run-module --module=- --function=reduce --device=local-task --input="32x256xf32=1" | \ -// RUN: FileCheck %s --check-prefix=EXEC - -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index -// CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} -> (vector<8xf32>) { -// CHECK: arith.addf %{{.*}} : vector<8x16xf32> -// CHECK-COUNT-16: vector.extract %{{.*}} : vector<8xf32> from vector<16x8xf32>{{[[:space:]].*}}arith.addf %{{.*}} : vector<8xf32> -// CHECK: scf.yield %{{.*}} : vector<8xf32> -// CHECK: } -// CHECK: math.sqrt %{{.*}} : vector<8xf32> -// CHECK: vector.store %{{.*}} : memref<8xf32, strided<[1], offset: ?>, #hal.descriptor_type>, vector<8xf32> - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 32xf32=32 32 32 32 32 32 32 32 diff --git a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir b/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir deleted file mode 100644 index 435c25f434f1..000000000000 --- a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir +++ /dev/null @@ -1,111 +0,0 @@ -// RUN: iree-opt --iree-transform-dialect-interpreter %s | FileCheck %s - -// This transform was removed from MLIR by https://reviews.llvm.org/D154932 and -// added to IREE in https://github.com/iree-org/iree/pull/14373, as a workaround -// for other patterns being sensitive to these exact transforms. - -// CHECK-LABEL: func @transfer_read_of_extract_slice( -// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index -// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index -// CHECK: %[[add:.*]] = arith.addi %[[s1]], %[[c4]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> -// CHECK: return %[[r]] -func.func @transfer_read_of_extract_slice(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.0 : f32 - %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<10x?xf32>, vector<5x6xf32> - return %1 : vector<5x6xf32> -} - -// CHECK-LABEL: func @transfer_read_of_extract_slice_1d( -// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index -// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK-DAG: %[[c8:.*]] = arith.constant 8 : index -// CHECK: %[[add:.*]] = arith.addi %[[s1]], %[[c4]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true]} : tensor, vector<6xf32> -// CHECK: return %[[r]] -func.func @transfer_read_of_extract_slice_1d(%t : tensor, %s1 : index, %s2 : index) -> vector<6xf32> { - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.0 : f32 - %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor to tensor<10x?xf32> - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true]} : tensor<10x?xf32>, vector<6xf32> - return %1 : vector<6xf32> -} - -// CHECK-LABEL: func @transfer_read_of_extract_slice_rank_reducing( -// CHECK-SAME: %[[t:.*]]: tensor, %[[s1:.*]]: index, %[[s2:.*]]: index -// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[c5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index -// CHECK: %[[add:.*]] = arith.addi %[[s1]], %[[c3]] -// CHECK: %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true]} : tensor, vector<5x6xf32> -// CHECK: return %[[r]] -func.func @transfer_read_of_extract_slice_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.0 : f32 - %0 = tensor.extract_slice %t[5, %s1, 6] [1, %s2, 12] [1, 1, 1] : tensor to tensor - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor, vector<5x6xf32> - return %1 : vector<5x6xf32> -} - -// CHECK-LABEL: func @transfer_read_of_extract_slice_illegal_rank_reducing( -// CHECK: extract_slice -// CHECK: vector.transfer_read -func.func @transfer_read_of_extract_slice_illegal_rank_reducing(%t : tensor, %s1 : index, %s2 : index) -> vector<5x6xf32> { - %c3 = arith.constant 3 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.0 : f32 - %0 = tensor.extract_slice %t[5, %s1, 6] [%s2, 1, 12] [1, 1, 1] : tensor to tensor - %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor, vector<5x6xf32> - return %1 : vector<5x6xf32> -} - -// CHECK-LABEL: func @insert_slice_of_transfer_write( -// CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index -// CHECK: %[[c3:.*]] = arith.constant 3 : index -// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor -// CHECK: return %[[r]] -func.func @insert_slice_of_transfer_write(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { - %c0 = arith.constant 0 : index - %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> - %1 = tensor.insert_slice %0 into %t1[3, %s] [5, 6] [1, 1] : tensor<5x6xf32> into tensor - return %1 : tensor -} - -// CHECK-LABEL: func @insert_slice_of_transfer_write_illegal_rank_extending( -// CHECK: vector.transfer_write -// CHECK: insert_slice -func.func @insert_slice_of_transfer_write_illegal_rank_extending(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { - %c0 = arith.constant 0 : index - %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> - %1 = tensor.insert_slice %0 into %t1[4, 3, %s] [5, 1, 6] [1, 1, 1] : tensor<5x6xf32> into tensor - return %1 : tensor -} - -// CHECK-LABEL: func @insert_slice_of_transfer_write_rank_extending( -// CHECK-SAME: %[[t1:.*]]: tensor, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index -// CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index -// CHECK: %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor -// CHECK: return %[[r]] -func.func @insert_slice_of_transfer_write_rank_extending(%t1 : tensor, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor { - %c0 = arith.constant 0 : index - %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32> - %1 = tensor.insert_slice %0 into %t1[4, 3, %s] [1, 5, 6] [1, 1, 1] : tensor<5x6xf32> into tensor - return %1 : tensor -} - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { - %top_level_func = transform.structured.match ops{["func.func"]} in %module_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %top_level_func { - transform.apply_patterns.iree.fold_tensor_slice_into_transfer - } : !transform.any_op - transform.yield - } -} diff --git a/tests/transform_dialect/cpu/matmul.mlir b/tests/transform_dialect/cpu/matmul.mlir deleted file mode 100644 index 39a2074f6043..000000000000 --- a/tests/transform_dialect/cpu/matmul.mlir +++ /dev/null @@ -1,21 +0,0 @@ -!A_size = tensor<3x5xf32> -!B_size = tensor<5x3xf32> -!C_size = tensor<3x3xf32> - -func.func @matmul_static( - %A : !A_size, %B : !B_size, %C : !C_size) -> !C_size { - %0 = linalg.matmul ins(%A, %B : !A_size, !B_size) - outs(%C : !C_size) -> !C_size - return %0 : !C_size -} - -// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \ -// RUN: --iree-opt-data-tiling=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=matmul_static \ -// RUN: --input="3x5xf32=1" \ -// RUN: --input="5x3xf32=2" \ -// RUN: --input="3x3xf32=42" | \ -// RUN: FileCheck %s --check-prefixes=EXEC - -// EXEC: 3x3xf32=[52 52 52][52 52 52][52 52 52] diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir deleted file mode 100644 index 1fa31050e506..000000000000 --- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 1. Tile to forall with tile_sizes [2]. - // =================================================== - %tiled_generic, %forall = - transform.structured.tile_using_forall %matmul tile_sizes [2] - ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall - : (!transform.any_op) -> () - - // Step 2. Bufferize and drop HAL decriptor from memref ops. - // ========================================================= - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 3. Post-bufferization mapping workgroup. - // ========================================================= - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/BUILD.bazel b/tests/transform_dialect/cuda/BUILD.bazel deleted file mode 100644 index a35e0dbe609f..000000000000 --- a/tests/transform_dialect/cuda/BUILD.bazel +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2022 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content") -load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite") - -package( - features = ["layering_check"], - licenses = ["notice"], # Apache 2.0 -) - -iree_cmake_extra_content( - content = """ -if(NOT IREE_HAL_DRIVER_CUDA) - return() -endif() - -if(NOT IREE_HOST_BIN_DIR AND NOT IREE_TARGET_BACKEND_CUDA) - return() -endif() -""", - inline = True, -) - -iree_lit_test_suite( - name = "lit", - srcs = [ - # TODO(#15892): reductions have flakes and need to be triaged. - # "reduction.mlir", - # "reduction_eltwise.mlir", - # "reduction_v2.mlir", - # "reduction_v2_uneven.mlir", - # "softmax.mlir", - # "softmax_v2.mlir", - # First few ops of softmax only, acts as a proxy example. - # "softmax_partial.mlir", - ], - cfg = "//tests:lit.cfg.py", - # transform dialect spec files are MLIR files that specify a transformation, - # they need to be included as data. - data = [ - "mma_reduction_layout_analysis_codegen_spec.mlir", - "mma_reduction_layout_analysis_dispatch_spec.mlir", - "mma_using_layout_analysis_codegen_spec.mlir", - "reduction_codegen_spec.mlir", - "reduction_eltwise_codegen_spec.mlir", - "reduction_v2_codegen_spec.mlir", - "softmax_codegen_spec.mlir", - "softmax_v2_codegen_spec.mlir", - # - # FIXME: This must be used with the custom dispatch region formation - # because IREE's does not fuse the 6 ops softmax version even with - # --iree-flow-fuse-multi-use. - # - "softmax_dispatch_spec.mlir", - # First few ops of softmax only, acts as a proxy example. - "softmax_partial_codegen_spec.mlir", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-nvidia", - "driver=cuda", - ], - tools = [ - "//tools:iree-compile", - "//tools:iree-opt", - "//tools:iree-run-module", - "@llvm-project//llvm:FileCheck", - ], -) - -iree_lit_test_suite( - name = "sm80_lit", - srcs = [ - "double_mma_layout_analysis.mlir", - "mma_elemwise_layout_analysis.mlir", - "mma_reduction_layout_analysis.mlir", - "mma_using_layout_analysis.mlir", - ], - cfg = "//tests:lit.cfg.py", - # transform dialect spec files are MLIR files that specify a transformation, - # they need to be included as data. - data = [ - "double_mma_layout_analysis_codegen_spec.mlir", - "double_mma_layout_analysis_dispatch_spec.mlir", - "mma_elemwise_layout_analysis_codegen_spec.mlir", - "mma_reduction_layout_analysis_codegen_spec.mlir", - "mma_reduction_layout_analysis_dispatch_spec.mlir", - "mma_using_layout_analysis_codegen_spec.mlir", - ], - tags = [ - # CUDA cuInit fails with sanitizer on. - "noasan", - "nomsan", - "notsan", - "noubsan", - "requires-gpu-sm80", - "driver=cuda", - ], - tools = [ - "//tools:iree-compile", - "//tools:iree-opt", - "//tools:iree-run-module", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tests/transform_dialect/cuda/CMakeLists.txt b/tests/transform_dialect/cuda/CMakeLists.txt deleted file mode 100644 index 7534bc8bfd46..000000000000 --- a/tests/transform_dialect/cuda/CMakeLists.txt +++ /dev/null @@ -1,78 +0,0 @@ -################################################################################ -# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from # -# tests/transform_dialect/cuda/BUILD.bazel # -# # -# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary # -# CMake-only content. # -# # -# To disable autogeneration for this file entirely, delete this header. # -################################################################################ - -iree_add_all_subdirs() - -if(NOT IREE_HAL_DRIVER_CUDA) - return() -endif() - -if(NOT IREE_HOST_BIN_DIR AND NOT IREE_TARGET_BACKEND_CUDA) - return() -endif() - -iree_lit_test_suite( - NAME - lit - TOOLS - FileCheck - iree-compile - iree-opt - iree-run-module - DATA - mma_reduction_layout_analysis_codegen_spec.mlir - mma_reduction_layout_analysis_dispatch_spec.mlir - mma_using_layout_analysis_codegen_spec.mlir - reduction_codegen_spec.mlir - reduction_eltwise_codegen_spec.mlir - reduction_v2_codegen_spec.mlir - softmax_codegen_spec.mlir - softmax_dispatch_spec.mlir - softmax_partial_codegen_spec.mlir - softmax_v2_codegen_spec.mlir - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-nvidia" - "driver=cuda" -) - -iree_lit_test_suite( - NAME - sm80_lit - SRCS - "double_mma_layout_analysis.mlir" - "mma_elemwise_layout_analysis.mlir" - "mma_reduction_layout_analysis.mlir" - "mma_using_layout_analysis.mlir" - TOOLS - FileCheck - iree-compile - iree-opt - iree-run-module - DATA - double_mma_layout_analysis_codegen_spec.mlir - double_mma_layout_analysis_dispatch_spec.mlir - mma_elemwise_layout_analysis_codegen_spec.mlir - mma_reduction_layout_analysis_codegen_spec.mlir - mma_reduction_layout_analysis_dispatch_spec.mlir - mma_using_layout_analysis_codegen_spec.mlir - LABELS - "noasan" - "nomsan" - "notsan" - "noubsan" - "requires-gpu-sm80" - "driver=cuda" -) - -### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir deleted file mode 100644 index 9293fd4ff0ab..000000000000 --- a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir +++ /dev/null @@ -1,26 +0,0 @@ -func.func @double_matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<16x16xf16>, %second : tensor<16x8xf16>) -> tensor<16x8xf16> { - %c0 = arith.constant 0.0 : f16 - %0 = tensor.empty() : tensor<16x16xf16> - %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x16xf16>) -> tensor<16x16xf16> - %2 = linalg.matmul ins(%lhs, %rhs : tensor<16x16xf16>, tensor<16x16xf16>) - outs(%1 : tensor<16x16xf16>) -> tensor<16x16xf16> - %3 = tensor.empty() : tensor<16x8xf16> - %4 = linalg.fill ins(%c0 : f16) outs(%3 : tensor<16x8xf16>) -> tensor<16x8xf16> - %5 = linalg.matmul ins(%2, %second : tensor<16x16xf16>, tensor<16x8xf16>) - outs(%4 : tensor<16x8xf16>) -> tensor<16x8xf16> - return %5 : tensor<16x8xf16> -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-flow-dispatch-use-transform-dialect=%p/double_mma_layout_analysis_dispatch_spec.mlir \ -// RUN: --iree-codegen-transform-dialect-library=%p/double_mma_layout_analysis_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=double_matmul --device=cuda \ -// RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \ -// RUN: --input="16x16xf16=[[0.175048828125,0.07501220703125,0.199951171875,0.0,0.175048828125,0.125,0.199951171875,0.04998779296875,0.0999755859375,0.175048828125,0.07501220703125,0.04998779296875,0.125,0.125,0.07501220703125,0.2249755859375],[0.024993896484375,0.199951171875,0.0,0.1500244140625,0.175048828125,0.0999755859375,0.175048828125,0.1500244140625,0.2249755859375,0.07501220703125,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0999755859375,0.0999755859375],[0.2249755859375,0.2249755859375,0.125,0.175048828125,0.0,0.07501220703125,0.04998779296875,0.0,0.199951171875,0.1500244140625,0.024993896484375,0.2249755859375,0.024993896484375,0.1500244140625,0.2249755859375,0.199951171875],[0.1500244140625,0.125,0.024993896484375,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.175048828125,0.125,0.175048828125,0.175048828125,0.07501220703125,0.024993896484375,0.125],[0.2249755859375,0.125,0.2249755859375,0.1500244140625,0.0,0.0,0.1500244140625,0.125,0.024993896484375,0.125,0.0,0.024993896484375,0.175048828125,0.175048828125,0.024993896484375,0.125],[0.2249755859375,0.024993896484375,0.04998779296875,0.0,0.0,0.1500244140625,0.07501220703125,0.2249755859375,0.1500244140625,0.024993896484375,0.0,0.0999755859375,0.125,0.1500244140625,0.2249755859375,0.0],[0.125,0.0999755859375,0.0,0.0999755859375,0.199951171875,0.125,0.175048828125,0.175048828125,0.1500244140625,0.2249755859375,0.04998779296875,0.125,0.1500244140625,0.0,0.0,0.0999755859375],[0.125,0.07501220703125,0.175048828125,0.1500244140625,0.175048828125,0.0,0.04998779296875,0.125,0.125,0.024993896484375,0.0999755859375,0.175048828125,0.024993896484375,0.0,0.024993896484375,0.0],[0.2249755859375,0.024993896484375,0.0999755859375,0.04998779296875,0.125,0.07501220703125,0.0999755859375,0.024993896484375,0.125,0.125,0.125,0.024993896484375,0.125,0.04998779296875,0.0999755859375,0.07501220703125],[0.0999755859375,0.175048828125,0.199951171875,0.0999755859375,0.175048828125,0.07501220703125,0.024993896484375,0.125,0.07501220703125,0.0,0.125,0.07501220703125,0.07501220703125,0.0,0.199951171875,0.175048828125],[0.07501220703125,0.0999755859375,0.175048828125,0.07501220703125,0.125,0.1500244140625,0.0,0.0999755859375,0.2249755859375,0.199951171875,0.04998779296875,0.0,0.0,0.1500244140625,0.199951171875,0.2249755859375],[0.024993896484375,0.2249755859375,0.04998779296875,0.1500244140625,0.2249755859375,0.2249755859375,0.175048828125,0.0999755859375,0.024993896484375,0.199951171875,0.125,0.199951171875,0.175048828125,0.2249755859375,0.175048828125,0.0999755859375],[0.125,0.0999755859375,0.04998779296875,0.125,0.199951171875,0.07501220703125,0.199951171875,0.0,0.024993896484375,0.04998779296875,0.0,0.04998779296875,0.04998779296875,0.199951171875,0.1500244140625,0.0999755859375],[0.199951171875,0.0,0.125,0.04998779296875,0.07501220703125,0.175048828125,0.0999755859375,0.175048828125,0.024993896484375,0.07501220703125,0.0,0.1500244140625,0.07501220703125,0.024993896484375,0.07501220703125,0.175048828125],[0.1500244140625,0.125,0.0999755859375,0.175048828125,0.04998779296875,0.0,0.04998779296875,0.1500244140625,0.024993896484375,0.125,0.125,0.175048828125,0.125,0.0999755859375,0.175048828125,0.1500244140625],[0.07501220703125,0.199951171875,0.024993896484375,0.0999755859375,0.175048828125,0.07501220703125,0.1500244140625,0.04998779296875,0.0,0.024993896484375,0.07501220703125,0.07501220703125,0.1500244140625,0.04998779296875,0.2249755859375,0.1500244140625]]" \ -// RUN: --input="16x8xf16=[[0.1500244140625,0.07501220703125,0.1500244140625,0.0,0.199951171875,0.125,0.0,0.175048828125],[0.04998779296875,0.07501220703125,0.04998779296875,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.2249755859375],[0.0,0.07501220703125,0.04998779296875,0.175048828125,0.0999755859375,0.1500244140625,0.04998779296875,0.199951171875],[0.125,0.175048828125,0.04998779296875,0.07501220703125,0.199951171875,0.07501220703125,0.024993896484375,0.1500244140625],[0.175048828125,0.0,0.0,0.0999755859375,0.0999755859375,0.1500244140625,0.07501220703125,0.024993896484375],[0.1500244140625,0.199951171875,0.0999755859375,0.0999755859375,0.125,0.175048828125,0.199951171875,0.0],[0.175048828125,0.0999755859375,0.024993896484375,0.175048828125,0.125,0.07501220703125,0.175048828125,0.175048828125],[0.175048828125,0.175048828125,0.2249755859375,0.125,0.175048828125,0.0,0.04998779296875,0.175048828125],[0.175048828125,0.024993896484375,0.125,0.1500244140625,0.1500244140625,0.07501220703125,0.0,0.04998779296875],[0.125,0.0999755859375,0.024993896484375,0.199951171875,0.175048828125,0.0999755859375,0.04998779296875,0.125],[0.199951171875,0.04998779296875,0.1500244140625,0.0999755859375,0.04998779296875,0.07501220703125,0.199951171875,0.125],[0.1500244140625,0.0,0.125,0.175048828125,0.024993896484375,0.07501220703125,0.199951171875,0.0999755859375],[0.175048828125,0.04998779296875,0.07501220703125,0.125,0.024993896484375,0.2249755859375,0.0,0.0],[0.024993896484375,0.0999755859375,0.1500244140625,0.07501220703125,0.125,0.2249755859375,0.0,0.0],[0.04998779296875,0.125,0.175048828125,0.04998779296875,0.125,0.0999755859375,0.0999755859375,0.04998779296875],[0.125,0.175048828125,0.0,0.2249755859375,0.199951171875,0.175048828125,0.1500244140625,0.1500244140625]]" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 16x8xf16=[0.465332 0.345703 0.341064 0.440674 0.495605 0.424805 0.296875 0.393555][0.420166 0.324707 0.314941 0.414551 0.470215 0.408447 0.268555 0.361084][0.404785 0.305176 0.304688 0.390381 0.438232 0.378174 0.262207 0.349609][0.33252 0.25708 0.235596 0.327393 0.364258 0.320312 0.222168 0.287109][0.326172 0.256592 0.235107 0.332031 0.377686 0.316895 0.204224 0.300537][0.484131 0.361328 0.346436 0.46875 0.525391 0.442871 0.303467 0.425293][0.422852 0.324219 0.311035 0.41626 0.472412 0.398926 0.26709 0.374023][0.447754 0.33252 0.314697 0.439209 0.487305 0.414551 0.285156 0.395508][0.30127 0.23938 0.229736 0.297363 0.343994 0.293701 0.192749 0.272705][0.355225 0.271729 0.270752 0.34668 0.391846 0.334717 0.227539 0.305664][0.375 0.286133 0.267822 0.369141 0.416992 0.348877 0.237549 0.334473][0.454834 0.349365 0.334717 0.44165 0.502441 0.426758 0.292725 0.390381][0.404541 0.302002 0.296631 0.390137 0.437012 0.379639 0.260742 0.34668][0.348877 0.269043 0.263428 0.335205 0.386719 0.33252 0.22522 0.300293][0.297363 0.216431 0.211182 0.283203 0.311523 0.266113 0.194946 0.25708][0.320068 0.242188 0.235107 0.30835 0.349365 0.299072 0.203125 0.275879] diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir deleted file mode 100644 index 49f07d2c963a..000000000000 --- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir +++ /dev/null @@ -1,69 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op) { - - // Step 1. Find the fill and matmul ops - // =========================================================================== - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %fill0, %fill1 = transform.split_handle %fill : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul0, %matmul1 = transform.split_handle %matmul : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 2. Tile the matmul and fuse the fill - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %matmul1 tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - - transform.structured.fuse_into_containing_op %fill1 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %matmul0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Vectorize - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. Bufferize - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op) - - // Step 5. Pre-process the contract and transfer ops to put it in the right form. - // =========================================================================== - transform.apply_patterns to %memref_func { - transform.apply_patterns.iree.prepare_vector_to_mma - } : !transform.any_op - - // Step 6. Post-bufferization vector distribution - // =========================================================================== - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () - - // Step 7. Do layout analysis and lower to mma - // =========================================================================== - %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op) - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir deleted file mode 100644 index 8fad32f799b6..000000000000 --- a/tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%func: !transform.any_op) { - %ops = transform.structured.match ops{["linalg.fill", "linalg.matmul"]} - in %func : (!transform.any_op) -> !transform.any_op - - %fill0, %matmul0, %fill1, %matmul1 = - transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op) - - %region_op = transform.iree.wrap_in_dispatch_region %matmul1 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op - - %non_matmul1 = transform.merge_handles %fill0, %matmul0, %fill1 : !transform.any_op - %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_matmul1 into %region_op : (!transform.any_op, !transform.any_op) -> !transform.any_op - - %empty = transform.structured.match ops{["tensor.empty"]} in %func : (!transform.any_op) -> !transform.any_op - %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2 : (!transform.any_op, !transform.any_op) -> !transform.any_op - transform.iree.region_to_workgroups %region_op_3 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} diff --git a/tests/transform_dialect/cuda/eltwise_reduction.mlir b/tests/transform_dialect/cuda/eltwise_reduction.mlir deleted file mode 100644 index eabf1a0cf739..000000000000 --- a/tests/transform_dialect/cuda/eltwise_reduction.mlir +++ /dev/null @@ -1,39 +0,0 @@ -!in_tensor_t = tensor<8x64xf32> -!out_tensor_t = tensor<8xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = tensor.empty() : !in_tensor_t - %3 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%arg : !in_tensor_t) outs(%2 : !in_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg3 : f32 - %5 = arith.addf %4, %4 : f32 - linalg.yield %5 : f32 - } -> !in_tensor_t - - %6 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%3 : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg4 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - return %6 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 8xf32=256 256 256 256 256 256 256 256 diff --git a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir deleted file mode 100644 index 70aa3322dfd3..000000000000 --- a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir +++ /dev/null @@ -1,51 +0,0 @@ -!in_tensor_t = tensor<8x64xf32> -!out_tensor_t = tensor<8xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = tensor.empty() : !in_tensor_t - %3 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%arg : !in_tensor_t) outs(%2 : !in_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg3 : f32 - %5 = arith.addf %4, %4 : f32 - linalg.yield %5 : f32 - } -> !in_tensor_t - - %6 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%3 : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg4 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - %7 = tensor.empty() : !out_tensor_t - %8 = linalg.generic { - indexing_maps = [affine_map<(d0) -> (d0)>, - affine_map<(d0) -> (d0)>], - iterator_types = ["parallel"]} - ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = math.sqrt %arg3 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - - return %8 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 8xf32=16 16 16 16 16 16 16 16 diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir deleted file mode 100644 index c1b1c2904c44..000000000000 --- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir +++ /dev/null @@ -1,29 +0,0 @@ -#map = affine_map<(d0, d1) -> (d0, d1)> -func.func @matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<8x16xf16>, %bias : tensor<16x8xf16>) -> tensor<16x8xf16> { - %c0 = arith.constant 0.0 : f16 - %0 = tensor.empty() : tensor<16x8xf16> - %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x8xf16>) -> tensor<16x8xf16> - %2 = linalg.matmul_transpose_b ins(%lhs, %rhs : tensor<16x16xf16>, tensor<8x16xf16>) - outs(%1 : tensor<16x8xf16>) -> tensor<16x8xf16> - %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types=["parallel", "parallel"]} - ins(%2, %bias : tensor<16x8xf16>, tensor<16x8xf16>) outs(%0 : tensor<16x8xf16>) { - ^bb0(%arg0: f16, %arg1: f16, %arg2: f16): - %10 = arith.subf %arg0, %arg1 : f16 - %11 = math.exp %10 : f16 - linalg.yield %11 : f16 - } -> tensor<16x8xf16> - return %3 : tensor<16x8xf16> -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/mma_elemwise_layout_analysis_codegen_spec.mlir@__transform_main | \ -// RUN: iree-run-module --module=- --function=matmul --device=cuda \ -// RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \ -// RUN: --input="8x16xf16=[[0.175049 0.0999756 0.0249939 0.224976 0.224976 0.199951 0.150024 0.0499878 0.224976 0.0249939 0.224976 0.150024 0.125 0.150024 0.125 0.125][0.0750122 0.175049 0.199951 0.0750122 0.224976 0.150024 0.125 0.175049 0.125 0.125 0.0249939 0.0249939 0.0999756 0.224976 0.0750122 0.0249939][0.199951 0.0750122 0 0.199951 0.125 0.0249939 0.0249939 0.125 0.224976 0 0.0499878 0 0 0.0499878 0.175049 0.0999756][0 0.0499878 0.150024 0.0999756 0.175049 0.224976 0.0750122 0.175049 0.150024 0.0249939 0 0.0999756 0.0999756 0.125 0.150024 0.175049][0.175049 0.125 0.175049 0.0999756 0 0.0249939 0.125 0.175049 0 0.175049 0 0.125 0.199951 0.150024 0.175049 0.0249939][0.125 0.125 0.0999756 0.224976 0.0750122 0.150024 0.125 0.0750122 0 0.175049 0.150024 0.150024 0.125 0 0 0][0.199951 0.0750122 0.175049 0.0999756 0.0499878 0.224976 0.0750122 0.0249939 0.150024 0.0249939 0.0750122 0.224976 0.175049 0 0.0499878 0.0249939][0.0499878 0.224976 0.150024 0.0999756 0 0.199951 0.150024 0.125 0.125 0.125 0.224976 0 0.175049 0.0999756 0.125 0]]" \ -// RUN: --input="16x8xf16=[[0.0,-0.03173828125,-0.1318359375,-0.044189453125,-0.0655517578125,-0.126220703125,-0.076171875,-0.041259765625],[0.0,-0.0855712890625,-0.157470703125,-0.09619140625,-0.1124267578125,-0.0718994140625,-0.04052734375,-0.0531005859375],[0.0,-0.065673828125,-0.118896484375,-0.0438232421875,-0.1031494140625,-0.1051025390625,-0.06884765625,-0.0750732421875],[0.0,-0.0911865234375,-0.11810302734375,-0.09375,-0.0711669921875,-0.06494140625,-0.083740234375,-0.0755615234375],[0.0,-0.0150146484375,-0.125,-0.064453125,-0.0462646484375,-0.065673828125,-0.064453125,-0.0325927734375],[0.0,-0.026123046875,-0.1287841796875,-0.078125,-0.1043701171875,-0.125,-0.1368408203125,-0.06005859375],[0.0,-0.036865234375,-0.1300048828125,-0.0699462890625,-0.078125,-0.11376953125,-0.088134765625,-0.03369140625],[0.0,-0.06201171875,-0.0894775390625,-0.0594482421875,-0.078857421875,-0.10693359375,-0.0982666015625,-0.0863037109375],[-0.021240234375,0.0,-0.15185546875,-0.036865234375,-0.0380859375,-0.0587158203125,-0.0343017578125,-0.045654296875],[0.0,-0.052490234375,-0.1268310546875,-0.04248046875,-0.0955810546875,-0.0399169921875,-0.029296875,-0.060546875],[0.0,-0.0162353515625,-0.1187744140625,-0.043701171875,-0.079345703125,-0.0924072265625,-0.1112060546875,-0.0599365234375],[0.0,-0.0706787109375,-0.175048828125,-0.054931640625,-0.1119384765625,-0.1356201171875,-0.076904296875,-0.06005859375],[0.0,-0.0662841796875,-0.105712890625,-0.0782470703125,-0.0838623046875,-0.078857421875,-0.061279296875,-0.0494384765625],[0.0,-0.0374755859375,-0.138671875,-0.0374755859375,-0.086181640625,-0.09619140625,-0.05615234375,-0.0318603515625],[0.0,-0.0455322265625,-0.0450439453125,-0.023681640625,-0.0343017578125,-0.07745361328125,-0.09375,-0.05126953125],[0.0,-0.041259765625,-0.11566162109375,-0.0462646484375,-0.061279296875,-0.06689453125,-0.0706787109375,-0.0325927734375]]" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 16x8xf16=[1.34863 1.34863 1.34863 1.34863 1.34863 1.34863 1.34863 1.34863][1.34766 1.34766 1.34766 1.34766 1.34668 1.34766 1.34668 1.34766][1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715][1.27246 1.27246 1.27246 1.27246 1.27246 1.27246 1.27246 1.27246][1.25586 1.25586 1.25586 1.25684 1.25586 1.25586 1.25586 1.25684][1.37598 1.37598 1.37598 1.37598 1.37598 1.37598 1.37598 1.37598][1.33984 1.33984 1.33984 1.33984 1.33984 1.33984 1.33984 1.33984][1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715][1.24023 1.24023 1.24023 1.24023 1.24023 1.24023 1.23926 1.24023][1.26855 1.26855 1.26855 1.26855 1.26855 1.26855 1.26855 1.26855][1.28516 1.28516 1.28516 1.28516 1.28516 1.28516 1.28516 1.28516][1.36523 1.36523 1.36523 1.36523 1.36523 1.36523 1.36523 1.36523][1.31445 1.31445 1.31445 1.31445 1.31445 1.31445 1.31445 1.31445][1.28027 1.28027 1.28027 1.28027 1.28027 1.28027 1.28027 1.28027][1.21387 1.21387 1.21387 1.21387 1.21387 1.21387 1.21387 1.21387][1.24902 1.24902 1.24902 1.24902 1.24902 1.24902 1.24902 1.24902] diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir deleted file mode 100644 index 138fc5899ae8..000000000000 --- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir +++ /dev/null @@ -1,65 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main( - %variant_op: !transform.any_op) { - // Step 1. Find the fill, matmul and generic ops - // =========================================================================== - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %matmul = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %generic = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 2. Tile the generic and fuse the fill and matmul - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - - transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Vectorize - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. Bufferize - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op) - - // Step 6. Post-bufferization vector distribution - // =========================================================================== - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %memref_func - workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () - - // Step 7. Do layout analysis and lower to mma - // =========================================================================== - %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op) - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir deleted file mode 100644 index 198b513aeeda..000000000000 --- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir +++ /dev/null @@ -1,38 +0,0 @@ -#map = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d0)> -func.func @matmul_reduction(%lhs : tensor<16x16xf16>, %rhs : tensor<16x16xf16>) -> tensor<16x16xf16> { - %c0 = arith.constant 0.0 : f16 - %c1 = arith.constant -1.0e+04 : f16 - %acc = tensor.empty() : tensor<16xf16> - %init = linalg.fill ins(%c1 : f16) outs(%acc : tensor<16xf16>) -> tensor<16xf16> - %0 = tensor.empty() : tensor<16x16xf16> - %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x16xf16>) -> tensor<16x16xf16> - %2 = linalg.matmul_transpose_b ins(%lhs, %rhs : tensor<16x16xf16>, tensor<16x16xf16>) - outs(%1 : tensor<16x16xf16>) -> tensor<16x16xf16> - %6 = linalg.generic {indexing_maps - = [#map, #map1], iterator_types = ["parallel", "reduction"]} - ins(%2 : tensor<16x16xf16>) outs(%init : tensor<16xf16>) { - ^bb0(%in: f16, %out: f16): - %20 = arith.maximumf %in, %out : f16 - linalg.yield %20 : f16 - } -> tensor<16xf16> - %8 = linalg.generic {indexing_maps = [#map1, #map], iterator_types=["parallel", "parallel"]} - ins(%6 : tensor<16xf16>) outs(%0 : tensor<16x16xf16>) { - ^bb0(%in: f16, %out: f16): - linalg.yield %in : f16 - } -> tensor<16x16xf16> - return %8 : tensor<16x16xf16> -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-flow-dispatch-use-transform-dialect=%p/mma_reduction_layout_analysis_dispatch_spec.mlir \ -// RUN: --iree-codegen-transform-dialect-library=%p/mma_reduction_layout_analysis_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=matmul_reduction --device=cuda \ -// RUN: --input="16x16xf16=[[3.0,2.0,2.5,4.5,1.5,4.0,2.0,2.5,4.0,4.0,1.5,0.5,2.0,3.0,0.5,2.0],[2.5,2.5,0.5,3.5,0.0,2.5,3.5,1.0,0.5,0.0,3.0,4.5,0.5,0.5,0.0,3.5],[4.5,3.0,4.0,2.5,1.0,0.5,0.0,4.5,0.0,2.5,3.5,0.0,2.0,4.5,1.5,4.5],[0.0,2.0,1.5,0.0,2.0,1.5,3.0,2.0,2.0,4.0,4.0,2.5,0.0,3.0,2.0,0.5],[0.5,3.5,3.0,2.5,0.0,2.5,3.0,3.0,4.5,2.0,2.0,1.0,2.0,1.0,3.5,2.0],[0.0,4.5,2.0,4.0,2.5,2.5,1.5,1.5,1.5,3.0,3.0,0.0,2.5,0.5,2.0,2.0],[3.5,4.0,3.5,1.5,2.0,0.5,1.0,2.5,4.0,3.5,0.0,3.0,0.0,1.5,4.5,0.0],[4.5,3.5,1.0,4.5,0.5,0.0,1.5,4.5,1.5,3.5,3.0,2.5,0.0,0.5,0.0,4.0],[2.0,3.0,0.5,2.0,1.5,0.5,2.0,2.5,2.5,4.0,2.0,4.5,4.0,0.0,2.0,3.0],[2.5,4.0,4.0,3.0,2.0,2.0,4.5,0.5,4.5,1.0,2.0,0.0,4.5,1.0,3.0,0.5],[4.0,1.5,3.5,3.0,2.5,4.5,1.0,3.5,3.0,2.5,2.5,2.0,2.0,4.5,1.5,2.5],[3.0,3.0,0.0,2.5,1.0,3.0,0.0,1.5,1.5,2.5,0.5,1.0,3.0,3.5,1.5,1.5],[0.0,4.5,0.5,1.5,0.5,4.0,3.5,4.0,4.0,0.0,0.5,1.0,4.5,1.5,0.0,3.5],[2.5,2.0,2.5,1.5,3.0,0.0,2.0,1.0,2.5,4.0,0.0,4.0,4.0,1.5,3.0,2.5],[3.0,0.0,4.0,4.0,2.0,0.5,1.0,3.5,4.0,2.5,4.0,4.5,0.0,3.0,1.5,2.5],[0.5,0.5,2.5,4.0,1.0,2.5,0.5,4.5,2.0,3.0,1.5,4.5,1.5,4.5,0.5,1.5]]" \ -// RUN: --input="16x16xf16=[[3.5,3.0,4.5,3.0,3.0,0.0,2.0,2.5,2.0,0.0,4.5,2.5,0.5,0.0,4.0,3.5],[0.0,0.5,2.0,4.5,0.0,4.0,1.5,3.5,0.5,2.5,3.5,1.5,3.5,4.5,4.0,3.0],[3.0,3.5,2.5,1.5,1.5,1.5,0.5,4.5,0.0,3.5,4.0,0.0,0.0,2.0,0.5,1.0],[1.5,4.0,3.5,3.5,0.0,0.0,0.0,2.0,3.0,1.5,0.0,3.0,0.0,2.5,2.0,3.0],[3.5,4.0,2.5,1.5,3.0,2.0,3.0,4.5,1.5,3.0,2.0,3.5,2.5,4.5,0.5,3.5],[0.0,0.0,0.0,0.5,1.0,2.5,1.5,1.0,2.5,1.5,0.0,1.5,1.5,2.0,4.5,2.5],[4.0,1.5,3.0,2.5,2.5,3.5,2.0,4.0,1.5,2.5,0.5,4.0,1.0,4.5,3.5,0.0],[1.0,2.0,4.0,4.5,4.5,3.5,0.0,1.0,4.5,3.5,2.0,3.0,0.5,4.0,3.5,1.5],[1.0,0.0,2.5,4.5,0.0,2.0,0.0,2.5,3.0,4.0,2.5,0.5,3.5,0.0,3.5,1.0],[0.0,3.5,4.0,0.0,0.0,4.5,1.0,3.5,1.5,3.0,2.0,1.0,0.5,0.5,2.0,0.0],[1.5,0.0,4.5,2.0,4.5,4.5,3.5,3.0,2.5,4.5,0.5,0.5,0.0,4.5,0.0,4.0],[4.5,3.5,4.0,4.0,1.5,4.0,1.0,4.0,2.5,0.5,4.5,3.5,3.5,0.5,4.5,3.0],[0.0,3.0,2.5,1.0,1.5,2.0,1.0,1.5,4.0,2.5,3.5,1.0,3.5,2.5,3.5,4.5],[1.5,4.5,2.0,2.0,2.0,0.5,4.0,2.0,4.0,3.5,4.0,1.0,1.5,2.5,1.0,0.0],[0.0,0.0,1.0,2.5,3.5,2.5,4.0,0.0,2.0,2.0,4.5,0.5,1.0,3.5,3.0,2.5],[2.0,2.0,0.5,2.0,4.5,2.5,3.0,1.5,4.5,2.0,3.5,3.0,1.0,2.0,1.5,2.0]]" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 16x16xf16=[116 116 116 116 116 116 116 116 116 116 116 116 116 116 116 116][96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5][124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75][86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75][115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5][103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75][109 109 109 109 109 109 109 109 109 109 109 109 109 109 109 109][114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75][110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75][122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75][136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5][87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75][102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75][102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75][126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25][106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106] diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir deleted file mode 100644 index 746625f798f8..000000000000 --- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir +++ /dev/null @@ -1,68 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op) { - - // Step 1. Find the fill, matmul and generic ops - // =========================================================================== - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %matmul = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %reduce = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %broadcast = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} - in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 2. Tile the matmul and fuse the fill - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %broadcast tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - transform.structured.fuse_into_containing_op %reduce into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Vectorize - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. Bufferize - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op) - - // Step 6. Post-bufferization vector distribution - // =========================================================================== - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () - - // Step 7. Do layout analysis and lower to mma - // =========================================================================== - %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op) - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir deleted file mode 100644 index 998146768b09..000000000000 --- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir +++ /dev/null @@ -1,23 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%func: !transform.any_op) { - %ops = transform.structured.match ops{["linalg.fill", "linalg.matmul_transpose_b", "linalg.generic"]} - in %func : (!transform.any_op) -> !transform.any_op - - %fill0, %fill1, %matmul, %reduce, %broadcast = - transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op) - - %region_op = transform.iree.wrap_in_dispatch_region %broadcast { generateWorkload = false } : (!transform.any_op) -> !transform.any_op - - %non_broadcast = transform.merge_handles %fill0, %fill1, %matmul, %reduce : !transform.any_op - %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_broadcast into %region_op : (!transform.any_op, !transform.any_op) -> !transform.any_op - - %empty = transform.structured.match ops{["tensor.empty"]} in %func : (!transform.any_op) -> !transform.any_op - %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2 : (!transform.any_op, !transform.any_op) -> !transform.any_op - transform.iree.region_to_workgroups %region_op_3 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir deleted file mode 100644 index 5c0e240db337..000000000000 --- a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir +++ /dev/null @@ -1,20 +0,0 @@ -func.func @matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<16x8xf16>) -> tensor<16x8xf16> { - %c0 = arith.constant 0.0 : f16 - %0 = tensor.empty() : tensor<16x8xf16> - %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x8xf16>) -> tensor<16x8xf16> - %2 = linalg.matmul ins(%lhs, %rhs : tensor<16x16xf16>, tensor<16x8xf16>) - outs(%1 : tensor<16x8xf16>) -> tensor<16x8xf16> - return %2 : tensor<16x8xf16> -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-hal-cuda-llvm-target-arch=sm_80 \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/mma_using_layout_analysis_codegen_spec.mlir@__transform_main | \ -// RUN: iree-run-module --module=- --function=matmul --device=cuda \ -// RUN: --input="16x16xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875,2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875,4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875,6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875,8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875,10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875,12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875,14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875,16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875],[17.0,17.125,17.25,17.375,17.5,17.625,17.75,17.875,18.0,18.125,18.25,18.375,18.5,18.625,18.75,18.875],[19.0,19.125,19.25,19.375,19.5,19.625,19.75,19.875,20.0,20.125,20.25,20.375,20.5,20.625,20.75,20.875],[21.0,21.125,21.25,21.375,21.5,21.625,21.75,21.875,22.0,22.125,22.25,22.375,22.5,22.625,22.75,22.875],[23.0,23.125,23.25,23.375,23.5,23.625,23.75,23.875,24.0,24.125,24.25,24.375,24.5,24.625,24.75,24.875],[25.0,25.125,25.25,25.375,25.5,25.625,25.75,25.875,26.0,26.125,26.25,26.375,26.5,26.625,26.75,26.875],[27.0,27.125,27.25,27.375,27.5,27.625,27.75,27.875,28.0,28.125,28.25,28.375,28.5,28.625,28.75,28.875],[29.0,29.125,29.25,29.375,29.5,29.625,29.75,29.875,30.0,30.125,30.25,30.375,30.5,30.625,30.75,30.875],[31.0,31.125,31.25,31.375,31.5,31.625,31.75,31.875,32.0,32.125,32.25,32.375,32.5,32.625,32.75,32.875]]" \ -// RUN: --input="16x8xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875],[2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875],[4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875],[6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875],[8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875],[10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875],[12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875],[14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875],[16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875]]" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 16x8xf16=[306 310 313.75 317.5 321.5 325.5 329.25 333][578 586 594 601.5 609.5 617.5 625 633][850 862 874 885.5 897.5 909.5 921 933][1122 1138 1154 1170 1186 1201 1217 1233][1394 1414 1434 1454 1474 1493 1513 1533][1666 1690 1714 1738 1762 1785 1809 1833][1938 1966 1994 2022 2050 2078 2106 2134][2210 2242 2274 2306 2338 2370 2402 2434][2482 2518 2554 2590 2626 2662 2698 2734][2754 2794 2834 2874 2914 2954 2994 3034][3026 3070 3114 3158 3202 3246 3290 3334][3298 3346 3394 3442 3490 3538 3586 3634][3570 3622 3674 3726 3778 3830 3882 3934][3842 3898 3954 4010 4066 4120 4176 4232][4112 4172 4232 4292 4352 4412 4472 4532][4384 4448 4512 4576 4640 4704 4768 4832] diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir deleted file mode 100644 index ee3f7336258d..000000000000 --- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir +++ /dev/null @@ -1,72 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main( - %variant_op: !transform.any_op {transform.consumed}) { - // Step 1. Find the fill and matmul ops - // =========================================================================== - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 2. Tile the matmul and fuse the fill - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %matmul tile_sizes [16] ( mapping = [#gpu.block] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - - transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Promote operands in order to test loading from shared memory. - %matmul_2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %promoted_matmul, %alloc_0, %alloc_1 = - transform.iree.promote_operands %matmul_2 [0, 1] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) - - - // Step 3. Vectorize - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. Bufferize - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> () - transform.apply_patterns to %func_3 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %memref_func = transform.iree.bufferize { target_gpu } %func_3: (!transform.any_op) -> (!transform.any_op) - - // Step 5. Pre-process the contract and transfer ops to put it in the right form. - // =========================================================================== - transform.apply_patterns to %memref_func { - transform.apply_patterns.iree.prepare_vector_to_mma - } : !transform.any_op - - // Step 6. Post-bufferization vector distribution - // =========================================================================== - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %memref_func - workgroup_dims = [4, 8, 1] : (!transform.any_op) -> () - - // Step 7. Do layout analysis and lower to mma - // =========================================================================== - %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op) - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/reduction.mlir b/tests/transform_dialect/cuda/reduction.mlir deleted file mode 100644 index d506a1740cf1..000000000000 --- a/tests/transform_dialect/cuda/reduction.mlir +++ /dev/null @@ -1,33 +0,0 @@ -!in_tensor_t = tensor<8x64xf32> -!out_tensor_t = tensor<8xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %3 = arith.addf %arg3, %arg4 : f32 - linalg.yield %3 : f32 - } -> !out_tensor_t - return %2 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit only works for exactly this reduction atm. -// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 8xf32=64 64 64 64 64 64 64 64 diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir deleted file mode 100644 index c8b46b3756a6..000000000000 --- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir +++ /dev/null @@ -1,125 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism. - // =========================================================================== - %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op = - transform.structured.split_reduction %0 - { split_factor = 2, insert_split_dimension = 1 } - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - - // Step 2. First level of tiling + fusion parallelizes to blocks. - // =========================================================================== - %grid_combiner_op, %forall_grid = - transform.structured.tile_using_forall %combiner_op tile_sizes [1] - ( mapping = [#gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - %not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op : !transform.any_op - transform.structured.fuse_into_containing_op %not_combiner into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Second level of tiling + fusion parallelizes to threads. - // =========================================================================== - %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op - : (!transform.any_op) -> !transform.any_op - %block_combiner_op, %forall_block_combiner_op = - transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. - %func_op = transform.structured.match ops{["func.func"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op - : (!transform.any_op) -> !transform.any_op - %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op - : (!transform.any_op) -> !transform.any_op - %block_more_parallel_op, %forall_block_more_parallel_op = - transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 4. Rank-reduce and vectorize. - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 5. Bufferize and drop HAL decriptor from memref ops. - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - } : !transform.any_op - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - - // Step 6. Post-bufferization mapping to blocks and threads. - // =========================================================================== - %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_5 - workgroup_dims = [32, 2, 1] : (!transform.any_op) -> () - - // Step 7. Post-bufferization vector distribution with rank-reduction. - // =========================================================================== - transform.apply_patterns to %func_5 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0) - // at this point. - transform.sequence %variant_op_3 : !transform.any_op failures(suppress) { - ^bb0(%arg0: !transform.any_op): - transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op - } - transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> () - - - // Late Canonicalizations. - %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op_3 : !transform.any_op - transform.apply_cse to %func_op_3 : !transform.any_op - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/reduction_eltwise.mlir b/tests/transform_dialect/cuda/reduction_eltwise.mlir deleted file mode 100644 index 0bd49b5d4298..000000000000 --- a/tests/transform_dialect/cuda/reduction_eltwise.mlir +++ /dev/null @@ -1,45 +0,0 @@ -!in_tensor_t = tensor<8x64xf32> -!out_tensor_t = tensor<8xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %5 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = arith.addf %arg3, %arg4 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - - %6 = tensor.empty() : !out_tensor_t - %7 = linalg.generic { - indexing_maps = [affine_map<(d0) -> (d0)>, - affine_map<(d0) -> (d0)>], - iterator_types = ["parallel"]} - ins(%5 : !out_tensor_t) outs(%6 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %4 = math.sqrt %arg3 : f32 - linalg.yield %4 : f32 - } -> !out_tensor_t - return %7 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit -/// only works for exactly this reduction atm. -// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 8xf32=8 8 8 8 8 8 8 8 diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir deleted file mode 100644 index 6422201305af..000000000000 --- a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir +++ /dev/null @@ -1,164 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - - // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism. - // =========================================================================== - %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - %reduction, %eltwise = transform.split_handle %0 - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op = - transform.structured.split_reduction %reduction - { split_factor = 2, insert_split_dimension = 1 } - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - - // Canonicalizations. - %func_op = transform.structured.match ops{["func.func"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the - // trailing elementwise the same way we want to tile the reduction. - // =========================================================================== - %eltwise_grid_op, %grid_loop = transform.structured.tile_using_forall %eltwise - tile_sizes [1] (mapping = [#gpu.block]) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %grid_loop : (!transform.any_op) -> () - %not_eltwise = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op, %combiner_op - : !transform.any_op - transform.structured.fuse_into_containing_op %not_eltwise into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - // Step 3. Second level of tiling + fusion parallelizes to threads. - // =========================================================================== - %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op - : (!transform.any_op) -> !transform.any_op - %eltwise_block_op, %eltwise_block_loop = - transform.structured.tile_using_forall %eltwise_grid_op tile_sizes [1] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %block_combiner_op = transform.structured.match ops{["linalg.generic"]} - attributes {iterator_types = [#linalg.iterator_type, #linalg.iterator_type]} in %variant_op - : (!transform.any_op) -> !transform.any_op - %combined_and_fill = transform.merge_handles %fill_1d, %block_combiner_op : !transform.any_op - transform.structured.fuse_into_containing_op %combined_and_fill into %eltwise_block_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op - : (!transform.any_op) -> !transform.any_op - %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op - : (!transform.any_op) -> !transform.any_op - %block_more_parallel_op, %forall_block_more_parallel_op = - transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - // Step 4. Rank-reduce and vectorize. - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 5. Bufferize and drop HAL decriptor from memref ops. - // =========================================================================== - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - } : !transform.any_op - transform.iree.eliminate_empty_tensors %variant_op: (!transform.any_op) -> () - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op) - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - - // Step 6. Post-bufferization mapping to blocks and threads. - // =========================================================================== - %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_5 - workgroup_dims = [32, 2, 1] : (!transform.any_op) -> () - - // Step 7. Post-bufferization vector distribution with rank-reduction. - // =========================================================================== - transform.apply_patterns to %func_5 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0) - // at this point. - transform.sequence %variant_op_3 : !transform.any_op failures(suppress) { - ^bb0(%arg0: !transform.any_op): - transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } - : (!transform.any_op) -> !transform.any_op - } - transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> () - - - // Late canonicalizations. - %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op_3 : !transform.any_op - transform.apply_cse to %func_op_3 : !transform.any_op - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/reduction_v2.mlir b/tests/transform_dialect/cuda/reduction_v2.mlir deleted file mode 100644 index 6d367d218397..000000000000 --- a/tests/transform_dialect/cuda/reduction_v2.mlir +++ /dev/null @@ -1,33 +0,0 @@ -!in_tensor_t = tensor<33x1024xf32> -!out_tensor_t = tensor<33xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %3 = arith.addf %arg3, %arg4 : f32 - linalg.yield %3 : f32 - } -> !out_tensor_t - return %2 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// RUN: iree-compile %s --iree-hal-target-backends=cuda | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// only checking the first 6 of 33 -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 33xf32=1024 1024 1024 1024 1024 1024 diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir deleted file mode 100644 index bb9ecbfc6429..000000000000 --- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir +++ /dev/null @@ -1,113 +0,0 @@ -// RUN: iree-opt %s - -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op - - // Step 1. First level of tiling + fusion parallelizes to blocks. - // =========================================================================== - %grid_reduction, %forall_grid = - transform.structured.tile_using_forall %reduction tile_sizes [1] - ( mapping = [#gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> () - transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 2. Split the reduction to get meatier parallelism. - // =========================================================================== - %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall = - transform.structured.tile_reduction_using_for %grid_reduction by tile_sizes = [0, 128] - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - %_1:2 = - transform.structured.tile_using_forall %block_more_parallel_op_2 num_threads [0, 32] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Second level of tiling parallelizes to threads. - // =========================================================================== - // 1st op is [parallel, parallel], map it to threadIdx.x by 4. - %_2:2 = - transform.structured.tile_using_forall %block_more_parallel_fill_op_2 tile_sizes [0, 4] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // 2nd op is [parallel, reduction] of 1x128, map the 1-dim to threadIdx.y to - // trigger mapping of the reduction to threadIdx.x via predication via `if (x==0)`. - %_3:2 = - transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1] - ( mapping = [#gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 4. Rank-reduce and vectorize. - // =========================================================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 5. Bufferize and drop HAL decriptor from memref ops. - // =========================================================================== - // Canonicalization/CSE is needed before bufferization otherwise unnecessary - // allocs will be created. - transform.apply_patterns to %func_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - } : !transform.any_op - transform.apply_patterns to %func_3 { - transform.apply_patterns.tensor.reassociative_reshape_folding - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %func_3 : !transform.any_op - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %func_5 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_5 { - transform.apply_patterns.linalg.erase_unnecessary_inputs - } : !transform.any_op - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op) - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - - // Step 6. Post-bufferization mapping to blocks and threads. - // =========================================================================== - %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_7 - workgroup_dims = [32, 1, 1] : (!transform.any_op) -> () - - // Step 7. Post-bufferization vector distribution with rank-reduction. - // =========================================================================== - transform.apply_patterns to %func_7 { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op - transform.iree.vector.warp_distribute %func_7 - : (!transform.any_op) -> () - - // Late canonicalizations to cleanup and pass the checks - transform.apply_patterns to %func_7 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_7 : !transform.any_op - transform.apply_cse to %func_7 : !transform.any_op - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir deleted file mode 100644 index 66b88cdf2232..000000000000 --- a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir +++ /dev/null @@ -1,29 +0,0 @@ -!in_tensor_t = tensor<33x34567xf32> -!out_tensor_t = tensor<33xf32> - -func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) { - %cst = arith.constant -0.000000e+00 : f32 - - %0 = tensor.empty() : !out_tensor_t - %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t - %2 = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0)>], - iterator_types = ["parallel", "reduction"]} - ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) { - ^bb0(%arg3: f32, %arg4: f32): - %3 = arith.addf %arg3, %arg4 : f32 - linalg.yield %3 : f32 - } -> !out_tensor_t - return %2 : !out_tensor_t -} - -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x34567xf32=1" |\ -// RUN: FileCheck %s --check-prefix=EXEC - -// only checking the first 6 of 33 -// EXEC: result[0]: hal.buffer_view -// EXEC-NEXT: 33xf32=34567 34567 34567 34567 34567 34567 diff --git a/tests/transform_dialect/cuda/softmax.mlir b/tests/transform_dialect/cuda/softmax.mlir deleted file mode 100644 index 91bbc07b7a28..000000000000 --- a/tests/transform_dialect/cuda/softmax.mlir +++ /dev/null @@ -1,84 +0,0 @@ -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=softmax --device=cuda | \ -// RUN: FileCheck %s - - -!tmp_tensor_t = tensor<16x128xf32> -!in_tensor_t = tensor<16x128x128xf32> -!out_tensor_t = tensor<16x128x128xf32> - -// Execution only checks that @softmax runs. -// CHECK: EXEC @softmax -// CHECK: 16x128x128xf32=[ -// CHECK-SAME: [0.0078125 0.0078125 0.0078125 0.0078125 - -func.func @softmax() -> !out_tensor_t { - %cst_0 = arith.constant 0.0 : f32 - %cst_1 = arith.constant 1.0 : f32 - %cst_min = arith.constant -3.40282347E+38 : f32 - %input = arith.constant dense<5.000000e+00> : !out_tensor_t - util.optimization_barrier %input : !in_tensor_t - - %input_max_empty = tensor.empty() : !tmp_tensor_t - %input_max_filled = linalg.fill ins(%cst_min : f32) - outs(%input_max_empty : !tmp_tensor_t) -> !tmp_tensor_t - %input_max = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%input : !in_tensor_t) - outs(%input_max_filled : !tmp_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32): - %max = arith.maximumf %arg0, %arg1 : f32 - linalg.yield %max : f32 - } -> !tmp_tensor_t - - // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm. - %exps_empty = tensor.empty() : !out_tensor_t - %exps_sum_empty = tensor.empty() : !tmp_tensor_t - %exps_sum_filled = linalg.fill ins(%cst_0 : f32) - outs(%exps_sum_empty : !tmp_tensor_t) -> !tmp_tensor_t - %exps = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%input, %input_max : !in_tensor_t, !tmp_tensor_t) - outs(%exps_empty : !out_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): - %sub = arith.subf %arg0, %arg1 : f32 - %exp = math.exp %sub : f32 - linalg.yield %exp: f32 - } -> (!out_tensor_t) - - %exps_sum = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%exps : !out_tensor_t) - outs(%exps_sum_filled : !tmp_tensor_t) { - ^bb0(%exp: f32, %acc: f32): - %add = arith.addf %exp, %acc : f32 - linalg.yield %add : f32 - } -> (!tmp_tensor_t) - - %res_empty = tensor.empty() : !out_tensor_t - %res = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%exps, %exps_sum : !out_tensor_t, !tmp_tensor_t) - outs(%res_empty : !out_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): - // %10 = arith.divf %cst_1, %arg1 : f32 - // %11 = arith.mulf %arg0, %10 : f32 - %div = arith.divf %arg0, %arg1 : f32 - linalg.yield %div : f32 - } -> !out_tensor_t - - return %res: !out_tensor_t -} diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir deleted file mode 100644 index 86f49091536d..000000000000 --- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir +++ /dev/null @@ -1,119 +0,0 @@ -// RUN: iree-opt %s - -// Codegen -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %input_max_fill, - %input_max, - %exps_sum_fill, - %exps, - %exps_sum, - %div = transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op, !transform.any_op) - - // Step 1. First level of tiling + fusion parallelizes to blocks. - // ============================================================== - %_, %forall = - transform.structured.tile_using_forall %div tile_sizes [1, 4] - ( mapping = [#gpu.block, #gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () - - // TODO: Merging and fusing merged handles does not work properly atm. - transform.structured.fuse_into_containing_op %exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %exps into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - // By default, fusion into scf.forall does not promote captured values - // to shared as this involves a cross-thread dependence analysis. - // Instead, we activate it explicitly post-hoc to promote all the extract_slice - // ops that we find and match the prerequisites - %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall"> - transform.iree.share_forall_operands %forall_with_type - : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall"> - transform.apply_patterns to %variant_op { - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.apply_cse to %variant_op : !transform.any_op - - // Step 2. Second level of tiling + fusion parallelizes to threads. - // ================================================================ - %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %tiled_input_max_fill, - %tiled_input_max, - %tiled_exps_sum_fill, - %tiled_exp_and_exps_sum, - %tiled_exp_and_exps_sum_2, - %tiled_div = transform.split_handle %tiled_ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op, !transform.any_op) - // Leaving the reduction untiled on threadIdx.x makes it sequential on - // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is - // introduced and opportunities for distributing vector ops across warps - // appear. - %reduction_linalg_ops = transform.merge_handles %tiled_input_max, - %tiled_exp_and_exps_sum, - %tiled_exp_and_exps_sum_2 - : !transform.any_op - transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // Fully parallel ops are tiled and mapped. - %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill, - %tiled_exps_sum_fill, - %tiled_div - : !transform.any_op - transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32] - ( mapping = [#gpu.thread, #gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 3. Rank-reduce and vectorize. - // ================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. Bufferize and drop HAL decriptor from memref ops. - // ========================================================= - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - - // Step 5. Post-bufferization mapping to blocks and threads. - // ========================================================= - %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> () - - // Step 6. Post-bufferization vector distribution with rank-reduction. - // =================================================================== - %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %end_func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op - transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> () - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir b/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir deleted file mode 100644 index 09c2c07895ee..000000000000 --- a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir +++ /dev/null @@ -1,28 +0,0 @@ -// RUN: iree-opt %s - -// Dispatch softmax. -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%func: !transform.any_op) { - %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %func : (!transform.any_op) -> !transform.any_op - - %input_max_fill, %input_max, %exps_sum_fill, %exps, %exps_sum, %div = - transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op, !transform.any_op) - - /// This must be used with the custom dispatch region formation - /// because IREE's does not fuse the 6 ops softmax version even with - /// --iree-flow-fuse-multi-use. - %region_op = transform.iree.wrap_in_dispatch_region %div { generateWorkload = false } : (!transform.any_op) -> !transform.any_op - - %non_div = transform.merge_handles %input_max_fill, %input_max, %exps_sum_fill, %exps, %exps_sum - : !transform.any_op - %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_div into %region_op : (!transform.any_op, !transform.any_op) -> !transform.any_op - - %empty = transform.structured.match ops{["tensor.empty"]} in %func : (!transform.any_op) -> !transform.any_op - %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2 : (!transform.any_op, !transform.any_op) -> !transform.any_op - transform.iree.region_to_workgroups %region_op_3 : (!transform.any_op) -> !transform.any_op - transform.yield - } -} diff --git a/tests/transform_dialect/cuda/softmax_partial.mlir b/tests/transform_dialect/cuda/softmax_partial.mlir deleted file mode 100644 index 018ad8c42ac8..000000000000 --- a/tests/transform_dialect/cuda/softmax_partial.mlir +++ /dev/null @@ -1,46 +0,0 @@ -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=softmax_partial --device=cuda | \ -// RUN: FileCheck %s - -!tmp_tensor_t = tensor<16x128xf32> -!out_tensor_t = tensor<16x128x128xf32> - -// Execution only checks that @softmax_partial runs. -// CHECK: EXEC @softmax_partial -// CHECK: 16x128x128xf32=[ -// CHECK-SAME: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - -func.func @softmax_partial() -> !out_tensor_t { - %cst = arith.constant -3.40282347E+38 : f32 - %cst_0 = arith.constant dense<1121212.000000e+00> : !out_tensor_t - %cst_1 = arith.constant dense<5.000000e+00> : !out_tensor_t - %0 = util.optimization_barrier %cst_1 : !out_tensor_t - - %1 = tensor.empty() : !tmp_tensor_t - %2 = linalg.fill ins(%cst : f32) outs(%1 : !tmp_tensor_t) -> !tmp_tensor_t - %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%0 : !out_tensor_t) outs(%2 : !tmp_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32): - %8 = arith.maximumf %arg0, %arg1 : f32 - linalg.yield %8 : f32 - } -> !tmp_tensor_t - - // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm. - %4 = tensor.empty() : !out_tensor_t - %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%0, %3 : !out_tensor_t, !tmp_tensor_t) outs(%4 : !out_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): - %6 = arith.subf %arg0, %arg1 : f32 - %7 = math.exp %6 : f32 - linalg.yield %7 : f32 - } -> !out_tensor_t - - return %5: !out_tensor_t -} diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir deleted file mode 100644 index 65ea847af92c..000000000000 --- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir +++ /dev/null @@ -1,102 +0,0 @@ -// RUN: iree-opt %s - -// Codegen -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - // Step 1. First level of tiling + fusion parallelizes to blocks. - // ============================================================== - %root = transform.structured.match interface{LinalgOp} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %red = transform.structured.match interface{LinalgOp} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %not_root = transform.merge_handles %fill, %red : !transform.any_op - %tiled_generic, %forall = - transform.structured.tile_using_forall %root tile_sizes [1, 4] - ( mapping = [#gpu.block, #gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () - transform.structured.fuse_into_containing_op %not_root into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Step 2. Second level of tiling + fusion parallelizes to threads. - // ================================================================ - %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op - %reduction_linalg = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %parallel_linalg = transform.structured.match ops{["linalg.generic"]} - attributes{iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type]} in %variant_op : (!transform.any_op) -> !transform.any_op - %tiled_reduction_generic, %forall_reduction = - transform.structured.tile_using_forall %reduction_linalg tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // TODO: this fusion currently does not happen properly, this is related to the clone - // behavior when fusing into scf.forall. - // Once fixed we'll be able to fuse. - // Fusion will save us one roundtrip to memory. - // transform.structured.fuse_into_containing_op %fill_linalg into %forall_reduction - transform.structured.tile_using_forall %parallel_linalg num_threads [1, 4, 32] - ( mapping = [#gpu.thread, #gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - - // Inability to tile reductions to scf.forall has 2 implications: - // 1. since no scf.forall is present, no gpu.barrier is added. - // This should be fixed independently: ops that are not nested in an scf.forall - // should have a gpu.barrier. Later needs to be complemented by a barrier - // removal pass. - // 2. Similarly, needs to be predicated under an if threadIx == 0 to avoid - // multiple threads updating the buffer inplace once bufferized. - // - // Instead, we can vectorize and go to vector SSA values that sidestep these - // issues. - // Everyone will race to the write while still computing the same value. - // - // That is still not good enough because we need to predicate this in order - // to enable the parallel reduction on warps. - - // Step 3. Rank-reduce and vectorize. - // ================================== - %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op - - // Step 4. Bufferize and drop HAL decriptor from memref ops. - // ========================================================= - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - - // Step 5. Post-bufferization mapping to blocks and threads. - // ========================================================= - %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> () - - // Step 6. Post-bufferization vector distribution with rank-reduction. - // =================================================================== - %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %end_func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } - : (!transform.any_op) -> !transform.any_op - transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> () - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module diff --git a/tests/transform_dialect/cuda/softmax_v2.mlir b/tests/transform_dialect/cuda/softmax_v2.mlir deleted file mode 100644 index 5ef3a90347b9..000000000000 --- a/tests/transform_dialect/cuda/softmax_v2.mlir +++ /dev/null @@ -1,76 +0,0 @@ -// RUN: iree-compile %s --iree-hal-target-backends=cuda \ -// RUN: --iree-flow-fuse-multi-use \ -// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ -// RUN: --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir@codegen | \ -// RUN: iree-run-module --module=- --function=softmax --device=cuda | \ -// RUN: FileCheck %s - -!tmp_tensor_t = tensor<16x128xf32> -!in_tensor_t = tensor<16x128x128xf32> -!out_tensor_t = tensor<16x128x128xf32> - -// Execution only checks that @softmax runs. -// CHECK: EXEC @softmax -// CHECK: 16x128x128xf32=[ -// CHECK-SAME: [0.0078125 0.0078125 0.0078125 0.0078125 - -func.func @softmax() -> !out_tensor_t { - %cst_0 = arith.constant 0.0 : f32 - %cst_1 = arith.constant 1.0 : f32 - %cst_min = arith.constant -3.40282347E+38 : f32 - %input = arith.constant dense<5.000000e+00> : !out_tensor_t - util.optimization_barrier %input : !in_tensor_t - - %softmax = flow.dispatch.region[] -> (!out_tensor_t) { - %input_max_empty = tensor.empty() : !tmp_tensor_t - %input_max_filled = linalg.fill ins(%cst_min : f32) - outs(%input_max_empty : !tmp_tensor_t) -> !tmp_tensor_t - %input_max = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%input : !in_tensor_t) - outs(%input_max_filled : !tmp_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32): - %max = arith.maximumf %arg0, %arg1 : f32 - linalg.yield %max : f32 - } -> !tmp_tensor_t - - // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm. - %exps_empty = tensor.empty() : !out_tensor_t - %exps_sum_empty = tensor.empty() : !tmp_tensor_t - %exps_sum_filled = linalg.fill ins(%cst_0 : f32) - outs(%exps_sum_empty : !tmp_tensor_t) -> !tmp_tensor_t - %exps, %exps_sum = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>], - iterator_types = ["parallel", "parallel", "reduction"]} - ins(%input, %input_max : !in_tensor_t, !tmp_tensor_t) - outs(%exps_empty, %exps_sum_filled : !out_tensor_t, !tmp_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32): - %sub = arith.subf %arg0, %arg1 : f32 - %exp = math.exp %sub : f32 - %add = arith.addf %exp, %arg3 : f32 - linalg.yield %exp, %add : f32, f32 - } -> (!out_tensor_t, !tmp_tensor_t) - - %res_empty = tensor.empty() : !out_tensor_t - %res = linalg.generic - {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1, d2)>], - iterator_types = ["parallel", "parallel", "parallel"]} - ins(%exps, %exps_sum : !out_tensor_t, !tmp_tensor_t) - outs(%res_empty : !out_tensor_t) { - ^bb0(%arg0: f32, %arg1: f32, %arg2: f32): - // %10 = arith.divf %cst_1, %arg1 : f32 - // %11 = arith.mulf %arg0, %10 : f32 - %div = arith.divf %arg0, %arg1 : f32 - linalg.yield %div : f32 - } -> !out_tensor_t - flow.return %res : !out_tensor_t - } - return %softmax: !out_tensor_t -} diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir deleted file mode 100644 index c73cbe9e18d2..000000000000 --- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir +++ /dev/null @@ -1,148 +0,0 @@ -// RUN: iree-opt %s - -// Codegen -module attributes { transform.with_named_sequence } { - transform.named_sequence @codegen( - %variant_op: !transform.any_op {transform.consumed}) { - - %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %input_max_fill, - %input_max, - %exps_sum_fill, - %exp_and_exps_sum, - %div = transform.split_handle %ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op) - - // Step 1. First level of tiling + fusion parallelizes to blocks. - // ============================================================== - %_, %forall = - transform.structured.tile_using_forall %div tile_sizes [1, 4] - ( mapping = [#gpu.block, #gpu.block] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> () - - // TODO: Merging and fusing merged handles does not work properly atm. - transform.structured.fuse_into_containing_op %exp_and_exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - // By default, fusion into scf.forall does not promote captured values - // to shared as this involves a cross-thread dependence analysis. - // Instead, we activate it explicitly post-hoc to promote all the extract_slice - // ops that we find and match the prerequisites - %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall"> - transform.iree.share_forall_operands %forall_with_type - : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall"> - - // Canonicalizations. - %func_op = transform.structured.match ops{["func.func"]} in %variant_op - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - - // Step 2. Second level of tiling + fusion parallelizes to threads. - // ================================================================ - %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]} - in %variant_op : (!transform.any_op) -> !transform.any_op - %tiled_input_max_fill, - %tiled_input_max, - %tiled_exps_sum_fill, - %tiled_exp_and_exps_sum, - %tiled_div = transform.split_handle %tiled_ops - : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, - !transform.any_op, !transform.any_op) - // Leaving the reduction untiled on threadIdx.x makes it sequential on - // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is - // introduced and opportunities for distributing vector ops across warps - // appear. - %reduction_linalg_ops = transform.merge_handles %tiled_input_max, - %tiled_exp_and_exps_sum - : !transform.any_op - transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1] - ( mapping = [#gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - // Fully parallel ops are tiled and mapped. - %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill, - %tiled_exps_sum_fill, - %tiled_div - : !transform.any_op - transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32] - ( mapping = [#gpu.thread, #gpu.thread, #gpu.thread] ) - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - - // Canonicalizations. - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op : !transform.any_op - transform.apply_cse to %func_op : !transform.any_op - - // Step 3. Rank-reduce and vectorize. - // ================================== - transform.apply_patterns to %func_op { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - transform.structured.vectorize_children_and_apply_patterns %func_op : (!transform.any_op) -> !transform.any_op - - // Step 4. Bufferize and drop HAL decriptor from memref ops. - // ========================================================= - transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> () - %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op - %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - - // Step 5. Post-bufferization mapping to blocks and threads. - // ========================================================= - transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> () - transform.iree.map_nested_forall_to_gpu_threads %memref_func - workgroup_dims = [32, 4, 1] : (!transform.any_op) -> () - - // Step 6. Post-bufferization vector distribution with rank-reduction. - // =================================================================== - transform.apply_patterns to %memref_func { - transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface - transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices - transform.apply_patterns.memref.fold_memref_alias_ops - transform.apply_patterns.vector.cast_away_vector_leading_one_dim - } : !transform.any_op - %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } - : (!transform.any_op) -> !transform.any_op - transform.iree.vector.warp_distribute %memref_func - : (!transform.any_op) -> () - - - // Late canonicalizations. - %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3 - : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func_op_3 { - transform.apply_patterns.iree.fold_fill_into_pad - transform.apply_patterns.linalg.tiling_canonicalization - transform.apply_patterns.scf.for_loop_canonicalization - transform.apply_patterns.canonicalization - } : !transform.any_op - transform.iree.apply_licm %func_op_3 : !transform.any_op - transform.apply_cse to %func_op_3 : !transform.any_op - - // Annotate the exported function as already translated. - %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op - %none = transform.param.constant #iree_codegen.translation_info -> !transform.any_param - transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param - - transform.yield - } -} // module