From 694dd3675fb38ef6f85bd7f4a09a0e4cdc402de1 Mon Sep 17 00:00:00 2001
From: Quinn Dawkins <quinn@nod-labs.com>
Date: Wed, 10 Jul 2024 19:10:01 -0400
Subject: [PATCH] Drop unmaintained transform dialect tests

The tests in `tests/transform_dialect` have been unmaintained (and
mostly disabled) for a while. Most of the tests were misplaced anyway,
mixing lit tests in with e2e tests, and much of the relevant
functionality is now tested upstream or has evolved into different lit
tests in Codegen.

This drops the entire `cuda` directory, and drops all `cpu` tests except
one which is testing a transform dialect library call, which is still
a relevant e2e test.
---
 tests/transform_dialect/cpu/BUILD.bazel       |   8 -
 .../cpu/contraction-packing-and-dispatch.mlir |  59 -------
 .../cpu/contraction-packing.mlir              | 153 ----------------
 .../cpu/eltwise_reduction_eltwise.mlir        |  71 --------
 .../cpu/fold_tensor_slice_into_transfer.mlir  | 111 ------------
 tests/transform_dialect/cpu/matmul.mlir       |  21 ---
 .../cpu/matmul_codegen_default_spec.mlir      |  32 ----
 tests/transform_dialect/cuda/BUILD.bazel      | 113 ------------
 tests/transform_dialect/cuda/CMakeLists.txt   |  78 ---------
 .../cuda/double_mma_layout_analysis.mlir      |  26 ---
 ...uble_mma_layout_analysis_codegen_spec.mlir |  69 --------
 ...ble_mma_layout_analysis_dispatch_spec.mlir |  23 ---
 .../cuda/eltwise_reduction.mlir               |  39 -----
 .../cuda/eltwise_reduction_eltwise.mlir       |  51 ------
 .../cuda/mma_elemwise_layout_analysis.mlir    |  29 ----
 ...elemwise_layout_analysis_codegen_spec.mlir |  65 -------
 .../cuda/mma_reduction_layout_analysis.mlir   |  38 ----
 ...eduction_layout_analysis_codegen_spec.mlir |  68 --------
 ...duction_layout_analysis_dispatch_spec.mlir |  23 ---
 .../cuda/mma_using_layout_analysis.mlir       |  20 ---
 ...ma_using_layout_analysis_codegen_spec.mlir |  72 --------
 tests/transform_dialect/cuda/reduction.mlir   |  33 ----
 .../cuda/reduction_codegen_spec.mlir          | 125 -------------
 .../cuda/reduction_eltwise.mlir               |  45 -----
 .../cuda/reduction_eltwise_codegen_spec.mlir  | 164 ------------------
 .../transform_dialect/cuda/reduction_v2.mlir  |  33 ----
 .../cuda/reduction_v2_codegen_spec.mlir       | 113 ------------
 .../cuda/reduction_v2_uneven.mlir             |  29 ----
 tests/transform_dialect/cuda/softmax.mlir     |  84 ---------
 .../cuda/softmax_codegen_spec.mlir            | 119 -------------
 .../cuda/softmax_dispatch_spec.mlir           |  28 ---
 .../cuda/softmax_partial.mlir                 |  46 -----
 .../cuda/softmax_partial_codegen_spec.mlir    | 102 -----------
 tests/transform_dialect/cuda/softmax_v2.mlir  |  76 --------
 .../cuda/softmax_v2_codegen_spec.mlir         | 148 ----------------
 35 files changed, 2314 deletions(-)
 delete mode 100644 tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
 delete mode 100644 tests/transform_dialect/cpu/contraction-packing.mlir
 delete mode 100644 tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir
 delete mode 100644 tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
 delete mode 100644 tests/transform_dialect/cpu/matmul.mlir
 delete mode 100644 tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/BUILD.bazel
 delete mode 100644 tests/transform_dialect/cuda/CMakeLists.txt
 delete mode 100644 tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
 delete mode 100644 tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/eltwise_reduction.mlir
 delete mode 100644 tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir
 delete mode 100644 tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir
 delete mode 100644 tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
 delete mode 100644 tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/mma_using_layout_analysis.mlir
 delete mode 100644 tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/reduction.mlir
 delete mode 100644 tests/transform_dialect/cuda/reduction_codegen_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/reduction_eltwise.mlir
 delete mode 100644 tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/reduction_v2.mlir
 delete mode 100644 tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/reduction_v2_uneven.mlir
 delete mode 100644 tests/transform_dialect/cuda/softmax.mlir
 delete mode 100644 tests/transform_dialect/cuda/softmax_codegen_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/softmax_partial.mlir
 delete mode 100644 tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
 delete mode 100644 tests/transform_dialect/cuda/softmax_v2.mlir
 delete mode 100644 tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir

diff --git a/tests/transform_dialect/cpu/BUILD.bazel b/tests/transform_dialect/cpu/BUILD.bazel
index 669933c9f2af..fc81794397de 100644
--- a/tests/transform_dialect/cpu/BUILD.bazel
+++ b/tests/transform_dialect/cpu/BUILD.bazel
@@ -14,20 +14,12 @@ package(
 iree_lit_test_suite(
     name = "lit",
     srcs = [
-        "contraction-packing.mlir",
-        "contraction-packing-and-dispatch.mlir",
-        # DISABLED: incorrectly assuming default flag values.
-        # "eltwise_reduction_eltwise.mlir",
-        "fold_tensor_slice_into_transfer.mlir",
-        # DISABLED: incorrectly assuming default flag values.
-        # "matmul.mlir",
         "matmul_library_call.mlir",
     ],
     cfg = "//tests:lit.cfg.py",
     # transform dialect spec files are MLIR files that specify a transformation,
     # they need to be included as data.
     data = [
-        "matmul_codegen_default_spec.mlir",
         "transform_library.mlir",
     ],
     tags = [
diff --git a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir b/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
deleted file mode 100644
index bb0c56a24213..000000000000
--- a/tests/transform_dialect/cpu/contraction-packing-and-dispatch.mlir
+++ /dev/null
@@ -1,59 +0,0 @@
-
-// Preprocessing with generalized packing.
-//
-// RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \
-// RUN: iree-compile - --iree-hal-target-backends=llvm-cpu \
-// RUN:     --iree-opt-data-tiling=false \
-// RUN:     --compile-to=executable-configurations | \
-// RUN: FileCheck %s
-
-!a_tensor_t = tensor<1234x567xf32>
-!b_tensor_t = tensor<567x890xf32>
-!c_tensor_t = tensor<1234x890xf32>
-
-// Note: the normalization in these maps is gone due to InterchangeGenericOps.
-// When using generalized packing, it would be better to drop that pass.
-
-// CHECK-LABEL: func.func @matmul_dispatch_0
-//       CHECK:   tensor.empty() : tensor<155x18x8x32xf32>
-//       CHECK:   tensor.pack
-
-// CHECK-LABEL: func.func @matmul_dispatch_1
-//       CHECK:   tensor.empty() : tensor<18x56x16x32xf32>
-//       CHECK:   tensor.pack
-
-// CHECK-LABEL: func.func @matmul_dispatch_2
-//       CHECK:   tensor.empty() : tensor<155x56x8x16xf32>
-//       CHECK:   tensor.pack
-
-// CHECK-LABEL: func.func @matmul_dispatch_3
-func.func public @matmul(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t {
-  %rhs = arith.constant dense<0.1> : !b_tensor_t
-  %c0 = util.optimization_barrier %rhs : !b_tensor_t
-  //  CHECK-NOT: pack
-  //      CHECK: linalg.generic
-  // CHECK-SAME:   indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d4, d2, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d1, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>]
-  // CHECK-SAME:   iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]}
-  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>)
-  // CHECK-SAME:  outs(%{{.*}} : tensor<155x56x8x16xf32>)
-
-  %0 = linalg.matmul
-     ins(%arg0, %c0: !a_tensor_t, !b_tensor_t)
-    outs(%arg2: !c_tensor_t) -> !c_tensor_t
-  return %0 : !c_tensor_t
-}
-
-// CHECK-LABEL: func.func @matmul_dispatch_4
-//       CHECK:   tensor.unpack
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
-    %matmul = transform.structured.match interface{LinalgOp} in %module_op
-      : (!transform.any_op) -> (!transform.any_op)
-
-    transform.structured.pack_greedily %matmul
-        matmul_packed_sizes = [8, 16, 32]
-        matmul_inner_dims_order = [0, 1, 2]
-      : (!transform.any_op) -> !transform.op<"linalg.generic">
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cpu/contraction-packing.mlir b/tests/transform_dialect/cpu/contraction-packing.mlir
deleted file mode 100644
index 8c67a38295e9..000000000000
--- a/tests/transform_dialect/cpu/contraction-packing.mlir
+++ /dev/null
@@ -1,153 +0,0 @@
-
-// Preprocessing with generalized packing.
-//
-// RUN: iree-opt %s --iree-transform-dialect-interpreter --transform-dialect-drop-schedule | \
-// RUN: FileCheck %s
-
-!a_tensor_t = tensor<1234x567xf32>
-!at_tensor_t = tensor<567x1234xf32>
-!b_tensor_t = tensor<567x890xf32>
-!bt_tensor_t = tensor<890x567xf32>
-!c_tensor_t = tensor<1234x890xf32>
-!ct_tensor_t = tensor<890x1234xf32>
-
-// CHECK-DAG: #[[$map_lhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>
-// CHECK-DAG: #[[$map_rhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d1, d4, d5)>
-// CHECK-DAG: #[[$map_res:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>
-// CHECK-DAG: #[[$map_tlhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)>
-// CHECK-DAG: #[[$map_trhs:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d4, d5)>
-// CHECK-DAG: #[[$map_tres:.*]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>
-
-// CHECK-LABEL: func.func @matmul_nnn
-func.func @matmul_nnn(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t {
-  %c0 = arith.constant dense<0.1> : !b_tensor_t
-
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 32]
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32]
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
-  //      CHECK: linalg.generic
-  // CHECK-SAME:   indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_res]]]
-  // CHECK-SAME:   iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]}
-  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>)
-  // CHECK-SAME:  outs(%{{.*}} : tensor<155x56x8x16xf32>)
-  //      CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
-  %0 = linalg.matmul
-     ins(%arg0, %c0: !a_tensor_t, !b_tensor_t)
-    outs(%arg2: !c_tensor_t) -> !c_tensor_t
-  return %0 : !c_tensor_t
-}
-
-#matmul_tnn_trait = {
-  indexing_maps = [
-    affine_map<(m, n, k) -> (k, m)>,
-    affine_map<(m, n, k) -> (k, n)>,
-    affine_map<(m, n, k) -> (m, n)>
-  ],
-  iterator_types = ["parallel", "parallel", "reduction"]
-}
-
-// CHECK-LABEL: func.func @matmul_tnn
-func.func @matmul_tnn(%arg0: !at_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t {
-  %c0 = arith.constant dense<0.1> : !b_tensor_t
-
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 32]
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32]
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
-  //      CHECK: linalg.generic
-  // CHECK-SAME:   indexing_maps = [#[[$map_tlhs]], #[[$map_rhs]], #[[$map_res]]]
-  // CHECK-SAME:   iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]}
-  // CHECK-SAME:   ins(%{{.*}} : tensor<18x155x8x32xf32>, tensor<18x56x16x32xf32>)
-  // CHECK-SAME:  outs(%{{.*}} : tensor<155x56x8x16xf32>)
-  //      CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
-  %0 = linalg.generic #matmul_tnn_trait
-     ins(%arg0, %c0: !at_tensor_t, !b_tensor_t)
-    outs(%arg2: !c_tensor_t) {
-    ^bb(%a: f32, %b: f32, %c: f32) :
-      %d = arith.mulf %a, %b: f32
-      %e = arith.addf %c, %d: f32
-      linalg.yield %e : f32
-  } -> !c_tensor_t
-  return %0 : !c_tensor_t
-}
-
-#matmul_ntn_trait = {
-  indexing_maps = [
-    affine_map<(m, n, k) -> (m, k)>,
-    affine_map<(m, n, k) -> (n, k)>,
-    affine_map<(m, n, k) -> (m, n)>
-  ],
-  iterator_types = ["parallel", "parallel", "reduction"]
-}
-
-// CHECK-LABEL: func.func @matmul_ntn
-func.func @matmul_ntn(%arg0: !a_tensor_t, %arg2: !c_tensor_t) -> !c_tensor_t {
-  %c0 = arith.constant dense<0.1> : !bt_tensor_t
-
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 32]
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [16, 32]
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
-  //      CHECK: linalg.generic
-  // CHECK-SAME:   indexing_maps = [#[[$map_lhs]], #[[$map_trhs]], #[[$map_res]]]
-  // CHECK-SAME:   iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]}
-  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<56x18x16x32xf32>)
-  // CHECK-SAME:  outs(%{{.*}} : tensor<155x56x8x16xf32>)
-  //      CHECK: tensor.unpack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 16]
-  %0 = linalg.generic #matmul_ntn_trait
-     ins(%arg0, %c0: !a_tensor_t, !bt_tensor_t)
-    outs(%arg2: !c_tensor_t) {
-    ^bb(%a: f32, %b: f32, %c: f32) :
-      %d = arith.mulf %a, %b: f32
-      %e = arith.addf %c, %d: f32
-      linalg.yield %e : f32
-  } -> !c_tensor_t
-  return %0 : !c_tensor_t
-}
-
-#matmul_nnt_trait = {
-  indexing_maps = [
-    affine_map<(m, n, k) -> (m, k)>,
-    affine_map<(m, n, k) -> (k, n)>,
-    affine_map<(m, n, k) -> (n, m)>
-  ],
-  iterator_types = ["parallel", "parallel", "reduction"]
-}
-
-// CHECK-LABEL: func.func @matmul_nnt
-func.func @matmul_nnt(%arg0: !a_tensor_t, %arg2: !ct_tensor_t) -> !ct_tensor_t {
-  %c0 = arith.constant dense<0.1> : !b_tensor_t
-
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [0, 1] inner_tiles = [8, 32]
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [16, 32]
-  //      CHECK: tensor.pack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 16]
-  //      CHECK: linalg.generic
-  // CHECK-SAME:   indexing_maps = [#[[$map_lhs]], #[[$map_rhs]], #[[$map_tres]]]
-  // CHECK-SAME:   iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]}
-  // CHECK-SAME:   ins(%{{.*}} : tensor<155x18x8x32xf32>, tensor<18x56x16x32xf32>)
-  // CHECK-SAME:  outs(%{{.*}} : tensor<56x155x8x16xf32>)
-  //      CHECK: tensor.unpack %{{.*}} inner_dims_pos = [1, 0] inner_tiles = [8, 16]
-  %0 = linalg.generic #matmul_nnt_trait
-     ins(%arg0, %c0: !a_tensor_t, !b_tensor_t)
-    outs(%arg2: !ct_tensor_t) {
-    ^bb(%a: f32, %b: f32, %c: f32) :
-      %d = arith.mulf %a, %b: f32
-      %e = arith.addf %c, %d: f32
-      linalg.yield %e : f32
-  } -> !ct_tensor_t
-  return %0 : !ct_tensor_t
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
-    %matmul = transform.structured.match interface{LinalgOp} in %module_op
-      : (!transform.any_op) -> (!transform.any_op)
-
-    // Generalized packing rewrite extracts a gemm from any linalg op that contains
-    // one. This acts as a powerful normalization step: after this point, we have a
-    // gemm (i.e. 3-D contraction with (m,n,k)=(8,16,32) ) on the 3 most minor
-    // dimensions.
-    transform.structured.pack_greedily %matmul
-        matmul_packed_sizes = [8, 16, 32] matmul_inner_dims_order = [0, 1, 2]
-      : (!transform.any_op) -> !transform.op<"linalg.generic">
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir
deleted file mode 100644
index 0e61ca71cf31..000000000000
--- a/tests/transform_dialect/cpu/eltwise_reduction_eltwise.mlir
+++ /dev/null
@@ -1,71 +0,0 @@
-!in_tensor_t = tensor<32x256xf32>
-!out_tensor_t = tensor<32xf32>
-
-func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
-  %cst = arith.constant -0.000000e+00 : f32
-
-  %0 = tensor.empty() : !out_tensor_t
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) ->  !out_tensor_t
-  %2 = tensor.empty() : !in_tensor_t
-  %3 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]}
-    ins(%arg : !in_tensor_t) outs(%2 : !in_tensor_t) {
-    ^bb0(%arg3: f32, %arg4: f32):
-      %4 = arith.addf %arg3, %arg3 : f32
-      %5 = arith.addf %4, %4 : f32
-      linalg.yield %5 : f32
-    } -> !in_tensor_t
-
-  %6 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0)>],
-    iterator_types = ["parallel", "reduction"]}
-    ins(%3 : !in_tensor_t) outs(%1 : !out_tensor_t) {
-      ^bb0(%arg3: f32, %arg4: f32):
-        %4 = arith.addf %arg3, %arg4 : f32
-        linalg.yield %4 : f32
-      } -> !out_tensor_t
-
-  %7 = tensor.empty() : !out_tensor_t
-  %8 = linalg.generic {
-    indexing_maps = [affine_map<(d0) -> (d0)>,
-                     affine_map<(d0) -> (d0)>],
-    iterator_types = ["parallel"]}
-    ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) {
-    ^bb0(%arg3: f32, %arg4: f32):
-      %4 = math.sqrt %arg3 : f32
-      linalg.yield %4 : f32
-    } -> !out_tensor_t
-
-
-  return %8 : !out_tensor_t
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
-// RUN:     --iree-opt-data-tiling=false \
-// RUN:     --compile-to=executable-configurations | \
-// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-codegen-materialize-user-configs,iree-llvmcpu-select-lowering-strategy,iree-llvmcpu-lower-executable-target)))' \
-// RUN:     --iree-llvmcpu-enable-transform-dialect-jit | \
-// RUN: FileCheck %s
-
-// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu  \
-// RUN:     --iree-opt-data-tiling=false \
-// RUN:     --iree-llvmcpu-enable-transform-dialect-jit | \
-// RUN: iree-run-module --module=- --function=reduce --device=local-task --input="32x256xf32=1" | \
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//      CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-//      CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-//      CHECK-DAG: %[[workgroup_id_x:.*]] = hal.interface.workgroup.id[0] : index
-//          CHECK: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} -> (vector<8xf32>) {
-//          CHECK:   arith.addf %{{.*}} : vector<8x16xf32>
-// CHECK-COUNT-16:   vector.extract %{{.*}} : vector<8xf32> from vector<16x8xf32>{{[[:space:]].*}}arith.addf %{{.*}} : vector<8xf32>
-//          CHECK:   scf.yield %{{.*}} : vector<8xf32>
-//          CHECK: }
-//          CHECK: math.sqrt %{{.*}} : vector<8xf32>
-//          CHECK: vector.store %{{.*}} : memref<8xf32, strided<[1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<8xf32>
-
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 32xf32=32 32 32 32 32 32 32 32
diff --git a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir b/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
deleted file mode 100644
index 435c25f434f1..000000000000
--- a/tests/transform_dialect/cpu/fold_tensor_slice_into_transfer.mlir
+++ /dev/null
@@ -1,111 +0,0 @@
-// RUN: iree-opt --iree-transform-dialect-interpreter %s | FileCheck %s
-
-// This transform was removed from MLIR by https://reviews.llvm.org/D154932 and
-// added to IREE in https://github.com/iree-org/iree/pull/14373, as a workaround
-// for other patterns being sensitive to these exact transforms.
-
-// CHECK-LABEL: func @transfer_read_of_extract_slice(
-//  CHECK-SAME:     %[[t:.*]]: tensor<?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
-//   CHECK-DAG:   %[[c4:.*]] = arith.constant 4 : index
-//   CHECK-DAG:   %[[c8:.*]] = arith.constant 8 : index
-//       CHECK:   %[[add:.*]] = arith.addi %[[s1]], %[[c4]]
-//       CHECK:   %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true, true]} : tensor<?x?xf32>, vector<5x6xf32>
-//       CHECK:   return %[[r]]
-func.func @transfer_read_of_extract_slice(%t : tensor<?x?xf32>, %s1 : index, %s2 : index) -> vector<5x6xf32> {
-  %c3 = arith.constant 3 : index
-  %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.0 : f32
-  %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor<?x?xf32> to tensor<10x?xf32>
-  %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<10x?xf32>, vector<5x6xf32>
-  return %1 : vector<5x6xf32>
-}
-
-// CHECK-LABEL: func @transfer_read_of_extract_slice_1d(
-//  CHECK-SAME:     %[[t:.*]]: tensor<?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
-//   CHECK-DAG:   %[[c4:.*]] = arith.constant 4 : index
-//   CHECK-DAG:   %[[c8:.*]] = arith.constant 8 : index
-//       CHECK:   %[[add:.*]] = arith.addi %[[s1]], %[[c4]]
-//       CHECK:   %[[r:.*]] = vector.transfer_read %[[t]][%[[c8]], %[[add]]], %{{.*}} {in_bounds = [true]} : tensor<?x?xf32>, vector<6xf32>
-//       CHECK:   return %[[r]]
-func.func @transfer_read_of_extract_slice_1d(%t : tensor<?x?xf32>, %s1 : index, %s2 : index) -> vector<6xf32> {
-  %c3 = arith.constant 3 : index
-  %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.0 : f32
-  %0 = tensor.extract_slice %t[5, %s1] [10, %s2] [1, 1] : tensor<?x?xf32> to tensor<10x?xf32>
-  %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true]} : tensor<10x?xf32>, vector<6xf32>
-  return %1 : vector<6xf32>
-}
-
-// CHECK-LABEL: func @transfer_read_of_extract_slice_rank_reducing(
-//  CHECK-SAME:     %[[t:.*]]: tensor<?x?x?xf32>, %[[s1:.*]]: index, %[[s2:.*]]: index
-//   CHECK-DAG:   %[[c3:.*]] = arith.constant 3 : index
-//   CHECK-DAG:   %[[c5:.*]] = arith.constant 5 : index
-//   CHECK-DAG:   %[[c10:.*]] = arith.constant 10 : index
-//       CHECK:   %[[add:.*]] = arith.addi %[[s1]], %[[c3]]
-//       CHECK:   %[[r:.*]] = vector.transfer_read %[[t]][%[[c5]], %[[add]], %[[c10]]], %{{.*}} {in_bounds = [true, true]} : tensor<?x?x?xf32>, vector<5x6xf32>
-//       CHECK:   return %[[r]]
-func.func @transfer_read_of_extract_slice_rank_reducing(%t : tensor<?x?x?xf32>, %s1 : index, %s2 : index) -> vector<5x6xf32> {
-  %c3 = arith.constant 3 : index
-  %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.0 : f32
-  %0 = tensor.extract_slice %t[5, %s1, 6] [1, %s2, 12] [1, 1, 1] : tensor<?x?x?xf32> to tensor<?x12xf32>
-  %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<?x12xf32>, vector<5x6xf32>
-  return %1 : vector<5x6xf32>
-}
-
-// CHECK-LABEL: func @transfer_read_of_extract_slice_illegal_rank_reducing(
-//       CHECK:   extract_slice
-//       CHECK:   vector.transfer_read
-func.func @transfer_read_of_extract_slice_illegal_rank_reducing(%t : tensor<?x?x?xf32>, %s1 : index, %s2 : index) -> vector<5x6xf32> {
-  %c3 = arith.constant 3 : index
-  %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.0 : f32
-  %0 = tensor.extract_slice %t[5, %s1, 6] [%s2, 1, 12] [1, 1, 1] : tensor<?x?x?xf32> to tensor<?x12xf32>
-  %1 = vector.transfer_read %0[%c3, %c4], %cst {in_bounds = [true, true]} : tensor<?x12xf32>, vector<5x6xf32>
-  return %1 : vector<5x6xf32>
-}
-
-// CHECK-LABEL: func @insert_slice_of_transfer_write(
-//  CHECK-SAME:     %[[t1:.*]]: tensor<?x12xf32>, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index
-//       CHECK:   %[[c3:.*]] = arith.constant 3 : index
-//       CHECK:   %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<?x12xf32>
-//       CHECK:   return %[[r]]
-func.func @insert_slice_of_transfer_write(%t1 : tensor<?x12xf32>, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor<?x12xf32> {
-  %c0 = arith.constant 0 : index
-  %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32>
-  %1 = tensor.insert_slice %0 into %t1[3, %s] [5, 6] [1, 1] : tensor<5x6xf32> into tensor<?x12xf32>
-  return %1 : tensor<?x12xf32>
-}
-
-// CHECK-LABEL: func @insert_slice_of_transfer_write_illegal_rank_extending(
-//       CHECK:   vector.transfer_write
-//       CHECK:   insert_slice
-func.func @insert_slice_of_transfer_write_illegal_rank_extending(%t1 : tensor<?x?x12xf32>, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor<?x?x12xf32> {
-  %c0 = arith.constant 0 : index
-  %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32>
-  %1 = tensor.insert_slice %0 into %t1[4, 3, %s] [5, 1, 6] [1, 1, 1] : tensor<5x6xf32> into tensor<?x?x12xf32>
-  return %1 : tensor<?x?x12xf32>
-}
-
-// CHECK-LABEL: func @insert_slice_of_transfer_write_rank_extending(
-//  CHECK-SAME:     %[[t1:.*]]: tensor<?x?x12xf32>, %[[v:.*]]: vector<5x6xf32>, %[[s:.*]]: index
-//   CHECK-DAG:   %[[c3:.*]] = arith.constant 3 : index
-//   CHECK-DAG:   %[[c4:.*]] = arith.constant 4 : index
-//       CHECK:   %[[r:.*]] = vector.transfer_write %[[v]], %[[t1]][%[[c4]], %[[c3]], %[[s]]] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<?x?x12xf32>
-//       CHECK:   return %[[r]]
-func.func @insert_slice_of_transfer_write_rank_extending(%t1 : tensor<?x?x12xf32>, %v : vector<5x6xf32>, %s : index, %t2 : tensor<5x6xf32>) -> tensor<?x?x12xf32> {
-  %c0 = arith.constant 0 : index
-  %0 = vector.transfer_write %v, %t2[%c0, %c0] {in_bounds = [true, true]} : vector<5x6xf32>, tensor<5x6xf32>
-  %1 = tensor.insert_slice %0 into %t1[4, 3, %s] [1, 5, 6] [1, 1, 1] : tensor<5x6xf32> into tensor<?x?x12xf32>
-  return %1 : tensor<?x?x12xf32>
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
-    %top_level_func = transform.structured.match ops{["func.func"]} in %module_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %top_level_func {
-      transform.apply_patterns.iree.fold_tensor_slice_into_transfer
-    } : !transform.any_op
-    transform.yield
-  }
-}
diff --git a/tests/transform_dialect/cpu/matmul.mlir b/tests/transform_dialect/cpu/matmul.mlir
deleted file mode 100644
index 39a2074f6043..000000000000
--- a/tests/transform_dialect/cpu/matmul.mlir
+++ /dev/null
@@ -1,21 +0,0 @@
-!A_size = tensor<3x5xf32>
-!B_size = tensor<5x3xf32>
-!C_size = tensor<3x3xf32>
-
-func.func @matmul_static(
-    %A : !A_size, %B : !B_size, %C : !C_size) -> !C_size {
-  %0 = linalg.matmul ins(%A, %B : !A_size, !B_size)
-                     outs(%C : !C_size) -> !C_size
-  return %0 : !C_size
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=llvm-cpu \
-// RUN:   --iree-opt-data-tiling=false \
-// RUN:   --iree-codegen-transform-dialect-library=%p/matmul_codegen_default_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=matmul_static \
-// RUN:   --input="3x5xf32=1" \
-// RUN:   --input="5x3xf32=2" \
-// RUN:   --input="3x3xf32=42" | \
-// RUN: FileCheck %s --check-prefixes=EXEC
-
-// EXEC: 3x3xf32=[52 52 52][52 52 52][52 52 52]
diff --git a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir b/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
deleted file mode 100644
index 1fa31050e506..000000000000
--- a/tests/transform_dialect/cpu/matmul_codegen_default_spec.mlir
+++ /dev/null
@@ -1,32 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @codegen(
-      %variant_op: !transform.any_op {transform.consumed}) {
-    %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 1. Tile to forall with tile_sizes [2].
-    // ===================================================
-    %tiled_generic, %forall =
-      transform.structured.tile_using_forall %matmul tile_sizes [2]
-        ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall
-      : (!transform.any_op) -> ()
-
-    // Step 2. Bufferize and drop HAL decriptor from memref ops.
-    // =========================================================
-    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-    %variant_op_3 = transform.iree.bufferize %variant_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 3. Post-bufferization mapping workgroup.
-    // =========================================================
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-
-    // Annotate the exported function as already translated.
-    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
-    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cuda/BUILD.bazel b/tests/transform_dialect/cuda/BUILD.bazel
deleted file mode 100644
index a35e0dbe609f..000000000000
--- a/tests/transform_dialect/cuda/BUILD.bazel
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright 2022 The IREE Authors
-#
-# Licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-load("//build_tools/bazel:build_defs.oss.bzl", "iree_cmake_extra_content")
-load("//build_tools/bazel:iree_lit_test.bzl", "iree_lit_test_suite")
-
-package(
-    features = ["layering_check"],
-    licenses = ["notice"],  # Apache 2.0
-)
-
-iree_cmake_extra_content(
-    content = """
-if(NOT IREE_HAL_DRIVER_CUDA)
-  return()
-endif()
-
-if(NOT IREE_HOST_BIN_DIR AND NOT IREE_TARGET_BACKEND_CUDA)
-  return()
-endif()
-""",
-    inline = True,
-)
-
-iree_lit_test_suite(
-    name = "lit",
-    srcs = [
-        # TODO(#15892): reductions have flakes and need to be triaged.
-        # "reduction.mlir",
-        # "reduction_eltwise.mlir",
-        # "reduction_v2.mlir",
-        # "reduction_v2_uneven.mlir",
-        # "softmax.mlir",
-        # "softmax_v2.mlir",
-        # First few ops of softmax only, acts as a proxy example.
-        # "softmax_partial.mlir",
-    ],
-    cfg = "//tests:lit.cfg.py",
-    # transform dialect spec files are MLIR files that specify a transformation,
-    # they need to be included as data.
-    data = [
-        "mma_reduction_layout_analysis_codegen_spec.mlir",
-        "mma_reduction_layout_analysis_dispatch_spec.mlir",
-        "mma_using_layout_analysis_codegen_spec.mlir",
-        "reduction_codegen_spec.mlir",
-        "reduction_eltwise_codegen_spec.mlir",
-        "reduction_v2_codegen_spec.mlir",
-        "softmax_codegen_spec.mlir",
-        "softmax_v2_codegen_spec.mlir",
-        #
-        # FIXME: This must be used with the custom dispatch region formation
-        # because IREE's does not fuse the 6 ops softmax version even with
-        # --iree-flow-fuse-multi-use.
-        #
-        "softmax_dispatch_spec.mlir",
-        # First few ops of softmax only, acts as a proxy example.
-        "softmax_partial_codegen_spec.mlir",
-    ],
-    tags = [
-        # CUDA cuInit fails with sanitizer on.
-        "noasan",
-        "nomsan",
-        "notsan",
-        "noubsan",
-        "requires-gpu-nvidia",
-        "driver=cuda",
-    ],
-    tools = [
-        "//tools:iree-compile",
-        "//tools:iree-opt",
-        "//tools:iree-run-module",
-        "@llvm-project//llvm:FileCheck",
-    ],
-)
-
-iree_lit_test_suite(
-    name = "sm80_lit",
-    srcs = [
-        "double_mma_layout_analysis.mlir",
-        "mma_elemwise_layout_analysis.mlir",
-        "mma_reduction_layout_analysis.mlir",
-        "mma_using_layout_analysis.mlir",
-    ],
-    cfg = "//tests:lit.cfg.py",
-    # transform dialect spec files are MLIR files that specify a transformation,
-    # they need to be included as data.
-    data = [
-        "double_mma_layout_analysis_codegen_spec.mlir",
-        "double_mma_layout_analysis_dispatch_spec.mlir",
-        "mma_elemwise_layout_analysis_codegen_spec.mlir",
-        "mma_reduction_layout_analysis_codegen_spec.mlir",
-        "mma_reduction_layout_analysis_dispatch_spec.mlir",
-        "mma_using_layout_analysis_codegen_spec.mlir",
-    ],
-    tags = [
-        # CUDA cuInit fails with sanitizer on.
-        "noasan",
-        "nomsan",
-        "notsan",
-        "noubsan",
-        "requires-gpu-sm80",
-        "driver=cuda",
-    ],
-    tools = [
-        "//tools:iree-compile",
-        "//tools:iree-opt",
-        "//tools:iree-run-module",
-        "@llvm-project//llvm:FileCheck",
-    ],
-)
diff --git a/tests/transform_dialect/cuda/CMakeLists.txt b/tests/transform_dialect/cuda/CMakeLists.txt
deleted file mode 100644
index 7534bc8bfd46..000000000000
--- a/tests/transform_dialect/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,78 +0,0 @@
-################################################################################
-# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from           #
-# tests/transform_dialect/cuda/BUILD.bazel                                     #
-#                                                                              #
-# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary   #
-# CMake-only content.                                                          #
-#                                                                              #
-# To disable autogeneration for this file entirely, delete this header.        #
-################################################################################
-
-iree_add_all_subdirs()
-
-if(NOT IREE_HAL_DRIVER_CUDA)
-  return()
-endif()
-
-if(NOT IREE_HOST_BIN_DIR AND NOT IREE_TARGET_BACKEND_CUDA)
-  return()
-endif()
-
-iree_lit_test_suite(
-  NAME
-    lit
-  TOOLS
-    FileCheck
-    iree-compile
-    iree-opt
-    iree-run-module
-  DATA
-    mma_reduction_layout_analysis_codegen_spec.mlir
-    mma_reduction_layout_analysis_dispatch_spec.mlir
-    mma_using_layout_analysis_codegen_spec.mlir
-    reduction_codegen_spec.mlir
-    reduction_eltwise_codegen_spec.mlir
-    reduction_v2_codegen_spec.mlir
-    softmax_codegen_spec.mlir
-    softmax_dispatch_spec.mlir
-    softmax_partial_codegen_spec.mlir
-    softmax_v2_codegen_spec.mlir
-  LABELS
-    "noasan"
-    "nomsan"
-    "notsan"
-    "noubsan"
-    "requires-gpu-nvidia"
-    "driver=cuda"
-)
-
-iree_lit_test_suite(
-  NAME
-    sm80_lit
-  SRCS
-    "double_mma_layout_analysis.mlir"
-    "mma_elemwise_layout_analysis.mlir"
-    "mma_reduction_layout_analysis.mlir"
-    "mma_using_layout_analysis.mlir"
-  TOOLS
-    FileCheck
-    iree-compile
-    iree-opt
-    iree-run-module
-  DATA
-    double_mma_layout_analysis_codegen_spec.mlir
-    double_mma_layout_analysis_dispatch_spec.mlir
-    mma_elemwise_layout_analysis_codegen_spec.mlir
-    mma_reduction_layout_analysis_codegen_spec.mlir
-    mma_reduction_layout_analysis_dispatch_spec.mlir
-    mma_using_layout_analysis_codegen_spec.mlir
-  LABELS
-    "noasan"
-    "nomsan"
-    "notsan"
-    "noubsan"
-    "requires-gpu-sm80"
-    "driver=cuda"
-)
-
-### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
deleted file mode 100644
index 9293fd4ff0ab..000000000000
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-func.func @double_matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<16x16xf16>, %second : tensor<16x8xf16>) -> tensor<16x8xf16> {
-  %c0 = arith.constant 0.0 : f16
-  %0 = tensor.empty() : tensor<16x16xf16>
-  %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x16xf16>) -> tensor<16x16xf16>
-  %2 = linalg.matmul ins(%lhs, %rhs : tensor<16x16xf16>, tensor<16x16xf16>)
-      outs(%1 : tensor<16x16xf16>) -> tensor<16x16xf16>
-  %3 = tensor.empty() : tensor<16x8xf16>
-  %4 = linalg.fill ins(%c0 : f16) outs(%3 : tensor<16x8xf16>) -> tensor<16x8xf16>
-  %5 = linalg.matmul ins(%2, %second : tensor<16x16xf16>, tensor<16x8xf16>)
-      outs(%4 : tensor<16x8xf16>) -> tensor<16x8xf16>
-  return %5 : tensor<16x8xf16>
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-flow-dispatch-use-transform-dialect=%p/double_mma_layout_analysis_dispatch_spec.mlir \
-// RUN:     --iree-codegen-transform-dialect-library=%p/double_mma_layout_analysis_codegen_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=double_matmul --device=cuda \
-// RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \
-// RUN: --input="16x16xf16=[[0.175048828125,0.07501220703125,0.199951171875,0.0,0.175048828125,0.125,0.199951171875,0.04998779296875,0.0999755859375,0.175048828125,0.07501220703125,0.04998779296875,0.125,0.125,0.07501220703125,0.2249755859375],[0.024993896484375,0.199951171875,0.0,0.1500244140625,0.175048828125,0.0999755859375,0.175048828125,0.1500244140625,0.2249755859375,0.07501220703125,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0999755859375,0.0999755859375],[0.2249755859375,0.2249755859375,0.125,0.175048828125,0.0,0.07501220703125,0.04998779296875,0.0,0.199951171875,0.1500244140625,0.024993896484375,0.2249755859375,0.024993896484375,0.1500244140625,0.2249755859375,0.199951171875],[0.1500244140625,0.125,0.024993896484375,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.175048828125,0.125,0.175048828125,0.175048828125,0.07501220703125,0.024993896484375,0.125],[0.2249755859375,0.125,0.2249755859375,0.1500244140625,0.0,0.0,0.1500244140625,0.125,0.024993896484375,0.125,0.0,0.024993896484375,0.175048828125,0.175048828125,0.024993896484375,0.125],[0.2249755859375,0.024993896484375,0.04998779296875,0.0,0.0,0.1500244140625,0.07501220703125,0.2249755859375,0.1500244140625,0.024993896484375,0.0,0.0999755859375,0.125,0.1500244140625,0.2249755859375,0.0],[0.125,0.0999755859375,0.0,0.0999755859375,0.199951171875,0.125,0.175048828125,0.175048828125,0.1500244140625,0.2249755859375,0.04998779296875,0.125,0.1500244140625,0.0,0.0,0.0999755859375],[0.125,0.07501220703125,0.175048828125,0.1500244140625,0.175048828125,0.0,0.04998779296875,0.125,0.125,0.024993896484375,0.0999755859375,0.175048828125,0.024993896484375,0.0,0.024993896484375,0.0],[0.2249755859375,0.024993896484375,0.0999755859375,0.04998779296875,0.125,0.07501220703125,0.0999755859375,0.024993896484375,0.125,0.125,0.125,0.024993896484375,0.125,0.04998779296875,0.0999755859375,0.07501220703125],[0.0999755859375,0.175048828125,0.199951171875,0.0999755859375,0.175048828125,0.07501220703125,0.024993896484375,0.125,0.07501220703125,0.0,0.125,0.07501220703125,0.07501220703125,0.0,0.199951171875,0.175048828125],[0.07501220703125,0.0999755859375,0.175048828125,0.07501220703125,0.125,0.1500244140625,0.0,0.0999755859375,0.2249755859375,0.199951171875,0.04998779296875,0.0,0.0,0.1500244140625,0.199951171875,0.2249755859375],[0.024993896484375,0.2249755859375,0.04998779296875,0.1500244140625,0.2249755859375,0.2249755859375,0.175048828125,0.0999755859375,0.024993896484375,0.199951171875,0.125,0.199951171875,0.175048828125,0.2249755859375,0.175048828125,0.0999755859375],[0.125,0.0999755859375,0.04998779296875,0.125,0.199951171875,0.07501220703125,0.199951171875,0.0,0.024993896484375,0.04998779296875,0.0,0.04998779296875,0.04998779296875,0.199951171875,0.1500244140625,0.0999755859375],[0.199951171875,0.0,0.125,0.04998779296875,0.07501220703125,0.175048828125,0.0999755859375,0.175048828125,0.024993896484375,0.07501220703125,0.0,0.1500244140625,0.07501220703125,0.024993896484375,0.07501220703125,0.175048828125],[0.1500244140625,0.125,0.0999755859375,0.175048828125,0.04998779296875,0.0,0.04998779296875,0.1500244140625,0.024993896484375,0.125,0.125,0.175048828125,0.125,0.0999755859375,0.175048828125,0.1500244140625],[0.07501220703125,0.199951171875,0.024993896484375,0.0999755859375,0.175048828125,0.07501220703125,0.1500244140625,0.04998779296875,0.0,0.024993896484375,0.07501220703125,0.07501220703125,0.1500244140625,0.04998779296875,0.2249755859375,0.1500244140625]]" \
-// RUN: --input="16x8xf16=[[0.1500244140625,0.07501220703125,0.1500244140625,0.0,0.199951171875,0.125,0.0,0.175048828125],[0.04998779296875,0.07501220703125,0.04998779296875,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.2249755859375],[0.0,0.07501220703125,0.04998779296875,0.175048828125,0.0999755859375,0.1500244140625,0.04998779296875,0.199951171875],[0.125,0.175048828125,0.04998779296875,0.07501220703125,0.199951171875,0.07501220703125,0.024993896484375,0.1500244140625],[0.175048828125,0.0,0.0,0.0999755859375,0.0999755859375,0.1500244140625,0.07501220703125,0.024993896484375],[0.1500244140625,0.199951171875,0.0999755859375,0.0999755859375,0.125,0.175048828125,0.199951171875,0.0],[0.175048828125,0.0999755859375,0.024993896484375,0.175048828125,0.125,0.07501220703125,0.175048828125,0.175048828125],[0.175048828125,0.175048828125,0.2249755859375,0.125,0.175048828125,0.0,0.04998779296875,0.175048828125],[0.175048828125,0.024993896484375,0.125,0.1500244140625,0.1500244140625,0.07501220703125,0.0,0.04998779296875],[0.125,0.0999755859375,0.024993896484375,0.199951171875,0.175048828125,0.0999755859375,0.04998779296875,0.125],[0.199951171875,0.04998779296875,0.1500244140625,0.0999755859375,0.04998779296875,0.07501220703125,0.199951171875,0.125],[0.1500244140625,0.0,0.125,0.175048828125,0.024993896484375,0.07501220703125,0.199951171875,0.0999755859375],[0.175048828125,0.04998779296875,0.07501220703125,0.125,0.024993896484375,0.2249755859375,0.0,0.0],[0.024993896484375,0.0999755859375,0.1500244140625,0.07501220703125,0.125,0.2249755859375,0.0,0.0],[0.04998779296875,0.125,0.175048828125,0.04998779296875,0.125,0.0999755859375,0.0999755859375,0.04998779296875],[0.125,0.175048828125,0.0,0.2249755859375,0.199951171875,0.175048828125,0.1500244140625,0.1500244140625]]" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 16x8xf16=[0.465332 0.345703 0.341064 0.440674 0.495605 0.424805 0.296875 0.393555][0.420166 0.324707 0.314941 0.414551 0.470215 0.408447 0.268555 0.361084][0.404785 0.305176 0.304688 0.390381 0.438232 0.378174 0.262207 0.349609][0.33252 0.25708 0.235596 0.327393 0.364258 0.320312 0.222168 0.287109][0.326172 0.256592 0.235107 0.332031 0.377686 0.316895 0.204224 0.300537][0.484131 0.361328 0.346436 0.46875 0.525391 0.442871 0.303467 0.425293][0.422852 0.324219 0.311035 0.41626 0.472412 0.398926 0.26709 0.374023][0.447754 0.33252 0.314697 0.439209 0.487305 0.414551 0.285156 0.395508][0.30127 0.23938 0.229736 0.297363 0.343994 0.293701 0.192749 0.272705][0.355225 0.271729 0.270752 0.34668 0.391846 0.334717 0.227539 0.305664][0.375 0.286133 0.267822 0.369141 0.416992 0.348877 0.237549 0.334473][0.454834 0.349365 0.334717 0.44165 0.502441 0.426758 0.292725 0.390381][0.404541 0.302002 0.296631 0.390137 0.437012 0.379639 0.260742 0.34668][0.348877 0.269043 0.263428 0.335205 0.386719 0.33252 0.22522 0.300293][0.297363 0.216431 0.211182 0.283203 0.311523 0.266113 0.194946 0.25708][0.320068 0.242188 0.235107 0.30835 0.349365 0.299072 0.203125 0.275879]
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
deleted file mode 100644
index 49f07d2c963a..000000000000
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis_codegen_spec.mlir
+++ /dev/null
@@ -1,69 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @codegen(
-      %variant_op: !transform.any_op) {
-
-    // Step 1. Find the fill and matmul ops
-    // ===========================================================================
-    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %fill0, %fill1 = transform.split_handle %fill : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %matmul0, %matmul1 = transform.split_handle %matmul : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 2. Tile the matmul and fuse the fill
-    // ===========================================================================
-    %grid_reduction, %forall_grid =
-    transform.structured.tile_using_forall %matmul1 tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-
-    transform.structured.fuse_into_containing_op %fill1 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %matmul0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %fill0 into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 3. Vectorize
-    // ===========================================================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
-    // Step 4. Bufferize
-    // ===========================================================================
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-    } : !transform.any_op
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.tensor.reassociative_reshape_folding
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.apply_cse to %func_3 : !transform.any_op
-    transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.linalg.erase_unnecessary_inputs
-    } : !transform.any_op
-    %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
-
-    // Step 5. Pre-process the contract and transfer ops to put it in the right form.
-    // ===========================================================================
-    transform.apply_patterns to %memref_func {
-      transform.apply_patterns.iree.prepare_vector_to_mma
-    } : !transform.any_op
-
-    // Step 6. Post-bufferization vector distribution
-    // ===========================================================================
-    transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
-
-    // Step 7. Do layout analysis and lower to mma
-    // ===========================================================================
-    %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op)
-
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir b/tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir
deleted file mode 100644
index 8fad32f799b6..000000000000
--- a/tests/transform_dialect/cuda/double_mma_layout_analysis_dispatch_spec.mlir
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%func: !transform.any_op) {
-    %ops = transform.structured.match ops{["linalg.fill", "linalg.matmul"]}
-      in %func : (!transform.any_op) -> !transform.any_op
-
-    %fill0, %matmul0, %fill1, %matmul1 =
-      transform.split_handle %ops
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op,
-                               !transform.any_op, !transform.any_op)
-
-    %region_op = transform.iree.wrap_in_dispatch_region %matmul1 { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-
-    %non_matmul1 = transform.merge_handles %fill0, %matmul0, %fill1 : !transform.any_op
-    %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_matmul1 into %region_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    %empty = transform.structured.match ops{["tensor.empty"]} in %func : (!transform.any_op) -> !transform.any_op
-    %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2 : (!transform.any_op, !transform.any_op) -> !transform.any_op
-    transform.iree.region_to_workgroups %region_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
diff --git a/tests/transform_dialect/cuda/eltwise_reduction.mlir b/tests/transform_dialect/cuda/eltwise_reduction.mlir
deleted file mode 100644
index eabf1a0cf739..000000000000
--- a/tests/transform_dialect/cuda/eltwise_reduction.mlir
+++ /dev/null
@@ -1,39 +0,0 @@
-!in_tensor_t = tensor<8x64xf32>
-!out_tensor_t = tensor<8xf32>
-
-func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
-  %cst = arith.constant -0.000000e+00 : f32
-
-  %0 = tensor.empty() : !out_tensor_t
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) ->  !out_tensor_t
-  %2 = tensor.empty() : !in_tensor_t
-  %3 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]}
-    ins(%arg : !in_tensor_t) outs(%2 : !in_tensor_t) {
-    ^bb0(%arg3: f32, %arg4: f32):
-      %4 = arith.addf %arg3, %arg3 : f32
-      %5 = arith.addf %4, %4 : f32
-      linalg.yield %5 : f32
-    } -> !in_tensor_t
-
-  %6 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0)>],
-    iterator_types = ["parallel", "reduction"]}
-    ins(%3 : !in_tensor_t) outs(%1 : !out_tensor_t) {
-      ^bb0(%arg3: f32, %arg4: f32):
-        %4 = arith.addf %arg3, %arg4 : f32
-        linalg.yield %4 : f32
-      } -> !out_tensor_t
-
-  return %6 : !out_tensor_t
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda | \
-// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 8xf32=256 256 256 256 256 256 256 256
diff --git a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir b/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir
deleted file mode 100644
index 70aa3322dfd3..000000000000
--- a/tests/transform_dialect/cuda/eltwise_reduction_eltwise.mlir
+++ /dev/null
@@ -1,51 +0,0 @@
-!in_tensor_t = tensor<8x64xf32>
-!out_tensor_t = tensor<8xf32>
-
-func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
-  %cst = arith.constant -0.000000e+00 : f32
-
-  %0 = tensor.empty() : !out_tensor_t
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) ->  !out_tensor_t
-  %2 = tensor.empty() : !in_tensor_t
-  %3 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0, d1)>],
-    iterator_types = ["parallel", "parallel"]}
-    ins(%arg : !in_tensor_t) outs(%2 : !in_tensor_t) {
-    ^bb0(%arg3: f32, %arg4: f32):
-      %4 = arith.addf %arg3, %arg3 : f32
-      %5 = arith.addf %4, %4 : f32
-      linalg.yield %5 : f32
-    } -> !in_tensor_t
-
-  %6 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0)>],
-    iterator_types = ["parallel", "reduction"]}
-    ins(%3 : !in_tensor_t) outs(%1 : !out_tensor_t) {
-      ^bb0(%arg3: f32, %arg4: f32):
-        %4 = arith.addf %arg3, %arg4 : f32
-        linalg.yield %4 : f32
-      } -> !out_tensor_t
-
-  %7 = tensor.empty() : !out_tensor_t
-  %8 = linalg.generic {
-    indexing_maps = [affine_map<(d0) -> (d0)>,
-                     affine_map<(d0) -> (d0)>],
-    iterator_types = ["parallel"]}
-    ins(%6 : !out_tensor_t) outs(%7 : !out_tensor_t) {
-    ^bb0(%arg3: f32, %arg4: f32):
-      %4 = math.sqrt %arg3 : f32
-      linalg.yield %4 : f32
-    } -> !out_tensor_t
-
-
-  return %8 : !out_tensor_t
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda | \
-// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 8xf32=16 16 16 16 16 16 16 16
diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir
deleted file mode 100644
index c1b1c2904c44..000000000000
--- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis.mlir
+++ /dev/null
@@ -1,29 +0,0 @@
-#map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<8x16xf16>, %bias : tensor<16x8xf16>) -> tensor<16x8xf16> {
-  %c0 = arith.constant 0.0 : f16
-  %0 = tensor.empty() : tensor<16x8xf16>
-  %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x8xf16>) -> tensor<16x8xf16>
-  %2 = linalg.matmul_transpose_b ins(%lhs, %rhs : tensor<16x16xf16>, tensor<8x16xf16>)
-      outs(%1 : tensor<16x8xf16>) -> tensor<16x8xf16>
-  %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types=["parallel", "parallel"]}
-        ins(%2, %bias : tensor<16x8xf16>, tensor<16x8xf16>) outs(%0 : tensor<16x8xf16>) {
-          ^bb0(%arg0: f16, %arg1: f16, %arg2: f16):
-            %10 = arith.subf %arg0, %arg1 : f16
-            %11 = math.exp %10 : f16
-            linalg.yield %11 : f16
-       } -> tensor<16x8xf16>
-  return %3 : tensor<16x8xf16>
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/mma_elemwise_layout_analysis_codegen_spec.mlir@__transform_main | \
-// RUN: iree-run-module --module=- --function=matmul --device=cuda \
-// RUN: --input="16x16xf16=[[0.0999755859375,0.2249755859375,0.07501220703125,0.0,0.07501220703125,0.2249755859375,0.175048828125,0.07501220703125,0.175048828125,0.07501220703125,0.024993896484375,0.1500244140625,0.1500244140625,0.2249755859375,0.199951171875,0.1500244140625],[0.1500244140625,0.199951171875,0.0999755859375,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.0999755859375,0.0999755859375,0.024993896484375,0.2249755859375,0.2249755859375,0.2249755859375,0.0,0.024993896484375,0.04998779296875],[0.07501220703125,0.0,0.125,0.125,0.04998779296875,0.2249755859375,0.024993896484375,0.199951171875,0.199951171875,0.07501220703125,0.1500244140625,0.2249755859375,0.024993896484375,0.175048828125,0.07501220703125,0.125],[0.04998779296875,0.024993896484375,0.0,0.2249755859375,0.07501220703125,0.024993896484375,0.024993896484375,0.0,0.07501220703125,0.1500244140625,0.1500244140625,0.175048828125,0.2249755859375,0.1500244140625,0.07501220703125,0.0999755859375],[0.125,0.0,0.199951171875,0.04998779296875,0.199951171875,0.04998779296875,0.175048828125,0.125,0.0,0.0,0.199951171875,0.024993896484375,0.2249755859375,0.1500244140625,0.024993896484375,0.0],[0.04998779296875,0.2249755859375,0.0999755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.07501220703125,0.2249755859375,0.199951171875,0.125,0.07501220703125,0.04998779296875,0.199951171875,0.125,0.1500244140625],[0.1500244140625,0.125,0.175048828125,0.04998779296875,0.125,0.1500244140625,0.1500244140625,0.125,0.0999755859375,0.0,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.125,0.0999755859375],[0.0999755859375,0.199951171875,0.0999755859375,0.0999755859375,0.2249755859375,0.0,0.175048828125,0.0999755859375,0.125,0.07501220703125,0.07501220703125,0.175048828125,0.07501220703125,0.0,0.2249755859375,0.2249755859375],[0.07501220703125,0.024993896484375,0.199951171875,0.024993896484375,0.175048828125,0.199951171875,0.0999755859375,0.024993896484375,0.0,0.0999755859375,0.0,0.0999755859375,0.2249755859375,0.175048828125,0.0,0.0],[0.024993896484375,0.0999755859375,0.2249755859375,0.2249755859375,0.125,0.2249755859375,0.04998779296875,0.04998779296875,0.04998779296875,0.024993896484375,0.0999755859375,0.2249755859375,0.024993896484375,0.024993896484375,0.0,0.07501220703125],[0.0,0.1500244140625,0.175048828125,0.1500244140625,0.2249755859375,0.024993896484375,0.1500244140625,0.0999755859375,0.024993896484375,0.0,0.125,0.04998779296875,0.125,0.199951171875,0.024993896484375,0.199951171875],[0.024993896484375,0.04998779296875,0.199951171875,0.0,0.07501220703125,0.199951171875,0.2249755859375,0.04998779296875,0.175048828125,0.0,0.199951171875,0.199951171875,0.1500244140625,0.199951171875,0.125,0.199951171875],[0.1500244140625,0.125,0.04998779296875,0.0999755859375,0.04998779296875,0.175048828125,0.04998779296875,0.0999755859375,0.2249755859375,0.199951171875,0.125,0.1500244140625,0.0999755859375,0.07501220703125,0.07501220703125,0.0999755859375],[0.0,0.04998779296875,0.125,0.024993896484375,0.04998779296875,0.199951171875,0.04998779296875,0.0999755859375,0.199951171875,0.07501220703125,0.1500244140625,0.125,0.199951171875,0.199951171875,0.0,0.125],[0.024993896484375,0.07501220703125,0.0,0.199951171875,0.024993896484375,0.024993896484375,0.024993896484375,0.175048828125,0.04998779296875,0.04998779296875,0.04998779296875,0.07501220703125,0.07501220703125,0.1500244140625,0.175048828125,0.199951171875],[0.0,0.125,0.0,0.07501220703125,0.125,0.125,0.07501220703125,0.1500244140625,0.04998779296875,0.04998779296875,0.125,0.125,0.2249755859375,0.0999755859375,0.07501220703125,0.07501220703125]]" \
-// RUN: --input="8x16xf16=[[0.175049 0.0999756 0.0249939 0.224976 0.224976 0.199951 0.150024 0.0499878 0.224976 0.0249939 0.224976 0.150024 0.125 0.150024 0.125 0.125][0.0750122 0.175049 0.199951 0.0750122 0.224976 0.150024 0.125 0.175049 0.125 0.125 0.0249939 0.0249939 0.0999756 0.224976 0.0750122 0.0249939][0.199951 0.0750122 0 0.199951 0.125 0.0249939 0.0249939 0.125 0.224976 0 0.0499878 0 0 0.0499878 0.175049 0.0999756][0 0.0499878 0.150024 0.0999756 0.175049 0.224976 0.0750122 0.175049 0.150024 0.0249939 0 0.0999756 0.0999756 0.125 0.150024 0.175049][0.175049 0.125 0.175049 0.0999756 0 0.0249939 0.125 0.175049 0 0.175049 0 0.125 0.199951 0.150024 0.175049 0.0249939][0.125 0.125 0.0999756 0.224976 0.0750122 0.150024 0.125 0.0750122 0 0.175049 0.150024 0.150024 0.125 0 0 0][0.199951 0.0750122 0.175049 0.0999756 0.0499878 0.224976 0.0750122 0.0249939 0.150024 0.0249939 0.0750122 0.224976 0.175049 0 0.0499878 0.0249939][0.0499878 0.224976 0.150024 0.0999756 0 0.199951 0.150024 0.125 0.125 0.125 0.224976 0 0.175049 0.0999756 0.125 0]]" \
-// RUN: --input="16x8xf16=[[0.0,-0.03173828125,-0.1318359375,-0.044189453125,-0.0655517578125,-0.126220703125,-0.076171875,-0.041259765625],[0.0,-0.0855712890625,-0.157470703125,-0.09619140625,-0.1124267578125,-0.0718994140625,-0.04052734375,-0.0531005859375],[0.0,-0.065673828125,-0.118896484375,-0.0438232421875,-0.1031494140625,-0.1051025390625,-0.06884765625,-0.0750732421875],[0.0,-0.0911865234375,-0.11810302734375,-0.09375,-0.0711669921875,-0.06494140625,-0.083740234375,-0.0755615234375],[0.0,-0.0150146484375,-0.125,-0.064453125,-0.0462646484375,-0.065673828125,-0.064453125,-0.0325927734375],[0.0,-0.026123046875,-0.1287841796875,-0.078125,-0.1043701171875,-0.125,-0.1368408203125,-0.06005859375],[0.0,-0.036865234375,-0.1300048828125,-0.0699462890625,-0.078125,-0.11376953125,-0.088134765625,-0.03369140625],[0.0,-0.06201171875,-0.0894775390625,-0.0594482421875,-0.078857421875,-0.10693359375,-0.0982666015625,-0.0863037109375],[-0.021240234375,0.0,-0.15185546875,-0.036865234375,-0.0380859375,-0.0587158203125,-0.0343017578125,-0.045654296875],[0.0,-0.052490234375,-0.1268310546875,-0.04248046875,-0.0955810546875,-0.0399169921875,-0.029296875,-0.060546875],[0.0,-0.0162353515625,-0.1187744140625,-0.043701171875,-0.079345703125,-0.0924072265625,-0.1112060546875,-0.0599365234375],[0.0,-0.0706787109375,-0.175048828125,-0.054931640625,-0.1119384765625,-0.1356201171875,-0.076904296875,-0.06005859375],[0.0,-0.0662841796875,-0.105712890625,-0.0782470703125,-0.0838623046875,-0.078857421875,-0.061279296875,-0.0494384765625],[0.0,-0.0374755859375,-0.138671875,-0.0374755859375,-0.086181640625,-0.09619140625,-0.05615234375,-0.0318603515625],[0.0,-0.0455322265625,-0.0450439453125,-0.023681640625,-0.0343017578125,-0.07745361328125,-0.09375,-0.05126953125],[0.0,-0.041259765625,-0.11566162109375,-0.0462646484375,-0.061279296875,-0.06689453125,-0.0706787109375,-0.0325927734375]]" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 16x8xf16=[1.34863 1.34863 1.34863 1.34863 1.34863 1.34863 1.34863 1.34863][1.34766 1.34766 1.34766 1.34766 1.34668 1.34766 1.34668 1.34766][1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715][1.27246 1.27246 1.27246 1.27246 1.27246 1.27246 1.27246 1.27246][1.25586 1.25586 1.25586 1.25684 1.25586 1.25586 1.25586 1.25684][1.37598 1.37598 1.37598 1.37598 1.37598 1.37598 1.37598 1.37598][1.33984 1.33984 1.33984 1.33984 1.33984 1.33984 1.33984 1.33984][1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715 1.32715][1.24023 1.24023 1.24023 1.24023 1.24023 1.24023 1.23926 1.24023][1.26855 1.26855 1.26855 1.26855 1.26855 1.26855 1.26855 1.26855][1.28516 1.28516 1.28516 1.28516 1.28516 1.28516 1.28516 1.28516][1.36523 1.36523 1.36523 1.36523 1.36523 1.36523 1.36523 1.36523][1.31445 1.31445 1.31445 1.31445 1.31445 1.31445 1.31445 1.31445][1.28027 1.28027 1.28027 1.28027 1.28027 1.28027 1.28027 1.28027][1.21387 1.21387 1.21387 1.21387 1.21387 1.21387 1.21387 1.21387][1.24902 1.24902 1.24902 1.24902 1.24902 1.24902 1.24902 1.24902]
diff --git a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
deleted file mode 100644
index 138fc5899ae8..000000000000
--- a/tests/transform_dialect/cuda/mma_elemwise_layout_analysis_codegen_spec.mlir
+++ /dev/null
@@ -1,65 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(
-      %variant_op: !transform.any_op) {
-      // Step 1. Find the fill, matmul and generic ops
-      // ===========================================================================
-      %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-      %matmul = transform.structured.match ops{["linalg.generic"]}
-                  attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]}
-                  in %variant_op : (!transform.any_op) -> !transform.any_op
-      %generic = transform.structured.match ops{["linalg.generic"]}
-                  attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]}
-                  in %variant_op : (!transform.any_op) -> !transform.any_op
-
-      // Step 2. Tile the generic and fuse the fill and matmul
-      // ===========================================================================
-      %grid_reduction, %forall_grid =
-      transform.structured.tile_using_forall %generic tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-      transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-
-      transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-      transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-      // Step 3. Vectorize
-      // ===========================================================================
-      %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-      transform.apply_patterns to %func {
-        transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-        transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-        transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-      } : !transform.any_op
-      %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
-      // Step 4. Bufferize
-      // ===========================================================================
-      transform.apply_patterns to %func_3 {
-        transform.apply_patterns.iree.fold_fill_into_pad
-        transform.apply_patterns.linalg.tiling_canonicalization
-        transform.apply_patterns.scf.for_loop_canonicalization
-      } : !transform.any_op
-      transform.apply_patterns to %func_3 {
-        transform.apply_patterns.tensor.reassociative_reshape_folding
-        transform.apply_patterns.canonicalization
-      } : !transform.any_op
-      transform.apply_cse to %func_3 : !transform.any_op
-      transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
-      transform.apply_patterns to %func_3 {
-        transform.apply_patterns.linalg.erase_unnecessary_inputs
-      } : !transform.any_op
-      %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
-
-      // Step 6. Post-bufferization vector distribution
-      // ===========================================================================
-      transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-      transform.iree.map_nested_forall_to_gpu_threads %memref_func
-          workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
-
-      // Step 7. Do layout analysis and lower to mma
-      // ===========================================================================
-      %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op)
-
-      transform.yield
-    }
-} // module
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
deleted file mode 100644
index 198b513aeeda..000000000000
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis.mlir
+++ /dev/null
@@ -1,38 +0,0 @@
-#map = affine_map<(d0, d1) -> (d0, d1)>
-#map1 = affine_map<(d0, d1) -> (d0)>
-func.func @matmul_reduction(%lhs : tensor<16x16xf16>, %rhs : tensor<16x16xf16>) -> tensor<16x16xf16> {
-  %c0 = arith.constant 0.0 : f16
-  %c1 = arith.constant -1.0e+04 : f16
-  %acc = tensor.empty() : tensor<16xf16>
-  %init = linalg.fill ins(%c1 : f16) outs(%acc : tensor<16xf16>) -> tensor<16xf16>
-  %0 = tensor.empty() : tensor<16x16xf16>
-  %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x16xf16>) -> tensor<16x16xf16>
-  %2 = linalg.matmul_transpose_b ins(%lhs, %rhs : tensor<16x16xf16>, tensor<16x16xf16>)
-      outs(%1 : tensor<16x16xf16>) -> tensor<16x16xf16>
-  %6 = linalg.generic {indexing_maps
- = [#map, #map1], iterator_types = ["parallel", "reduction"]}
-        ins(%2 : tensor<16x16xf16>) outs(%init : tensor<16xf16>) {
-        ^bb0(%in: f16, %out: f16):
-          %20 = arith.maximumf %in, %out : f16
-          linalg.yield %20 : f16
-        } -> tensor<16xf16>
-  %8 = linalg.generic {indexing_maps = [#map1, #map], iterator_types=["parallel", "parallel"]}
-        ins(%6 : tensor<16xf16>) outs(%0 : tensor<16x16xf16>) {
-        ^bb0(%in: f16,  %out: f16):
-          linalg.yield %in : f16
-        } -> tensor<16x16xf16>
-  return %8 : tensor<16x16xf16>
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-flow-dispatch-use-transform-dialect=%p/mma_reduction_layout_analysis_dispatch_spec.mlir \
-// RUN:     --iree-codegen-transform-dialect-library=%p/mma_reduction_layout_analysis_codegen_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=matmul_reduction --device=cuda \
-// RUN: --input="16x16xf16=[[3.0,2.0,2.5,4.5,1.5,4.0,2.0,2.5,4.0,4.0,1.5,0.5,2.0,3.0,0.5,2.0],[2.5,2.5,0.5,3.5,0.0,2.5,3.5,1.0,0.5,0.0,3.0,4.5,0.5,0.5,0.0,3.5],[4.5,3.0,4.0,2.5,1.0,0.5,0.0,4.5,0.0,2.5,3.5,0.0,2.0,4.5,1.5,4.5],[0.0,2.0,1.5,0.0,2.0,1.5,3.0,2.0,2.0,4.0,4.0,2.5,0.0,3.0,2.0,0.5],[0.5,3.5,3.0,2.5,0.0,2.5,3.0,3.0,4.5,2.0,2.0,1.0,2.0,1.0,3.5,2.0],[0.0,4.5,2.0,4.0,2.5,2.5,1.5,1.5,1.5,3.0,3.0,0.0,2.5,0.5,2.0,2.0],[3.5,4.0,3.5,1.5,2.0,0.5,1.0,2.5,4.0,3.5,0.0,3.0,0.0,1.5,4.5,0.0],[4.5,3.5,1.0,4.5,0.5,0.0,1.5,4.5,1.5,3.5,3.0,2.5,0.0,0.5,0.0,4.0],[2.0,3.0,0.5,2.0,1.5,0.5,2.0,2.5,2.5,4.0,2.0,4.5,4.0,0.0,2.0,3.0],[2.5,4.0,4.0,3.0,2.0,2.0,4.5,0.5,4.5,1.0,2.0,0.0,4.5,1.0,3.0,0.5],[4.0,1.5,3.5,3.0,2.5,4.5,1.0,3.5,3.0,2.5,2.5,2.0,2.0,4.5,1.5,2.5],[3.0,3.0,0.0,2.5,1.0,3.0,0.0,1.5,1.5,2.5,0.5,1.0,3.0,3.5,1.5,1.5],[0.0,4.5,0.5,1.5,0.5,4.0,3.5,4.0,4.0,0.0,0.5,1.0,4.5,1.5,0.0,3.5],[2.5,2.0,2.5,1.5,3.0,0.0,2.0,1.0,2.5,4.0,0.0,4.0,4.0,1.5,3.0,2.5],[3.0,0.0,4.0,4.0,2.0,0.5,1.0,3.5,4.0,2.5,4.0,4.5,0.0,3.0,1.5,2.5],[0.5,0.5,2.5,4.0,1.0,2.5,0.5,4.5,2.0,3.0,1.5,4.5,1.5,4.5,0.5,1.5]]" \
-// RUN: --input="16x16xf16=[[3.5,3.0,4.5,3.0,3.0,0.0,2.0,2.5,2.0,0.0,4.5,2.5,0.5,0.0,4.0,3.5],[0.0,0.5,2.0,4.5,0.0,4.0,1.5,3.5,0.5,2.5,3.5,1.5,3.5,4.5,4.0,3.0],[3.0,3.5,2.5,1.5,1.5,1.5,0.5,4.5,0.0,3.5,4.0,0.0,0.0,2.0,0.5,1.0],[1.5,4.0,3.5,3.5,0.0,0.0,0.0,2.0,3.0,1.5,0.0,3.0,0.0,2.5,2.0,3.0],[3.5,4.0,2.5,1.5,3.0,2.0,3.0,4.5,1.5,3.0,2.0,3.5,2.5,4.5,0.5,3.5],[0.0,0.0,0.0,0.5,1.0,2.5,1.5,1.0,2.5,1.5,0.0,1.5,1.5,2.0,4.5,2.5],[4.0,1.5,3.0,2.5,2.5,3.5,2.0,4.0,1.5,2.5,0.5,4.0,1.0,4.5,3.5,0.0],[1.0,2.0,4.0,4.5,4.5,3.5,0.0,1.0,4.5,3.5,2.0,3.0,0.5,4.0,3.5,1.5],[1.0,0.0,2.5,4.5,0.0,2.0,0.0,2.5,3.0,4.0,2.5,0.5,3.5,0.0,3.5,1.0],[0.0,3.5,4.0,0.0,0.0,4.5,1.0,3.5,1.5,3.0,2.0,1.0,0.5,0.5,2.0,0.0],[1.5,0.0,4.5,2.0,4.5,4.5,3.5,3.0,2.5,4.5,0.5,0.5,0.0,4.5,0.0,4.0],[4.5,3.5,4.0,4.0,1.5,4.0,1.0,4.0,2.5,0.5,4.5,3.5,3.5,0.5,4.5,3.0],[0.0,3.0,2.5,1.0,1.5,2.0,1.0,1.5,4.0,2.5,3.5,1.0,3.5,2.5,3.5,4.5],[1.5,4.5,2.0,2.0,2.0,0.5,4.0,2.0,4.0,3.5,4.0,1.0,1.5,2.5,1.0,0.0],[0.0,0.0,1.0,2.5,3.5,2.5,4.0,0.0,2.0,2.0,4.5,0.5,1.0,3.5,3.0,2.5],[2.0,2.0,0.5,2.0,4.5,2.5,3.0,1.5,4.5,2.0,3.5,3.0,1.0,2.0,1.5,2.0]]" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 16x16xf16=[116 116 116 116 116 116 116 116 116 116 116 116 116 116 116 116][96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5 96.5][124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75 124.75][86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75 86.75][115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5 115.5][103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75 103.75][109 109 109 109 109 109 109 109 109 109 109 109 109 109 109 109][114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75 114.75][110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75 110.75][122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75 122.75][136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5 136.5][87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75 87.75][102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75][102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75 102.75][126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25 126.25][106 106 106 106 106 106 106 106 106 106 106 106 106 106 106 106]
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
deleted file mode 100644
index 746625f798f8..000000000000
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_codegen_spec.mlir
+++ /dev/null
@@ -1,68 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @codegen(
-      %variant_op: !transform.any_op) {
-
-    // Step 1. Find the fill, matmul and generic ops
-    // ===========================================================================
-    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %matmul = transform.structured.match ops{["linalg.generic"]}
-                attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]}
-                in %variant_op : (!transform.any_op) -> !transform.any_op
-    %reduce = transform.structured.match ops{["linalg.generic"]}
-                attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]}
-                in %variant_op : (!transform.any_op) -> !transform.any_op
-    %broadcast = transform.structured.match ops{["linalg.generic"]}
-                attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]}
-                in %variant_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 2. Tile the matmul and fuse the fill
-    // ===========================================================================
-    %grid_reduction, %forall_grid =
-    transform.structured.tile_using_forall %broadcast tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-    transform.structured.fuse_into_containing_op %reduce into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %matmul into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 3. Vectorize
-    // ===========================================================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
-    // Step 4. Bufferize
-    // ===========================================================================
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-    } : !transform.any_op
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.tensor.reassociative_reshape_folding
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.apply_cse to %func_3 : !transform.any_op
-    transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.linalg.erase_unnecessary_inputs
-    } : !transform.any_op
-    %memref_func = transform.iree.bufferize { target_gpu } %func_3 : (!transform.any_op) -> (!transform.any_op)
-
-    // Step 6. Post-bufferization vector distribution
-    // ===========================================================================
-    transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %memref_func workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
-
-    // Step 7. Do layout analysis and lower to mma
-    // ===========================================================================
-    %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op)
-
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir b/tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir
deleted file mode 100644
index 998146768b09..000000000000
--- a/tests/transform_dialect/cuda/mma_reduction_layout_analysis_dispatch_spec.mlir
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%func: !transform.any_op) {
-    %ops = transform.structured.match ops{["linalg.fill", "linalg.matmul_transpose_b", "linalg.generic"]}
-      in %func : (!transform.any_op) -> !transform.any_op
-
-    %fill0, %fill1, %matmul, %reduce, %broadcast =
-      transform.split_handle %ops
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                               !transform.any_op, !transform.any_op)
-
-    %region_op = transform.iree.wrap_in_dispatch_region %broadcast { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-
-    %non_broadcast = transform.merge_handles %fill0, %fill1, %matmul, %reduce : !transform.any_op
-    %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_broadcast into %region_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    %empty = transform.structured.match ops{["tensor.empty"]} in %func : (!transform.any_op) -> !transform.any_op
-    %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2 : (!transform.any_op, !transform.any_op) -> !transform.any_op
-    transform.iree.region_to_workgroups %region_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir
deleted file mode 100644
index 5c0e240db337..000000000000
--- a/tests/transform_dialect/cuda/mma_using_layout_analysis.mlir
+++ /dev/null
@@ -1,20 +0,0 @@
-func.func @matmul(%lhs : tensor<16x16xf16>, %rhs : tensor<16x8xf16>) -> tensor<16x8xf16> {
-  %c0 = arith.constant 0.0 : f16
-  %0 = tensor.empty() : tensor<16x8xf16>
-  %1 = linalg.fill ins(%c0 : f16) outs(%0 : tensor<16x8xf16>) -> tensor<16x8xf16>
-  %2 = linalg.matmul ins(%lhs, %rhs : tensor<16x16xf16>, tensor<16x8xf16>)
-      outs(%1 : tensor<16x8xf16>) -> tensor<16x8xf16>
-  return %2 : tensor<16x8xf16>
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-hal-cuda-llvm-target-arch=sm_80 \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/mma_using_layout_analysis_codegen_spec.mlir@__transform_main | \
-// RUN: iree-run-module --module=- --function=matmul --device=cuda \
-// RUN: --input="16x16xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875,2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875,4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875,6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875,8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875,10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875,12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875,14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875,16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875],[17.0,17.125,17.25,17.375,17.5,17.625,17.75,17.875,18.0,18.125,18.25,18.375,18.5,18.625,18.75,18.875],[19.0,19.125,19.25,19.375,19.5,19.625,19.75,19.875,20.0,20.125,20.25,20.375,20.5,20.625,20.75,20.875],[21.0,21.125,21.25,21.375,21.5,21.625,21.75,21.875,22.0,22.125,22.25,22.375,22.5,22.625,22.75,22.875],[23.0,23.125,23.25,23.375,23.5,23.625,23.75,23.875,24.0,24.125,24.25,24.375,24.5,24.625,24.75,24.875],[25.0,25.125,25.25,25.375,25.5,25.625,25.75,25.875,26.0,26.125,26.25,26.375,26.5,26.625,26.75,26.875],[27.0,27.125,27.25,27.375,27.5,27.625,27.75,27.875,28.0,28.125,28.25,28.375,28.5,28.625,28.75,28.875],[29.0,29.125,29.25,29.375,29.5,29.625,29.75,29.875,30.0,30.125,30.25,30.375,30.5,30.625,30.75,30.875],[31.0,31.125,31.25,31.375,31.5,31.625,31.75,31.875,32.0,32.125,32.25,32.375,32.5,32.625,32.75,32.875]]" \
-// RUN: --input="16x8xf16=[[1.0,1.125,1.25,1.375,1.5,1.625,1.75,1.875],[2.0,2.125,2.25,2.375,2.5,2.625,2.75,2.875],[3.0,3.125,3.25,3.375,3.5,3.625,3.75,3.875],[4.0,4.125,4.25,4.375,4.5,4.625,4.75,4.875],[5.0,5.125,5.25,5.375,5.5,5.625,5.75,5.875],[6.0,6.125,6.25,6.375,6.5,6.625,6.75,6.875],[7.0,7.125,7.25,7.375,7.5,7.625,7.75,7.875],[8.0,8.125,8.25,8.375,8.5,8.625,8.75,8.875],[9.0,9.125,9.25,9.375,9.5,9.625,9.75,9.875],[10.0,10.125,10.25,10.375,10.5,10.625,10.75,10.875],[11.0,11.125,11.25,11.375,11.5,11.625,11.75,11.875],[12.0,12.125,12.25,12.375,12.5,12.625,12.75,12.875],[13.0,13.125,13.25,13.375,13.5,13.625,13.75,13.875],[14.0,14.125,14.25,14.375,14.5,14.625,14.75,14.875],[15.0,15.125,15.25,15.375,15.5,15.625,15.75,15.875],[16.0,16.125,16.25,16.375,16.5,16.625,16.75,16.875]]" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 16x8xf16=[306 310 313.75 317.5 321.5 325.5 329.25 333][578 586 594 601.5 609.5 617.5 625 633][850 862 874 885.5 897.5 909.5 921 933][1122 1138 1154 1170 1186 1201 1217 1233][1394 1414 1434 1454 1474 1493 1513 1533][1666 1690 1714 1738 1762 1785 1809 1833][1938 1966 1994 2022 2050 2078 2106 2134][2210 2242 2274 2306 2338 2370 2402 2434][2482 2518 2554 2590 2626 2662 2698 2734][2754 2794 2834 2874 2914 2954 2994 3034][3026 3070 3114 3158 3202 3246 3290 3334][3298 3346 3394 3442 3490 3538 3586 3634][3570 3622 3674 3726 3778 3830 3882 3934][3842 3898 3954 4010 4066 4120 4176 4232][4112 4172 4232 4292 4352 4412 4472 4532][4384 4448 4512 4576 4640 4704 4768 4832]
diff --git a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir b/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
deleted file mode 100644
index ee3f7336258d..000000000000
--- a/tests/transform_dialect/cuda/mma_using_layout_analysis_codegen_spec.mlir
+++ /dev/null
@@ -1,72 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(
-      %variant_op: !transform.any_op {transform.consumed}) {
-    // Step 1. Find the fill and matmul ops
-    // ===========================================================================
-    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %matmul = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 2. Tile the matmul and fuse the fill
-    // ===========================================================================
-    %grid_reduction, %forall_grid =
-    transform.structured.tile_using_forall %matmul tile_sizes [16] ( mapping = [#gpu.block<x>] ) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-
-    transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Promote operands in order to test loading from shared memory.
-    %matmul_2 = transform.structured.match ops{["linalg.matmul"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %promoted_matmul, %alloc_0, %alloc_1 =
-      transform.iree.promote_operands %matmul_2 [0, 1]
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
-
-
-    // Step 3. Vectorize
-    // ===========================================================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
-    // Step 4. Bufferize
-    // ===========================================================================
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-    } : !transform.any_op
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.tensor.reassociative_reshape_folding
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.apply_cse to %func_3 : !transform.any_op
-    transform.iree.eliminate_empty_tensors %func_3 : (!transform.any_op) -> ()
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.linalg.erase_unnecessary_inputs
-    } : !transform.any_op
-    %memref_func = transform.iree.bufferize { target_gpu } %func_3: (!transform.any_op) -> (!transform.any_op)
-
-    // Step 5. Pre-process the contract and transfer ops to put it in the right form.
-    // ===========================================================================
-    transform.apply_patterns to %memref_func {
-      transform.apply_patterns.iree.prepare_vector_to_mma
-    } : !transform.any_op
-
-    // Step 6. Post-bufferization vector distribution
-    // ===========================================================================
-    transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %memref_func
-        workgroup_dims = [4, 8, 1] : (!transform.any_op) -> ()
-
-    // Step 7. Do layout analysis and lower to mma
-    // ===========================================================================
-    %func_11 = transform.iree.layout_analysis_and_distribution %memref_func : (!transform.any_op) -> (!transform.any_op)
-
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cuda/reduction.mlir b/tests/transform_dialect/cuda/reduction.mlir
deleted file mode 100644
index d506a1740cf1..000000000000
--- a/tests/transform_dialect/cuda/reduction.mlir
+++ /dev/null
@@ -1,33 +0,0 @@
-!in_tensor_t = tensor<8x64xf32>
-!out_tensor_t = tensor<8xf32>
-
-func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
-  %cst = arith.constant -0.000000e+00 : f32
-
-  %0 = tensor.empty() : !out_tensor_t
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) ->   !out_tensor_t
-  %2 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0)>],
-    iterator_types = ["parallel", "reduction"]}
-    ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) {
-      ^bb0(%arg3: f32, %arg4: f32):
-        %3 = arith.addf %arg3, %arg4 : f32
-        linalg.yield %3 : f32
-      } -> !out_tensor_t
-  return %2 : !out_tensor_t
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit only works for exactly this reduction atm.
-// RUN: iree-compile %s --iree-hal-target-backends=cuda | \
-// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 8xf32=64 64 64 64 64 64 64 64
diff --git a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
deleted file mode 100644
index c8b46b3756a6..000000000000
--- a/tests/transform_dialect/cuda/reduction_codegen_spec.mlir
+++ /dev/null
@@ -1,125 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @codegen(
-      %variant_op: !transform.any_op {transform.consumed}) {
-
-    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
-    // ===========================================================================
-    %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
-      transform.structured.split_reduction %0
-        { split_factor = 2, insert_split_dimension = 1 }
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 2. First level of tiling + fusion parallelizes to blocks.
-    // ===========================================================================
-    %grid_combiner_op, %forall_grid =
-      transform.structured.tile_using_forall %combiner_op tile_sizes [1]
-        ( mapping = [#gpu.block<x>] )
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-    %not_combiner = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op : !transform.any_op
-    transform.structured.fuse_into_containing_op %not_combiner into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 3. Second level of tiling + fusion parallelizes to threads.
-    // ===========================================================================
-    %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-    %block_combiner_op, %forall_block_combiner_op =
-      transform.structured.tile_using_forall %grid_combiner_op tile_sizes [1]
-      ( mapping = [#gpu.thread<z>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %fill_1d into %forall_block_combiner_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Canonicalizations.
-    %func_op = transform.structured.match ops{["func.func"]} in %variant_op
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op : !transform.any_op
-    transform.apply_cse to %func_op : !transform.any_op
-
-    %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-    %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
-        : (!transform.any_op) -> !transform.any_op
-    %block_more_parallel_op, %forall_block_more_parallel_op =
-      transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
-      ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 4. Rank-reduce and vectorize.
-    // ===========================================================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
-    // Step 5. Bufferize and drop HAL decriptor from memref ops.
-    // ===========================================================================
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.tensor.reassociative_reshape_folding
-    } : !transform.any_op
-    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
-      : (!transform.any_op) -> !transform.any_op
-
-    // Step 6. Post-bufferization mapping to blocks and threads.
-    // ===========================================================================
-    %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3
-      : (!transform.any_op) -> !transform.any_op
-    transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %func_5
-        workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
-
-    // Step 7. Post-bufferization vector distribution with rank-reduction.
-    // ===========================================================================
-    transform.apply_patterns to %func_5 {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.memref.fold_memref_alias_ops
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
-      : (!transform.any_op) -> !transform.any_op
-    // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
-    // at this point.
-    transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
-    ^bb0(%arg0: !transform.any_op):
-      transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
-    }
-    transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
-
-
-    // Late Canonicalizations.
-    %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func_op_3 {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op_3 : !transform.any_op
-    transform.apply_cse to %func_op_3 : !transform.any_op
-
-    // Annotate the exported function as already translated.
-    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
-    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
-
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cuda/reduction_eltwise.mlir b/tests/transform_dialect/cuda/reduction_eltwise.mlir
deleted file mode 100644
index 0bd49b5d4298..000000000000
--- a/tests/transform_dialect/cuda/reduction_eltwise.mlir
+++ /dev/null
@@ -1,45 +0,0 @@
-!in_tensor_t = tensor<8x64xf32>
-!out_tensor_t = tensor<8xf32>
-
-func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
-  %cst = arith.constant -0.000000e+00 : f32
-
-  %0 = tensor.empty() : !out_tensor_t
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) -> !out_tensor_t
-  %5 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0)>],
-    iterator_types = ["parallel", "reduction"]}
-    ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) {
-      ^bb0(%arg3: f32, %arg4: f32):
-        %4 = arith.addf %arg3, %arg4 : f32
-        linalg.yield %4 : f32
-      } -> !out_tensor_t
-
-  %6 = tensor.empty() : !out_tensor_t
-  %7 = linalg.generic {
-    indexing_maps = [affine_map<(d0) -> (d0)>,
-                     affine_map<(d0) -> (d0)>],
-    iterator_types = ["parallel"]}
-    ins(%5 : !out_tensor_t) outs(%6 : !out_tensor_t) {
-    ^bb0(%arg3: f32, %arg4: f32):
-      %4 = math.sqrt %arg3 : f32
-      linalg.yield %4 : f32
-    } -> !out_tensor_t
-  return %7 : !out_tensor_t
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_eltwise_codegen_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-/// Note: the current --iree-codegen-llvmgpu-enable-transform-dialect-jit
-/// only works for exactly this reduction atm.
-// RUN: iree-compile %s --iree-hal-target-backends=cuda | \
-// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 8xf32=8 8 8 8 8 8 8 8
diff --git a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
deleted file mode 100644
index 6422201305af..000000000000
--- a/tests/transform_dialect/cuda/reduction_eltwise_codegen_spec.mlir
+++ /dev/null
@@ -1,164 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @codegen(
-      %variant_op: !transform.any_op {transform.consumed}) {
-    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-
-    // Step 1. Split the reduction to get meatier (size(red) / 2)-way parallelism.
-    // ===========================================================================
-    %0 = transform.structured.match ops{["linalg.generic"]} in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-    %reduction, %eltwise = transform.split_handle %0
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %init_or_alloc_op, %more_parallel_fill_op, %more_parallel_op, %combiner_op =
-      transform.structured.split_reduction %reduction
-        { split_factor = 2, insert_split_dimension = 1 }
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Canonicalizations.
-    %func_op = transform.structured.match ops{["func.func"]} in %variant_op
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op : !transform.any_op
-    transform.apply_cse to %func_op : !transform.any_op
-
-    // Step 2. First level of tiling + fusion parallelizes to blocks. Tile the
-    // trailing elementwise the same way we want to tile the reduction.
-    // ===========================================================================
-    %eltwise_grid_op, %grid_loop = transform.structured.tile_using_forall %eltwise
-      tile_sizes [1] (mapping = [#gpu.block<x>])
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %grid_loop : (!transform.any_op) -> ()
-    %not_eltwise = transform.merge_handles %fill, %more_parallel_fill_op, %more_parallel_op, %combiner_op
-      : !transform.any_op
-    transform.structured.fuse_into_containing_op %not_eltwise into %grid_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Canonicalizations.
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op : !transform.any_op
-    transform.apply_cse to %func_op : !transform.any_op
-
-    // Step 3. Second level of tiling + fusion parallelizes to threads.
-    // ===========================================================================
-    %fill_1d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1xf32> in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-    %eltwise_block_op, %eltwise_block_loop =
-      transform.structured.tile_using_forall %eltwise_grid_op tile_sizes [1]
-      ( mapping = [#gpu.thread<z>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    %block_combiner_op = transform.structured.match ops{["linalg.generic"]}
-      attributes {iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
-        : (!transform.any_op) -> !transform.any_op
-    %combined_and_fill = transform.merge_handles %fill_1d, %block_combiner_op : !transform.any_op
-    transform.structured.fuse_into_containing_op %combined_and_fill into %eltwise_block_loop : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Canonicalizations.
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op : !transform.any_op
-    transform.apply_cse to %func_op : !transform.any_op
-
-    %fill_2d = transform.structured.match ops{["linalg.fill"]} filter_result_type = tensor<1x2xf32> in %variant_op
-      : (!transform.any_op) -> !transform.any_op
-    %grid_more_parallel_op = transform.structured.match ops{["linalg.generic"]}
-      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op
-        : (!transform.any_op) -> !transform.any_op
-    %block_more_parallel_op, %forall_block_more_parallel_op =
-      transform.structured.tile_using_forall %grid_more_parallel_op tile_sizes [1, 1]
-      ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %fill_2d into %forall_block_more_parallel_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Canonicalizations.
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op : !transform.any_op
-    transform.apply_cse to %func_op : !transform.any_op
-
-    // Step 4. Rank-reduce and vectorize.
-    // ===========================================================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
-    // Step 5. Bufferize and drop HAL decriptor from memref ops.
-    // ===========================================================================
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.tensor.reassociative_reshape_folding
-    } : !transform.any_op
-    transform.iree.eliminate_empty_tensors %variant_op: (!transform.any_op) -> ()
-    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3
-      : (!transform.any_op) -> !transform.any_op
-
-    // Step 6. Post-bufferization mapping to blocks and threads.
-    // ===========================================================================
-    %func_5 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.iree.forall_to_workgroup %func_5 : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %func_5
-        workgroup_dims = [32, 2, 1] : (!transform.any_op) -> ()
-
-    // Step 7. Post-bufferization vector distribution with rank-reduction.
-    // ===========================================================================
-    transform.apply_patterns to %func_5 {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.memref.fold_memref_alias_ops
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
-      : (!transform.any_op) -> !transform.any_op
-    // Don't complain about unsupported if (threadIdx.x == 0 && threadIdx.y == 0)
-    // at this point.
-    transform.sequence %variant_op_3 : !transform.any_op failures(suppress) {
-    ^bb0(%arg0: !transform.any_op):
-      transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-      : (!transform.any_op) -> !transform.any_op
-    }
-    transform.iree.vector.warp_distribute %func_5 : (!transform.any_op) -> ()
-
-
-    // Late canonicalizations.
-    %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func_op_3 {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op_3 : !transform.any_op
-    transform.apply_cse to %func_op_3 : !transform.any_op
-
-    // Annotate the exported function as already translated.
-    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
-    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
-
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cuda/reduction_v2.mlir b/tests/transform_dialect/cuda/reduction_v2.mlir
deleted file mode 100644
index 6d367d218397..000000000000
--- a/tests/transform_dialect/cuda/reduction_v2.mlir
+++ /dev/null
@@ -1,33 +0,0 @@
-!in_tensor_t = tensor<33x1024xf32>
-!out_tensor_t = tensor<33xf32>
-
-func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
-  %cst = arith.constant -0.000000e+00 : f32
-
-  %0 = tensor.empty() : !out_tensor_t
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) ->   !out_tensor_t
-  %2 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0)>],
-    iterator_types = ["parallel", "reduction"]}
-    ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) {
-      ^bb0(%arg3: f32, %arg4: f32):
-        %3 = arith.addf %arg3, %arg4 : f32
-        linalg.yield %3 : f32
-      } -> !out_tensor_t
-  return %2 : !out_tensor_t
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda | \
-// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x1024xf32=1" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-// only checking the first 6 of 33
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 33xf32=1024 1024 1024 1024 1024 1024
diff --git a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
deleted file mode 100644
index bb9ecbfc6429..000000000000
--- a/tests/transform_dialect/cuda/reduction_v2_codegen_spec.mlir
+++ /dev/null
@@ -1,113 +0,0 @@
-// RUN: iree-opt %s
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @codegen(
-      %variant_op: !transform.any_op {transform.consumed}) {
-
-    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %reduction = transform.structured.match ops{["linalg.generic"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 1. First level of tiling + fusion parallelizes to blocks.
-    // ===========================================================================
-    %grid_reduction, %forall_grid =
-      transform.structured.tile_using_forall %reduction tile_sizes [1]
-        ( mapping = [#gpu.block<x>] )
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall_grid : (!transform.any_op) -> ()
-    transform.structured.fuse_into_containing_op %fill into %forall_grid : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 2. Split the reduction to get meatier parallelism.
-    // ===========================================================================
-    %block_more_parallel_fill_op_2, %block_more_parallel_op_2, %block_combiner_op_2, %forall =
-      transform.structured.tile_reduction_using_for %grid_reduction by tile_sizes = [0, 128]
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
-    %_1:2 =
-      transform.structured.tile_using_forall %block_more_parallel_op_2 num_threads [0, 32]
-      ( mapping = [#gpu.thread<x>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 3. Second level of tiling parallelizes to threads.
-    // ===========================================================================
-    // 1st op is [parallel, parallel], map it to threadIdx.x by 4.
-    %_2:2 =
-      transform.structured.tile_using_forall %block_more_parallel_fill_op_2 tile_sizes [0, 4]
-      ( mapping = [#gpu.thread<x>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    // 2nd op is [parallel, reduction] of 1x128, map the 1-dim to threadIdx.y to
-    // trigger mapping of the reduction to threadIdx.x via predication via `if (x==0)`.
-    %_3:2 =
-      transform.structured.tile_using_forall %block_combiner_op_2 tile_sizes [1]
-      ( mapping = [#gpu.thread<y>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 4. Rank-reduce and vectorize.
-    // ===========================================================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %func_3 = transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
-    // Step 5. Bufferize and drop HAL decriptor from memref ops.
-    // ===========================================================================
-    // Canonicalization/CSE is needed before bufferization otherwise unnecessary
-    // allocs will be created.
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-    } : !transform.any_op
-    transform.apply_patterns to %func_3 {
-      transform.apply_patterns.tensor.reassociative_reshape_folding
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.apply_cse to %func_3 : !transform.any_op
-    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-    %func_5 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func_5 {
-      transform.apply_patterns.linalg.erase_unnecessary_inputs
-    } : !transform.any_op
-    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> (!transform.any_op)
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
-    // Step 6. Post-bufferization mapping to blocks and threads.
-    // ===========================================================================
-    %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %func_7
-        workgroup_dims = [32, 1, 1] : (!transform.any_op) -> ()
-
-    // Step 7. Post-bufferization vector distribution with rank-reduction.
-    // ===========================================================================
-    transform.apply_patterns to %func_7 {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.memref.fold_memref_alias_ops
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
-      : (!transform.any_op) -> !transform.any_op
-    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
-    transform.iree.vector.warp_distribute %func_7
-      : (!transform.any_op) -> ()
-
-    // Late canonicalizations to cleanup and pass the checks
-    transform.apply_patterns to %func_7 {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_7 : !transform.any_op
-    transform.apply_cse to %func_7 : !transform.any_op
-
-    // Annotate the exported function as already translated.
-    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
-    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
-
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir b/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
deleted file mode 100644
index 66b88cdf2232..000000000000
--- a/tests/transform_dialect/cuda/reduction_v2_uneven.mlir
+++ /dev/null
@@ -1,29 +0,0 @@
-!in_tensor_t = tensor<33x34567xf32>
-!out_tensor_t = tensor<33xf32>
-
-func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
-  %cst = arith.constant -0.000000e+00 : f32
-
-  %0 = tensor.empty() : !out_tensor_t
-  %1 = linalg.fill ins(%cst : f32) outs(%0 : !out_tensor_t) ->   !out_tensor_t
-  %2 = linalg.generic {
-    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                     affine_map<(d0, d1) -> (d0)>],
-    iterator_types = ["parallel", "reduction"]}
-    ins(%arg : !in_tensor_t) outs(%1 : !out_tensor_t) {
-      ^bb0(%arg3: f32, %arg4: f32):
-        %3 = arith.addf %arg3, %arg4 : f32
-        linalg.yield %3 : f32
-      } -> !out_tensor_t
-  return %2 : !out_tensor_t
-}
-
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/reduction_v2_codegen_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=reduce --device=cuda --input="33x34567xf32=1" |\
-// RUN: FileCheck %s --check-prefix=EXEC
-
-// only checking the first 6 of 33
-//      EXEC: result[0]: hal.buffer_view
-// EXEC-NEXT: 33xf32=34567 34567 34567 34567 34567 34567
diff --git a/tests/transform_dialect/cuda/softmax.mlir b/tests/transform_dialect/cuda/softmax.mlir
deleted file mode 100644
index 91bbc07b7a28..000000000000
--- a/tests/transform_dialect/cuda/softmax.mlir
+++ /dev/null
@@ -1,84 +0,0 @@
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-flow-dispatch-use-transform-dialect=%p/softmax_dispatch_spec.mlir \
-// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_codegen_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=softmax --device=cuda | \
-// RUN: FileCheck %s
-
-
-!tmp_tensor_t = tensor<16x128xf32>
-!in_tensor_t = tensor<16x128x128xf32>
-!out_tensor_t = tensor<16x128x128xf32>
-
-// Execution only checks that @softmax runs.
-//      CHECK: EXEC @softmax
-//      CHECK: 16x128x128xf32=[
-// CHECK-SAME:                [0.0078125 0.0078125 0.0078125 0.0078125
-
-func.func @softmax() -> !out_tensor_t {
-  %cst_0 = arith.constant 0.0 : f32
-  %cst_1 = arith.constant 1.0 : f32
-  %cst_min = arith.constant -3.40282347E+38 : f32
-  %input = arith.constant dense<5.000000e+00> : !out_tensor_t
-  util.optimization_barrier %input : !in_tensor_t
-
-  %input_max_empty = tensor.empty() : !tmp_tensor_t
-  %input_max_filled = linalg.fill ins(%cst_min : f32)
-    outs(%input_max_empty : !tmp_tensor_t) -> !tmp_tensor_t
-  %input_max = linalg.generic
-    {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                      affine_map<(d0, d1, d2) -> (d0, d1)>],
-                      iterator_types = ["parallel", "parallel", "reduction"]}
-     ins(%input : !in_tensor_t)
-    outs(%input_max_filled : !tmp_tensor_t) {
-      ^bb0(%arg0: f32, %arg1: f32):
-        %max = arith.maximumf %arg0, %arg1 : f32
-        linalg.yield %max : f32
-      } -> !tmp_tensor_t
-
-  // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm.
-  %exps_empty = tensor.empty() : !out_tensor_t
-  %exps_sum_empty = tensor.empty() : !tmp_tensor_t
-  %exps_sum_filled = linalg.fill ins(%cst_0 : f32)
-    outs(%exps_sum_empty : !tmp_tensor_t) -> !tmp_tensor_t
-  %exps = linalg.generic
-    {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                      affine_map<(d0, d1, d2) -> (d0, d1)>,
-                      affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-                      iterator_types = ["parallel", "parallel", "parallel"]}
-     ins(%input, %input_max : !in_tensor_t, !tmp_tensor_t)
-    outs(%exps_empty : !out_tensor_t) {
-      ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-        %sub = arith.subf %arg0, %arg1 : f32
-        %exp = math.exp %sub : f32
-        linalg.yield %exp: f32
-      } -> (!out_tensor_t)
-
-  %exps_sum = linalg.generic
-    {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                      affine_map<(d0, d1, d2) -> (d0, d1)>],
-                      iterator_types = ["parallel", "parallel", "reduction"]}
-     ins(%exps : !out_tensor_t)
-    outs(%exps_sum_filled : !tmp_tensor_t) {
-      ^bb0(%exp: f32, %acc: f32):
-        %add = arith.addf %exp, %acc : f32
-        linalg.yield %add : f32
-      } -> (!tmp_tensor_t)
-
-  %res_empty = tensor.empty() : !out_tensor_t
-  %res = linalg.generic
-    {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                      affine_map<(d0, d1, d2) -> (d0, d1)>,
-                      affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-                      iterator_types = ["parallel", "parallel", "parallel"]}
-     ins(%exps, %exps_sum : !out_tensor_t, !tmp_tensor_t)
-    outs(%res_empty : !out_tensor_t) {
-      ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-        // %10 = arith.divf %cst_1, %arg1 : f32
-        // %11 = arith.mulf %arg0, %10 : f32
-        %div = arith.divf %arg0, %arg1 : f32
-        linalg.yield %div : f32
-      } -> !out_tensor_t
-
-  return %res: !out_tensor_t
-}
diff --git a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
deleted file mode 100644
index 86f49091536d..000000000000
--- a/tests/transform_dialect/cuda/softmax_codegen_spec.mlir
+++ /dev/null
@@ -1,119 +0,0 @@
-// RUN: iree-opt %s
-
-// Codegen
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @codegen(
-      %variant_op: !transform.any_op {transform.consumed}) {
-
-    %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-      in %variant_op : (!transform.any_op) -> !transform.any_op
-    %input_max_fill,
-    %input_max,
-    %exps_sum_fill,
-    %exps,
-    %exps_sum,
-    %div = transform.split_handle %ops
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                            !transform.any_op, !transform.any_op, !transform.any_op)
-
-    // Step 1. First level of tiling + fusion parallelizes to blocks.
-    // ==============================================================
-    %_, %forall =
-      transform.structured.tile_using_forall %div tile_sizes [1, 4]
-        ( mapping = [#gpu.block<x>, #gpu.block<y>] )
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
-
-    // TODO: Merging and fusing merged handles does not work properly atm.
-    transform.structured.fuse_into_containing_op %exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %exps into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    // By default, fusion into scf.forall does not promote captured values
-    // to shared as this involves a cross-thread dependence analysis.
-    // Instead, we activate it explicitly post-hoc to promote all the extract_slice
-    // ops that we find and match the prerequisites
-    %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
-    transform.iree.share_forall_operands %forall_with_type
-      : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
-    transform.apply_patterns to %variant_op {
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.apply_cse to %variant_op : !transform.any_op
-
-    // Step 2. Second level of tiling + fusion parallelizes to threads.
-    // ================================================================
-    %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-      in %variant_op : (!transform.any_op) -> !transform.any_op
-    %tiled_input_max_fill,
-    %tiled_input_max,
-    %tiled_exps_sum_fill,
-    %tiled_exp_and_exps_sum,
-    %tiled_exp_and_exps_sum_2,
-    %tiled_div = transform.split_handle %tiled_ops
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                            !transform.any_op, !transform.any_op, !transform.any_op)
-    // Leaving the reduction untiled on threadIdx.x makes it sequential on
-    // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
-    // introduced and opportunities for distributing vector ops across warps
-    // appear.
-    %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
-                                                    %tiled_exp_and_exps_sum,
-                                                    %tiled_exp_and_exps_sum_2
-      : !transform.any_op
-    transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
-      ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    // Fully parallel ops are tiled and mapped.
-    %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
-                                                  %tiled_exps_sum_fill,
-                                                  %tiled_div
-      : !transform.any_op
-    transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
-        ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 3. Rank-reduce and vectorize.
-    // ==================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
-    // Step 4. Bufferize and drop HAL decriptor from memref ops.
-    // =========================================================
-    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
-    // Step 5. Post-bufferization mapping to blocks and threads.
-    // =========================================================
-    %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
-    // Step 6. Post-bufferization vector distribution with rank-reduction.
-    // ===================================================================
-    %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %end_func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.memref.fold_memref_alias_ops
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 } : (!transform.any_op) -> !transform.any_op
-    transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
-
-    // Annotate the exported function as already translated.
-    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
-    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
-
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir b/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
deleted file mode 100644
index 09c2c07895ee..000000000000
--- a/tests/transform_dialect/cuda/softmax_dispatch_spec.mlir
+++ /dev/null
@@ -1,28 +0,0 @@
-// RUN: iree-opt %s
-
-// Dispatch softmax.
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%func: !transform.any_op) {
-    %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-      in %func : (!transform.any_op) -> !transform.any_op
-
-    %input_max_fill, %input_max, %exps_sum_fill, %exps, %exps_sum, %div =
-      transform.split_handle %ops
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                               !transform.any_op, !transform.any_op, !transform.any_op)
-
-    /// This must be used with the custom dispatch region formation
-    /// because IREE's does not fuse the 6 ops softmax version even with
-    /// --iree-flow-fuse-multi-use.
-    %region_op = transform.iree.wrap_in_dispatch_region %div { generateWorkload = false } : (!transform.any_op) -> !transform.any_op
-
-    %non_div = transform.merge_handles %input_max_fill, %input_max, %exps_sum_fill, %exps, %exps_sum
-      : !transform.any_op
-    %region_op_2 = transform.iree.move_preceding_op_into_dispatch_region %non_div into %region_op : (!transform.any_op, !transform.any_op) -> !transform.any_op
-
-    %empty = transform.structured.match ops{["tensor.empty"]} in %func : (!transform.any_op) -> !transform.any_op
-    %region_op_3 = transform.iree.move_preceding_op_into_dispatch_region %empty into %region_op_2 : (!transform.any_op, !transform.any_op) -> !transform.any_op
-    transform.iree.region_to_workgroups %region_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
diff --git a/tests/transform_dialect/cuda/softmax_partial.mlir b/tests/transform_dialect/cuda/softmax_partial.mlir
deleted file mode 100644
index 018ad8c42ac8..000000000000
--- a/tests/transform_dialect/cuda/softmax_partial.mlir
+++ /dev/null
@@ -1,46 +0,0 @@
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_partial_codegen_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=softmax_partial --device=cuda | \
-// RUN: FileCheck %s
-
-!tmp_tensor_t = tensor<16x128xf32>
-!out_tensor_t = tensor<16x128x128xf32>
-
-// Execution only checks that @softmax_partial runs.
-//      CHECK: EXEC @softmax_partial
-//      CHECK: 16x128x128xf32=[
-// CHECK-SAME:                [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
-
-func.func @softmax_partial() -> !out_tensor_t {
-  %cst = arith.constant -3.40282347E+38 : f32
-  %cst_0 = arith.constant dense<1121212.000000e+00> : !out_tensor_t
-  %cst_1 = arith.constant dense<5.000000e+00> : !out_tensor_t
-  %0 = util.optimization_barrier %cst_1 : !out_tensor_t
-
-  %1 = tensor.empty() : !tmp_tensor_t
-  %2 = linalg.fill ins(%cst : f32) outs(%1 : !tmp_tensor_t) -> !tmp_tensor_t
-  %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                                        affine_map<(d0, d1, d2) -> (d0, d1)>],
-                       iterator_types = ["parallel", "parallel", "reduction"]}
-  ins(%0 : !out_tensor_t) outs(%2 : !tmp_tensor_t) {
-  ^bb0(%arg0: f32, %arg1: f32):
-    %8 = arith.maximumf %arg0, %arg1 : f32
-    linalg.yield %8 : f32
-  } -> !tmp_tensor_t
-
-  // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm.
-  %4 = tensor.empty() : !out_tensor_t
-  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                                        affine_map<(d0, d1, d2) -> (d0, d1)>,
-                                        affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-                       iterator_types = ["parallel", "parallel", "parallel"]}
-  ins(%0, %3 : !out_tensor_t, !tmp_tensor_t) outs(%4 : !out_tensor_t) {
-  ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-    %6 = arith.subf %arg0, %arg1 : f32
-    %7 = math.exp %6 : f32
-    linalg.yield %7 : f32
-  } -> !out_tensor_t
-
-  return %5: !out_tensor_t
-}
diff --git a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
deleted file mode 100644
index 65ea847af92c..000000000000
--- a/tests/transform_dialect/cuda/softmax_partial_codegen_spec.mlir
+++ /dev/null
@@ -1,102 +0,0 @@
-// RUN: iree-opt %s
-
-// Codegen
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @codegen(
-      %variant_op: !transform.any_op {transform.consumed}) {
-
-    // Step 1. First level of tiling + fusion parallelizes to blocks.
-    // ==============================================================
-    %root = transform.structured.match interface{LinalgOp}
-      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %fill = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %red = transform.structured.match interface{LinalgOp}
-      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %not_root = transform.merge_handles %fill, %red : !transform.any_op
-    %tiled_generic, %forall =
-      transform.structured.tile_using_forall %root tile_sizes [1, 4]
-      ( mapping = [#gpu.block<x>, #gpu.block<y>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
-    transform.structured.fuse_into_containing_op %not_root into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Step 2. Second level of tiling + fusion parallelizes to threads.
-    // ================================================================
-    %fill_linalg = transform.structured.match ops{["linalg.fill"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %reduction_linalg = transform.structured.match ops{["linalg.generic"]}
-      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %parallel_linalg = transform.structured.match ops{["linalg.generic"]}
-      attributes{iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    %tiled_reduction_generic, %forall_reduction =
-      transform.structured.tile_using_forall %reduction_linalg tile_sizes [1, 1]
-        ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    // TODO: this fusion currently does not happen properly, this is related to the clone
-    // behavior when fusing into scf.forall.
-    // Once fixed we'll be able to fuse.
-    // Fusion will save us one roundtrip to memory.
-    // transform.structured.fuse_into_containing_op %fill_linalg into %forall_reduction
-    transform.structured.tile_using_forall %parallel_linalg num_threads [1, 4, 32]
-        ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
-        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-
-    // Inability to tile reductions to scf.forall has 2 implications:
-    //   1. since no scf.forall is present, no gpu.barrier is added.
-    //      This should be fixed independently: ops that are not nested in an scf.forall
-    //      should have a gpu.barrier. Later needs to be complemented by a barrier
-    //      removal pass.
-    //   2. Similarly, needs to be predicated under an if threadIx == 0 to avoid
-    //      multiple threads updating the buffer inplace once bufferized.
-    //
-    // Instead, we can vectorize and go to vector SSA values that sidestep these
-    // issues.
-    // Everyone will race to the write while still computing the same value.
-    //
-    // That is still not good enough because we need to predicate this in order
-    // to enable the parallel reduction on warps.
-
-    // Step 3. Rank-reduce and vectorize.
-    // ==================================
-    %func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    transform.structured.vectorize_children_and_apply_patterns %func : (!transform.any_op) -> !transform.any_op
-
-    // Step 4. Bufferize and drop HAL decriptor from memref ops.
-    // =========================================================
-    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
-    // Step 5. Post-bufferization mapping to blocks and threads.
-    // =========================================================
-    %func_2 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.iree.forall_to_workgroup %func_2 : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %func_2 workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
-    // Step 6. Post-bufferization vector distribution with rank-reduction.
-    // ===================================================================
-    %end_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %end_func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.memref.fold_memref_alias_ops
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-      : (!transform.any_op) -> !transform.any_op
-    transform.iree.vector.warp_distribute %end_func : (!transform.any_op) -> ()
-
-    // Annotate the exported function as already translated.
-    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
-    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
-
-    transform.yield
-  }
-} // module
diff --git a/tests/transform_dialect/cuda/softmax_v2.mlir b/tests/transform_dialect/cuda/softmax_v2.mlir
deleted file mode 100644
index 5ef3a90347b9..000000000000
--- a/tests/transform_dialect/cuda/softmax_v2.mlir
+++ /dev/null
@@ -1,76 +0,0 @@
-// RUN: iree-compile %s --iree-hal-target-backends=cuda \
-// RUN:     --iree-flow-fuse-multi-use \
-// RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
-// RUN:     --iree-codegen-transform-dialect-library=%p/softmax_v2_codegen_spec.mlir@codegen | \
-// RUN: iree-run-module --module=- --function=softmax --device=cuda | \
-// RUN: FileCheck %s
-
-!tmp_tensor_t = tensor<16x128xf32>
-!in_tensor_t = tensor<16x128x128xf32>
-!out_tensor_t = tensor<16x128x128xf32>
-
-// Execution only checks that @softmax runs.
-//      CHECK: EXEC @softmax
-//      CHECK: 16x128x128xf32=[
-// CHECK-SAME:                [0.0078125 0.0078125 0.0078125 0.0078125
-
-func.func @softmax() -> !out_tensor_t {
-  %cst_0 = arith.constant 0.0 : f32
-  %cst_1 = arith.constant 1.0 : f32
-  %cst_min = arith.constant -3.40282347E+38 : f32
-  %input = arith.constant dense<5.000000e+00> : !out_tensor_t
-  util.optimization_barrier %input : !in_tensor_t
-
-  %softmax = flow.dispatch.region[] -> (!out_tensor_t) {
-    %input_max_empty = tensor.empty() : !tmp_tensor_t
-    %input_max_filled = linalg.fill ins(%cst_min : f32)
-      outs(%input_max_empty : !tmp_tensor_t) -> !tmp_tensor_t
-    %input_max = linalg.generic
-      {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                        affine_map<(d0, d1, d2) -> (d0, d1)>],
-                        iterator_types = ["parallel", "parallel", "reduction"]}
-     ins(%input : !in_tensor_t)
-    outs(%input_max_filled : !tmp_tensor_t) {
-      ^bb0(%arg0: f32, %arg1: f32):
-        %max = arith.maximumf %arg0, %arg1 : f32
-        linalg.yield %max : f32
-      } -> !tmp_tensor_t
-
-    // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm.
-    %exps_empty = tensor.empty() : !out_tensor_t
-    %exps_sum_empty = tensor.empty() : !tmp_tensor_t
-    %exps_sum_filled = linalg.fill ins(%cst_0 : f32)
-      outs(%exps_sum_empty : !tmp_tensor_t) -> !tmp_tensor_t
-    %exps, %exps_sum = linalg.generic
-      {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                        affine_map<(d0, d1, d2) -> (d0, d1)>,
-                        affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                        affine_map<(d0, d1, d2) -> (d0, d1)>],
-                        iterator_types = ["parallel", "parallel", "reduction"]}
-       ins(%input, %input_max : !in_tensor_t, !tmp_tensor_t)
-      outs(%exps_empty, %exps_sum_filled : !out_tensor_t, !tmp_tensor_t) {
-        ^bb0(%arg0: f32, %arg1: f32, %arg2: f32, %arg3: f32):
-          %sub = arith.subf %arg0, %arg1 : f32
-          %exp = math.exp %sub : f32
-          %add = arith.addf %exp, %arg3 : f32
-          linalg.yield %exp, %add : f32, f32
-        } -> (!out_tensor_t, !tmp_tensor_t)
-
-    %res_empty = tensor.empty() : !out_tensor_t
-    %res = linalg.generic
-      {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
-                        affine_map<(d0, d1, d2) -> (d0, d1)>,
-                        affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
-                        iterator_types = ["parallel", "parallel", "parallel"]}
-       ins(%exps, %exps_sum : !out_tensor_t, !tmp_tensor_t)
-      outs(%res_empty : !out_tensor_t) {
-        ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
-          // %10 = arith.divf %cst_1, %arg1 : f32
-          // %11 = arith.mulf %arg0, %10 : f32
-          %div = arith.divf %arg0, %arg1 : f32
-          linalg.yield %div : f32
-        } -> !out_tensor_t
-      flow.return %res : !out_tensor_t
-  }
-  return %softmax: !out_tensor_t
-}
diff --git a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir b/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
deleted file mode 100644
index c73cbe9e18d2..000000000000
--- a/tests/transform_dialect/cuda/softmax_v2_codegen_spec.mlir
+++ /dev/null
@@ -1,148 +0,0 @@
-// RUN: iree-opt %s
-
-// Codegen
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @codegen(
-      %variant_op: !transform.any_op {transform.consumed}) {
-
-    %ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-      in %variant_op : (!transform.any_op) -> !transform.any_op
-    %input_max_fill,
-    %input_max,
-    %exps_sum_fill,
-    %exp_and_exps_sum,
-    %div = transform.split_handle %ops
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                            !transform.any_op, !transform.any_op)
-
-    // Step 1. First level of tiling + fusion parallelizes to blocks.
-    // ==============================================================
-    %_, %forall =
-    transform.structured.tile_using_forall %div tile_sizes [1, 4]
-      ( mapping = [#gpu.block<x>, #gpu.block<y>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.iree.populate_workgroup_count_region_using_num_threads_slice %forall : (!transform.any_op) -> ()
-
-    // TODO: Merging and fusing merged handles does not work properly atm.
-    transform.structured.fuse_into_containing_op %exp_and_exps_sum into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %exps_sum_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %input_max into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    transform.structured.fuse_into_containing_op %input_max_fill into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
-    // By default, fusion into scf.forall does not promote captured values
-    // to shared as this involves a cross-thread dependence analysis.
-    // Instead, we activate it explicitly post-hoc to promote all the extract_slice
-    // ops that we find and match the prerequisites
-    %forall_with_type = transform.cast %forall : !transform.any_op to !transform.op<"scf.forall">
-    transform.iree.share_forall_operands %forall_with_type
-      : (!transform.op<"scf.forall">) -> !transform.op<"scf.forall">
-
-    // Canonicalizations.
-    %func_op = transform.structured.match ops{["func.func"]} in %variant_op
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op : !transform.any_op
-    transform.apply_cse to %func_op : !transform.any_op
-
-
-    // Step 2. Second level of tiling + fusion parallelizes to threads.
-    // ================================================================
-    %tiled_ops = transform.structured.match ops{["linalg.fill", "linalg.generic"]}
-      in %variant_op : (!transform.any_op) -> !transform.any_op
-    %tiled_input_max_fill,
-    %tiled_input_max,
-    %tiled_exps_sum_fill,
-    %tiled_exp_and_exps_sum,
-    %tiled_div = transform.split_handle %tiled_ops
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op,
-                            !transform.any_op, !transform.any_op)
-    // Leaving the reduction untiled on threadIdx.x makes it sequential on
-    // threadIdx.x. After distribution, predication by `if (threadIdx.x == 0)` is
-    // introduced and opportunities for distributing vector ops across warps
-    // appear.
-    %reduction_linalg_ops = transform.merge_handles %tiled_input_max,
-                                                    %tiled_exp_and_exps_sum
-      : !transform.any_op
-    transform.structured.tile_using_forall %reduction_linalg_ops tile_sizes [1, 1]
-      ( mapping = [#gpu.thread<z>, #gpu.thread<y>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-    // Fully parallel ops are tiled and mapped.
-    %parallel_linalg_ops = transform.merge_handles %tiled_input_max_fill,
-                                                  %tiled_exps_sum_fill,
-                                                  %tiled_div
-      : !transform.any_op
-    transform.structured.tile_using_forall %parallel_linalg_ops num_threads [1, 4, 32]
-      ( mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] )
-      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-
-    // Canonicalizations.
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op : !transform.any_op
-    transform.apply_cse to %func_op : !transform.any_op
-
-    // Step 3. Rank-reduce and vectorize.
-    // ==================================
-    transform.apply_patterns to %func_op {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    transform.structured.vectorize_children_and_apply_patterns %func_op : (!transform.any_op) -> !transform.any_op
-
-    // Step 4. Bufferize and drop HAL decriptor from memref ops.
-    // =========================================================
-    transform.iree.eliminate_empty_tensors %variant_op : (!transform.any_op) -> ()
-    %variant_op_3 = transform.iree.bufferize { target_gpu } %variant_op : (!transform.any_op) -> !transform.any_op
-    %memref_func = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-
-    // Step 5. Post-bufferization mapping to blocks and threads.
-    // =========================================================
-    transform.iree.forall_to_workgroup %memref_func : (!transform.any_op) -> ()
-    transform.iree.map_nested_forall_to_gpu_threads %memref_func
-      workgroup_dims = [32, 4, 1] : (!transform.any_op) -> ()
-
-    // Step 6. Post-bufferization vector distribution with rank-reduction.
-    // ===================================================================
-    transform.apply_patterns to %memref_func {
-      transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
-      transform.apply_patterns.linalg.fold_unit_extent_dims_via_slices
-      transform.apply_patterns.memref.fold_memref_alias_ops
-      transform.apply_patterns.vector.cast_away_vector_leading_one_dim
-    } : !transform.any_op
-    %if_op = transform.structured.match ops{["scf.if"]} in %variant_op_3
-      : (!transform.any_op) -> !transform.any_op
-    %warp = transform.iree.vector.to_warp_execute_on_lane_0 %if_op { warp_size = 32 }
-      : (!transform.any_op) -> !transform.any_op
-    transform.iree.vector.warp_distribute %memref_func
-      : (!transform.any_op) -> ()
-
-
-    // Late canonicalizations.
-    %func_op_3 = transform.structured.match ops{["func.func"]} in %variant_op_3
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func_op_3 {
-      transform.apply_patterns.iree.fold_fill_into_pad
-      transform.apply_patterns.linalg.tiling_canonicalization
-      transform.apply_patterns.scf.for_loop_canonicalization
-      transform.apply_patterns.canonicalization
-    } : !transform.any_op
-    transform.iree.apply_licm %func_op_3 : !transform.any_op
-    transform.apply_cse to %func_op_3 : !transform.any_op
-
-    // Annotate the exported function as already translated.
-    %exports = transform.structured.match ops{["hal.executable.export"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
-    %none = transform.param.constant #iree_codegen.translation_info<None> -> !transform.any_param
-    transform.annotate %exports "translation_info" = %none : !transform.any_op, !transform.any_param
-
-    transform.yield
-  }
-} // module