From 9dc81605c2ab7a0b82d7b60379e8af519fcb4770 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Tue, 25 Jan 2022 14:39:00 -0500
Subject: [PATCH] Explicitly list the tool used for running benchmarks (#8179)

This is a preliminary step toward using other benchmark tools
to report other kinds of benchmark results, such as statistics.
---
 benchmarks/TFLite/CMakeLists.txt              |  40 ++++++
 .../benchmarks/run_benchmarks_on_android.py   | 125 +++++++++++-------
 .../cmake/android/arm64-v8a/benchmark2.yml    |   4 +-
 build_tools/cmake/iree_benchmark_suite.cmake  |  39 ++++--
 4 files changed, 147 insertions(+), 61 deletions(-)

diff --git a/benchmarks/TFLite/CMakeLists.txt b/benchmarks/TFLite/CMakeLists.txt
index 73133521c712..4c2f83acbf3f 100644
--- a/benchmarks/TFLite/CMakeLists.txt
+++ b/benchmarks/TFLite/CMakeLists.txt
@@ -163,6 +163,8 @@ iree_benchmark_suite(
     "CPU-ARM64-v8A"
   TRANSLATION_FLAGS
     ${ANDROID_CPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "dylib-sync"
 )
@@ -186,6 +188,8 @@ iree_benchmark_suite(
     "CPU-ARM64-v8A"
   TRANSLATION_FLAGS
     ${ANDROID_CPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "dylib"
   RUNTIME_FLAGS
@@ -212,6 +216,8 @@ iree_benchmark_suite(
 #     "CPU-ARM64-v8A"
 #   TRANSLATION_FLAGS
 #     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#   BENCHMARK_TOOL
+#     iree-benchmark-module
 #   DRIVER
 #     "dylib"
 #   RUNTIME_FLAGS
@@ -236,6 +242,8 @@ iree_benchmark_suite(
 #     "CPU-ARM64-v8A"
 #   TRANSLATION_FLAGS
 #     ${ANDROID_CPU_TRANSLATION_FLAGS}
+#   BENCHMARK_TOOL
+#     iree-benchmark-module
 #   DRIVER
 #     "dylib"
 #   RUNTIME_FLAGS
@@ -260,6 +268,8 @@ iree_benchmark_suite(
     "CPU-ARM64-v8A"
   TRANSLATION_FLAGS
     ${ANDROID_CPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "dylib"
   RUNTIME_FLAGS
@@ -284,6 +294,8 @@ iree_benchmark_suite(
     "GPU-Adreno"
   TRANSLATION_FLAGS
     ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vulkan"
 )
@@ -306,6 +318,8 @@ iree_benchmark_suite(
     "GPU-Mali-Valhall"
   TRANSLATION_FLAGS
     ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vulkan"
 )
@@ -326,6 +340,8 @@ iree_benchmark_suite(
     # This isn't a special optimization flag. It's so we can reuse the same f32
     # model file. See comments on MOBILEBERT_FP16_MODULE
     "--iree-flow-demote-f32-to-f16"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vulkan"
 )
@@ -366,6 +382,8 @@ iree_benchmark_suite(
   TRANSLATION_FLAGS
     ${ANDROID_CPU_TRANSLATION_FLAGS}
     "--iree-llvm-loop-unrolling=true"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "dylib-sync"
 )
@@ -393,6 +411,8 @@ iree_benchmark_suite(
   TRANSLATION_FLAGS
     ${ANDROID_CPU_TRANSLATION_FLAGS}
     "--iree-llvm-loop-unrolling=true"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "dylib"
   RUNTIME_FLAGS
@@ -420,6 +440,8 @@ iree_benchmark_suite(
 #   TRANSLATION_FLAGS
 #     ${ANDROID_CPU_TRANSLATION_FLAGS}
 #     "--iree-llvm-loop-unrolling=true"
+#   BENCHMARK_TOOL
+#     iree-benchmark-module
 #   DRIVER
 #     "dylib"
 #   RUNTIME_FLAGS
@@ -445,6 +467,8 @@ iree_benchmark_suite(
 #   TRANSLATION_FLAGS
 #     ${ANDROID_CPU_TRANSLATION_FLAGS}
 #     "--iree-llvm-loop-unrolling=true"
+#   BENCHMARK_TOOL
+#     iree-benchmark-module
 #   DRIVER
 #     "dylib"
 #   RUNTIME_FLAGS
@@ -470,6 +494,8 @@ iree_benchmark_suite(
   TRANSLATION_FLAGS
     ${ANDROID_CPU_TRANSLATION_FLAGS}
     "--iree-llvm-loop-unrolling=true"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "dylib"
   RUNTIME_FLAGS
@@ -495,6 +521,8 @@ iree_benchmark_suite(
     "CPU-ARM64-v8A"
   TRANSLATION_FLAGS
     "--iree-input-type=tosa"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vmvx"
   RUNTIME_FLAGS
@@ -521,6 +549,8 @@ iree_benchmark_suite(
   TRANSLATION_FLAGS
     ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
     "--iree-enable-fusion-with-reduction-ops"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vulkan"
 )
@@ -544,6 +574,8 @@ iree_benchmark_suite(
   TRANSLATION_FLAGS
     ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
     "--iree-enable-fusion-with-reduction-ops"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vulkan"
 )
@@ -563,6 +595,8 @@ iree_benchmark_suite(
     "--iree-flow-demote-f32-to-f16"
     "--iree-vulkan-target-triple=valhall-unknown-android11"
     "--iree-enable-fusion-with-reduction-ops"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vulkan"
 )
@@ -599,6 +633,8 @@ iree_benchmark_suite(
     ${ANDROID_ADRENO_GPU_TRANSLATION_FLAGS}
     "--iree-enable-fusion-with-reduction-ops"
     "--iree-hal-benchmark-dispatch-repeat-count=16"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vulkan"
   RUNTIME_FLAGS
@@ -625,6 +661,8 @@ iree_benchmark_suite(
     ${ANDROID_MALI_GPU_TRANSLATION_FLAGS}
     "--iree-enable-fusion-with-reduction-ops"
     "--iree-hal-benchmark-dispatch-repeat-count=32"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vulkan"
   RUNTIME_FLAGS
@@ -647,6 +685,8 @@ iree_benchmark_suite(
     "--iree-vulkan-target-triple=valhall-unknown-android11"
     "--iree-enable-fusion-with-reduction-ops"
     "--iree-hal-benchmark-dispatch-repeat-count=32"
+  BENCHMARK_TOOL
+    iree-benchmark-module
   DRIVER
     "vulkan"
   RUNTIME_FLAGS
diff --git a/build_tools/benchmarks/run_benchmarks_on_android.py b/build_tools/benchmarks/run_benchmarks_on_android.py
index 54567b1337a3..cd4c6ad0b1aa 100755
--- a/build_tools/benchmarks/run_benchmarks_on_android.py
+++ b/build_tools/benchmarks/run_benchmarks_on_android.py
@@ -10,39 +10,41 @@
 to filter and run suitable benchmarks and optionally captures Tracy traces on
 the Android phone.
 
-It expects that `adb` is installed, and there is an `iree-benchmark-module`
-tool cross-compiled towards Android. If to capture traces, another
-tracing-enabled `iree-benchmark-module` and the Tracy `capture` tool should be
-cross-compiled towards Android.
+It expects that `adb` is installed, and that IREE tools are cross-compiled
+for Android. To capture traces, another set of tracing-enabled IREE tools
+and the Tracy `capture` tool should also be cross-compiled for Android.
 
 It also expects the benchmark artifacts are generated by building the
 `iree-benchmark-suites` target in the following directory structure:
 
 <root-build-dir>/benchmark_suites
-└── <benchmark-category> (e.g., TensorFlow)
+└── <benchmark-category> (e.g., TFLite)
     ├── <benchmark-suite> (e.g., MobileBertSquad-fp32)
     │   ├── <benchmark-case> (e.g., iree-vulkan__GPU-Mali-Valhall__kernel-execution)
+    │   │   ├── tool
     │   │   └── flagfile
     │   ├── ...
+    │   │   ├── tool
     │   │   └── flagfile
     │   └── <benchmark_case>
+    │       ├── tool
     │       └── flagfile
     └── vmfb
-        ├── compiled-<sha1>.vmfb
+        ├── <compiled-iree-model>-<sha1>.vmfb
         ├── ...
-        └── compiled-<sha1>.vmfb
+        └── <compiled-iree-model>-<sha1>.vmfb
 
 Example usages:
 
   # Without trace generation
   python3 run_benchmarks.py \
-    --normal_benchmark_tool=/path/to/android/target/iree-benchmark_module \
+    --normal_benchmark_tool_dir=/path/to/normal/android/target/iree/tools/dir \
     /path/to/host/build/dir
 
   # With trace generation
   python3 run_benchmarks.py \
-    --normal_benchmark_tool=/path/to/normal/android/target/iree-benchmark_module \
-    --traced_benchmark_tool=/path/to/tracy/android/target/iree-benchmark_module \
+    --normal_benchmark_tool_dir=/path/to/normal/android/target/iree/tools/dir \
+    --traced_benchmark_tool_dir=/path/to/tracy/android/target/iree/tools/dir \
     --trace_capture_tool=/path/to/host/build/tracy/capture \
     /path/to/host/build/dir
 """
@@ -70,12 +72,16 @@
 # VMFB files' relative path against a benchmark category directory.
 VMFB_REL_PATH = "vmfb"
 
-# The flagfile's filename for compiled benchmark artifacts.
+# The filenames of the flagfile and toolfile for compiled benchmark artifacts.
 MODEL_FLAGFILE_NAME = "flagfile"
+MODEL_TOOLFILE_NAME = "tool"
 
 # Root directory to perform benchmarks in on the Android device.
 ANDROID_TMP_DIR = "/data/local/tmp/iree-benchmarks"
 
+NORMAL_TOOL_REL_DIR = "normal-tools"
+TRACED_TOOL_REL_DIR = "traced-tools"
+
 # A map from Android CPU ABI to IREE's benchmark target architecture.
 CPU_ABI_TO_TARGET_ARCH_MAP = {
     "arm64-v8a": "cpu-arm64-v8a",
@@ -317,8 +323,8 @@ def run_benchmarks_for_category(
     benchmark_category_dir: str,
     benchmark_case_dirs: Sequence[str],
     tmp_dir: str,
-    normal_benchmark_tool: str,
-    traced_benchmark_tool: Optional[str] = None,
+    normal_benchmark_tool_dir: str,
+    traced_benchmark_tool_dir: Optional[str] = None,
     trace_capture_tool: Optional[str] = None,
     skip_benchmarks: Optional[Set[str]] = None,
     skip_captures: Optional[Set[str]] = None,
@@ -335,8 +341,9 @@ def run_benchmarks_for_category(
     tmp_dir: path to temporary directory to which intermediate outputs should be
       stored. Separate "benchmark-results" and "captures" subdirectories will be
       created as necessary.
-    normal_benchmark_tool: the path to the normal benchmark tool.
-    traced_benchmark_tool: the path to the tracing-enabled benchmark tool.
+    normal_benchmark_tool_dir: the path to the normal benchmark tool directory.
+    traced_benchmark_tool_dir: the path to the tracing-enabled benchmark tool
+      directory.
     trace_capture_tool: the path to the tool for collecting captured traces.
     skip_benchmarks: names of benchmarks that should be skipped. Note that
       captures will still be run for these benchmarks if do_capture is true and
@@ -355,9 +362,10 @@ def run_benchmarks_for_category(
   adb_push_to_tmp_dir(os.path.join(benchmark_category_dir, VMFB_REL_PATH),
                       relative_dir=os.path.basename(benchmark_category_dir),
                       verbose=verbose)
-  normal_benchmark_tool_path = adb_push_to_tmp_dir(normal_benchmark_tool,
-                                                   relative_dir="normal-tools",
-                                                   verbose=verbose)
+  for f in os.listdir(normal_benchmark_tool_dir):
+    f = os.path.join(normal_benchmark_tool_dir, f)
+    if os.path.isfile(f) and os.access(f, os.X_OK):
+      adb_push_to_tmp_dir(f, relative_dir=NORMAL_TOOL_REL_DIR, verbose=verbose)
   # Create directories on the host to store results from each benchmark run.
   benchmark_results_dir = os.path.join(tmp_dir, "benchmark-results")
   os.makedirs(benchmark_results_dir, exist_ok=True)
@@ -366,8 +374,12 @@ def run_benchmarks_for_category(
   captures_dir = os.path.join(tmp_dir, "captures")
   if do_capture:
     os.makedirs(captures_dir, exist_ok=True)
-    traced_benchmark_tool_path = adb_push_to_tmp_dir(
-        traced_benchmark_tool, relative_dir="traced-tools", verbose=verbose)
+    for f in os.listdir(traced_benchmark_tool_dir):
+      f = os.path.join(traced_benchmark_tool_dir, f)
+      if os.path.isfile(f) and os.access(f, os.X_OK):
+        adb_push_to_tmp_dir(f,
+                            relative_dir=TRACED_TOOL_REL_DIR,
+                            verbose=verbose)
 
   results = []
   errors = []
@@ -377,15 +389,20 @@ def run_benchmarks_for_category(
   # Push all model artifacts to the device and run them.
   root_benchmark_dir = os.path.dirname(benchmark_category_dir)
   for benchmark_case_dir in benchmark_case_dirs:
+    # Read the file specifying which tool should be used for benchmarking
+    with open(os.path.join(benchmark_case_dir, MODEL_TOOLFILE_NAME)) as f:
+      tool = f.read().strip()
+
     benchmark_info = compose_benchmark_info_object(device_info,
                                                    benchmark_category_dir,
                                                    benchmark_case_dir)
     benchmark_key = str(benchmark_info)
-    # If we're not running the benchmark or the capture, just skip ahead. No
-    # need to push files.
-    if benchmark_key in skip_benchmarks and (not do_capture or
-                                             benchmark_key in skip_captures):
+    # If we're not running the benchmark or the capture, just skip ahead.
+    # No need to push files.
+    if (benchmark_key in skip_benchmarks) and (not do_capture or
+                                               benchmark_key in skip_captures):
       continue
+
     print(f"--> benchmark: {benchmark_info} <--")
     # Now try to actually run benchmarks and collect captures. If keep_going is
     # True then errors in the underlying commands will be logged and returned.
@@ -400,16 +417,21 @@ def run_benchmarks_for_category(
       if benchmark_key not in skip_benchmarks:
         repetitions = get_benchmark_repetition_count(benchmark_info.runner)
         benchmark_results_basename = f"{benchmark_key}.json"
+
         cmd = [
             "taskset",
             benchmark_info.deduce_taskset(),
-            normal_benchmark_tool_path,
-            f"--flagfile={MODEL_FLAGFILE_NAME}",
-            f"--benchmark_repetitions={repetitions}",
-            "--benchmark_format=json",
-            "--benchmark_out_format=json",
-            f"--benchmark_out='{benchmark_results_basename}'",
+            os.path.join(ANDROID_TMP_DIR, NORMAL_TOOL_REL_DIR, tool),
+            f"--flagfile={MODEL_FLAGFILE_NAME}"
         ]
+        if tool == "iree-benchmark-module":
+          cmd.extend([
+              f"--benchmark_repetitions={repetitions}",
+              "--benchmark_format=json",
+              "--benchmark_out_format=json",
+              f"--benchmark_out='{benchmark_results_basename}'",
+          ])
+
         result_json = adb_execute_and_get_output(cmd,
                                                  android_relative_dir,
                                                  verbose=verbose)
@@ -432,7 +454,8 @@ def run_benchmarks_for_category(
       if do_capture and benchmark_key not in skip_captures:
         run_cmd = [
             "TRACY_NO_EXIT=1", "taskset",
-            benchmark_info.deduce_taskset(), traced_benchmark_tool_path,
+            benchmark_info.deduce_taskset(),
+            os.path.join(ANDROID_TMP_DIR, TRACED_TOOL_REL_DIR, tool),
             f"--flagfile={MODEL_FLAGFILE_NAME}"
         ]
 
@@ -483,8 +506,8 @@ def filter_and_run_benchmarks(
     root_build_dir: str,
     driver_filter: Optional[str],
     tmp_dir: str,
-    normal_benchmark_tool: str,
-    traced_benchmark_tool: Optional[str],
+    normal_benchmark_tool_dir: str,
+    traced_benchmark_tool_dir: Optional[str],
     trace_capture_tool: Optional[str],
     skip_benchmarks: Optional[Set[str]],
     skip_captures: Optional[Set[str]],
@@ -502,8 +525,9 @@ def filter_and_run_benchmarks(
     tmp_dir: path to temporary directory to which intermediate outputs should be
       stored. Separate "benchmark-results" and "captures" subdirectories will be
       created as necessary.
-    normal_benchmark_tool: the path to the normal benchmark tool.
-    traced_benchmark_tool: the path to the tracing-enabled benchmark tool.
+    normal_benchmark_tool_dir: the path to the normal benchmark tool directory.
+    traced_benchmark_tool_dir: the path to the tracing-enabled benchmark tool
+      directory.
     trace_capture_tool: the path to the tool for collecting captured traces.
     skip_benchmarks: names of benchmarks that should be skipped. Note that
       captures will still be run for these benchmarks if do_capture is true and
@@ -542,8 +566,8 @@ def filter_and_run_benchmarks(
         benchmark_category_dir=benchmark_category_dir,
         benchmark_case_dirs=matched_benchmarks,
         tmp_dir=tmp_dir,
-        normal_benchmark_tool=normal_benchmark_tool,
-        traced_benchmark_tool=traced_benchmark_tool,
+        normal_benchmark_tool_dir=normal_benchmark_tool_dir,
+        traced_benchmark_tool_dir=traced_benchmark_tool_dir,
         skip_benchmarks=skip_benchmarks,
         trace_capture_tool=trace_capture_tool,
         do_capture=do_capture,
@@ -605,17 +629,16 @@ def check_exe_path(path):
       metavar="<build-dir>",
       type=check_dir_path,
       help="Path to the build directory containing benchmark suites")
-  parser.add_argument("--normal_benchmark_tool",
-                      "--normal-benchmark-tool",
+  parser.add_argument("--normal_benchmark_tool_dir",
+                      "--normal-benchmark-tool-dir",
                       type=check_exe_path,
                       required=True,
-                      help="Path to the normal iree-benchmark-module tool")
-  parser.add_argument(
-      "--traced_benchmark_tool",
-      "--traced-benchmark-tool",
-      type=check_exe_path,
-      default=None,
-      help="Path to the tracing-enabled iree-benchmark-module tool")
+                      help="Path to the normal iree tool directory")
+  parser.add_argument("--traced_benchmark_tool_dir",
+                      "--traced-benchmark-tool-dir",
+                      type=check_exe_path,
+                      default=None,
+                      help="Path to the tracing-enabled iree tool directory")
   parser.add_argument("--trace_capture_tool",
                       "--trace-capture-tool",
                       type=check_exe_path,
@@ -700,7 +723,7 @@ def main(args):
   previous_benchmarks = None
   previous_captures = None
 
-  do_capture = (args.traced_benchmark_tool is not None and
+  do_capture = (args.traced_benchmark_tool_dir is not None and
                 args.trace_capture_tool is not None)
 
   # Collect names of previous benchmarks and captures that should be skipped and
@@ -738,7 +761,8 @@ def main(args):
     atexit.register(execute_cmd_and_get_output,
                     ["adb", "forward", "--remove", "tcp:8086"])
 
-    args.traced_benchmark_tool = os.path.realpath(args.traced_benchmark_tool)
+    args.traced_benchmark_tool_dir = os.path.realpath(
+        args.traced_benchmark_tool_dir)
     args.trace_capture_tool = os.path.realpath(args.trace_capture_tool)
 
   results = BenchmarkResults()
@@ -753,8 +777,9 @@ def main(args):
       root_build_dir=args.build_dir,
       driver_filter=args.driver,
       tmp_dir=args.tmp_dir,
-      normal_benchmark_tool=os.path.realpath(args.normal_benchmark_tool),
-      traced_benchmark_tool=args.traced_benchmark_tool,
+      normal_benchmark_tool_dir=os.path.realpath(
+          args.normal_benchmark_tool_dir),
+      traced_benchmark_tool_dir=args.traced_benchmark_tool_dir,
       trace_capture_tool=args.trace_capture_tool,
       skip_benchmarks=previous_benchmarks,
       skip_captures=previous_captures,
diff --git a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
index 963f393cae9d..445ab311a805 100644
--- a/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
+++ b/build_tools/buildkite/cmake/android/arm64-v8a/benchmark2.yml
@@ -32,7 +32,7 @@ steps:
       - "tar -xzvf benchmark-suites-${BUILDKITE_BUILD_NUMBER}.tgz"
       - "tar -xzvf iree-android-tools-${BUILDKITE_BUILD_NUMBER}.tgz"
       - "tar -xzvf tracy-capture-058e8901.tgz"
-      - "python3 build_tools/benchmarks/run_benchmarks_on_android.py --pin-cpu-freq --pin-gpu-freq --normal_benchmark_tool=build-android/iree/tools/iree-benchmark-module --traced_benchmark_tool=build-android-trace/iree/tools/iree-benchmark-module --trace_capture_tool=tracy-capture -o benchmark-results-pixel-4-${BUILDKITE_BUILD_NUMBER}.json --capture_tarball=trace-captures-pixel-4-${BUILDKITE_BUILD_NUMBER}.tgz --verbose build-host/"
+      - "python3 build_tools/benchmarks/run_benchmarks_on_android.py --pin-cpu-freq --pin-gpu-freq --normal_benchmark_tool_dir=build-android/iree/tools/ --traced_benchmark_tool_dir=build-android-trace/iree/tools/ --trace_capture_tool=tracy-capture -o benchmark-results-pixel-4-${BUILDKITE_BUILD_NUMBER}.json --capture_tarball=trace-captures-pixel-4-${BUILDKITE_BUILD_NUMBER}.tgz --verbose build-host/"
     if: "build.pull_request.id == null || (build.pull_request.labels includes 'buildkite:benchmark')"
     agents:
       - "android-soc=snapdragon-855"
@@ -52,7 +52,7 @@ steps:
       - "tar -xzvf benchmark-suites-${BUILDKITE_BUILD_NUMBER}.tgz"
       - "tar -xzvf iree-android-tools-${BUILDKITE_BUILD_NUMBER}.tgz"
       - "tar -xzvf tracy-capture-058e8901.tgz"
-      - "python3 build_tools/benchmarks/run_benchmarks_on_android.py --pin-cpu-freq --pin-gpu-freq --normal_benchmark_tool=build-android/iree/tools/iree-benchmark-module --traced_benchmark_tool=build-android-trace/iree/tools/iree-benchmark-module --trace_capture_tool=tracy-capture -o benchmark-results-galaxy-pixel6-pro-${BUILDKITE_BUILD_NUMBER}.json --capture_tarball=trace-captures-galaxy-pixel6-pro-${BUILDKITE_BUILD_NUMBER}.tgz --verbose build-host/"
+      - "python3 build_tools/benchmarks/run_benchmarks_on_android.py --pin-cpu-freq --pin-gpu-freq --normal_benchmark_tool_dir=build-android/iree/tools/ --traced_benchmark_tool_dir=build-android-trace/iree/tools/ --trace_capture_tool=tracy-capture -o benchmark-results-galaxy-pixel6-pro-${BUILDKITE_BUILD_NUMBER}.json --capture_tarball=trace-captures-galaxy-pixel6-pro-${BUILDKITE_BUILD_NUMBER}.tgz --verbose build-host/"
     if: "build.pull_request.id == null || (build.pull_request.labels includes 'buildkite:benchmark')"
     agents:
       - "android-soc=google-tensor"
diff --git a/build_tools/cmake/iree_benchmark_suite.cmake b/build_tools/cmake/iree_benchmark_suite.cmake
index 54a224d3579f..fb6e84520575 100644
--- a/build_tools/cmake/iree_benchmark_suite.cmake
+++ b/build_tools/cmake/iree_benchmark_suite.cmake
@@ -69,13 +69,13 @@ function(iree_benchmark_suite)
     _RULE
     ""
     "DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
-    "BENCHMARK_MODES;MODULES;TRANSLATION_FLAGS;RUNTIME_FLAGS"
+    "BENCHMARK_MODES;BENCHMARK_TOOL;MODULES;TRANSLATION_FLAGS;RUNTIME_FLAGS"
   )
 
   iree_validate_required_arguments(
     _RULE
     "DRIVER;TARGET_BACKEND;TARGET_ARCHITECTURE"
-    "BENCHMARK_MODES;MODULES"
+    "BENCHMARK_MODES;BENCHMARK_TOOL;MODULES"
   )
 
   iree_package_name(PACKAGE_NAME)
@@ -234,12 +234,13 @@ function(iree_benchmark_suite)
       endif()
       add_dependencies("${_FRIENDLY_TARGET_NAME}" "${_TRANSLATION_TARGET_NAME}")
 
-      # Finally create the command and target for the flagfile used to execute the
-      # generated artifacts.
-      set(_FLAGFILE_ARTIFACTS_DIR "${_ROOT_ARTIFACTS_DIR}/${_MODULE_DIR_NAME}/${_BENCHMARK_DIR_NAME}")
-      set(_FLAG_FILE "${_FLAGFILE_ARTIFACTS_DIR}/flagfile")
+      set(_RUN_SPEC_DIR "${_ROOT_ARTIFACTS_DIR}/${_MODULE_DIR_NAME}/${_BENCHMARK_DIR_NAME}")
+
+      # Create the command and target for the flagfile spec used to execute
+      # the generated artifacts.
+      set(_FLAG_FILE "${_RUN_SPEC_DIR}/flagfile")
       set(_ADDITIONAL_ARGS_CL "--additional_args=\"${_RULE_RUNTIME_FLAGS}\"")
-      file(RELATIVE_PATH _MODULE_FILE_FLAG "${_FLAGFILE_ARTIFACTS_DIR}" "${_VMFB_FILE}")
+      file(RELATIVE_PATH _MODULE_FILE_FLAG "${_RUN_SPEC_DIR}" "${_VMFB_FILE}")
       add_custom_command(
         OUTPUT "${_FLAG_FILE}"
         COMMAND
@@ -252,7 +253,7 @@ function(iree_benchmark_suite)
             -o "${_FLAG_FILE}"
         DEPENDS
           "${IREE_ROOT_DIR}/scripts/generate_flagfile.py"
-        WORKING_DIRECTORY "${_FLAGFILE_ARTIFACTS_DIR}"
+        WORKING_DIRECTORY "${_RUN_SPEC_DIR}"
         COMMENT "Generating ${_FLAG_FILE}"
       )
 
@@ -265,8 +266,28 @@ function(iree_benchmark_suite)
         DEPENDS "${_FLAG_FILE}"
       )
 
+      # Create the command and target for the toolfile spec used to execute
+      # the generated artifacts.
+      set(_TOOL_FILE "${_RUN_SPEC_DIR}/tool")
+      add_custom_command(
+        OUTPUT "${_TOOL_FILE}"
+        COMMAND ${CMAKE_COMMAND} -E echo ${_RULE_BENCHMARK_TOOL} > "${_TOOL_FILE}"
+        WORKING_DIRECTORY "${_RUN_SPEC_DIR}"
+        COMMENT "Generating ${_TOOL_FILE}"
+      )
+
+      set(_TOOLFILE_GEN_TARGET_NAME_LIST "iree-generate-benchmark-toolfile")
+      list(APPEND _TOOLFILE_GEN_TARGET_NAME_LIST ${_COMMON_NAME_SEGMENTS})
+      list(JOIN _TOOLFILE_GEN_TARGET_NAME_LIST "__" _TOOLFILE_GEN_TARGET_NAME)
+      add_custom_target("${_TOOLFILE_GEN_TARGET_NAME}"
+        DEPENDS "${_TOOL_FILE}"
+      )
+
       # Mark dependency so that we have one target to drive them all.
-      add_dependencies(iree-benchmark-suites "${_FLAGFILE_GEN_TARGET_NAME}")
+      add_dependencies(iree-benchmark-suites
+        "${_FLAGFILE_GEN_TARGET_NAME}"
+        "${_TOOLFILE_GEN_TARGET_NAME}"
+      )
     endforeach(_BENCHMARK_MODE IN LISTS _RULE_BENCHMARK_MODES)
 
   endforeach(_MODULE IN LISTS _RULE_MODULES)