From 0f153b0ddc6420c96971c8ad7a496e95f8b2dee2 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Sun, 6 Jun 2021 14:26:46 -0400 Subject: [PATCH] Add benchmark cases for MobileBERT This commit is meant to be an example for showing how to register benchmark cases for a new model. In addition to the changes in CMake, one would also need to generate the input model and place it in cloud storage. --- iree/benchmark/TensorFlow/CMakeLists.txt | 125 ++++++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/iree/benchmark/TensorFlow/CMakeLists.txt b/iree/benchmark/TensorFlow/CMakeLists.txt index 7e0df7d68b6a..9f61cf268ced 100644 --- a/iree/benchmark/TensorFlow/CMakeLists.txt +++ b/iree/benchmark/TensorFlow/CMakeLists.txt @@ -16,6 +16,26 @@ # # ################################################################################ +set(MOBILEBERT_FP16_MODULE + "MobileBertSquad" # MODULE_NAME + "fp16" # MODULE_TAGS + # This uses the same input MLIR source as fp32 to save download time. + # It requires users to have "--iree-flow-demote-f32-to-f16". + "https://storage.googleapis.com/iree-model-artifacts/MobileBertSquad-810f6fdc.tar.gz" # MLIR_SOURCE + "serving_default" # ENTRY_FUNCTION + # The conversion done by "--iree-flow-demote-f32-to-f16" won't change the + # original input signature.
+ "1x384xi32,1x384xi32,1x384xi32" # FUNCTION_INPUTS +) + +set(MOBILEBERT_FP32_MODULE + "MobileBertSquad" # MODULE_NAME + "fp32" # MODULE_TAGS + "https://storage.googleapis.com/iree-model-artifacts/MobileBertSquad-810f6fdc.tar.gz" # MLIR_SOURCE + "serving_default" # ENTRY_FUNCTION + "1x384xi32,1x384xi32,1x384xi32" # FUNCTION_INPUTS +) + set(MOBILENET_V2_MODULE "MobileNetV2" # MODULE_NAME "fp32,imagenet" # MODULE_TAGS @@ -34,7 +54,7 @@ set(MOBILENET_V3SMALL_MODULE ################################################################################ # # -# Benchmark suites # +# Common benchmark configurations # # # # Each suite benchmarks a list of modules with some specific configuration, # # typically involving different translation/runtime flags and targeting # @@ -142,6 +162,7 @@ iree_mlir_benchmark_suite( # GPU, Vulkan, Adreno, full-inference iree_mlir_benchmark_suite( MODULES + ${MOBILEBERT_FP32_MODULE} ${MOBILENET_V2_MODULE} ${MOBILENET_V3SMALL_MODULE} @@ -189,6 +210,7 @@ iree_mlir_benchmark_suite( # GPU, Vulkan, Mali, full-inference iree_mlir_benchmark_suite( MODULES + ${MOBILEBERT_FP32_MODULE} ${MOBILENET_V2_MODULE} ${MOBILENET_V3SMALL_MODULE} @@ -232,3 +254,104 @@ iree_mlir_benchmark_suite( RUNTIME_FLAGS "--batch_size=32" ) + +################################################################################ +# # +# Speical benchmark configurations # +# # +# These are configurations that can only be enabled for some specific model. # +# However, THIS SHOULD REALLY BE TEMPORARY; we should strike for uniformity. # +# # +################################################################################ + +# CPU, Dylib-Sync, big/little-core, full-inference +iree_mlir_benchmark_suite( + MODULES + ${MOBILEBERT_FP32_MODULE} + + BENCHMARK_MODES + "big-core,full-inference" + "little-core,full-inference" + TARGET_BACKEND + "dylib-llvm-aot" + TARGET_ARCHITECTURE + "CPU-ARM64-v8A" + TRANSLATION_FLAGS + # TODO: Merge this rule once we can use the same flags as the common one. 
+ "--iree-input-type=mhlo" + "--iree-llvm-target-triple=aarch64-none-linux-android29" + "--iree-flow-inline-constants-max-byte-length=2048" + DRIVER + "dylib-sync" +) + +# CPU, Dylib, 1-thread, big/little-core, full-inference +iree_mlir_benchmark_suite( + MODULES + ${MOBILEBERT_FP32_MODULE} + + BENCHMARK_MODES + "1-thread,big-core,full-inference" + "1-thread,little-core,full-inference" + TARGET_BACKEND + "dylib-llvm-aot" + TARGET_ARCHITECTURE + "CPU-ARM64-v8A" + TRANSLATION_FLAGS + # TODO: Merge this rule once we can use the same flags as the common one. + "--iree-input-type=mhlo" + "--iree-llvm-target-triple=aarch64-none-linux-android29" + "--iree-flow-inline-constants-max-byte-length=2048" + DRIVER + "dylib" + RUNTIME_FLAGS + "--task_topology_group_count=1" +) + +# CPU, Dylib, 3-thread, big/little-core, full-inference +iree_mlir_benchmark_suite( + MODULES + ${MOBILEBERT_FP32_MODULE} + + BENCHMARK_MODES + "3-thread,big-core,full-inference" + "3-thread,little-core,full-inference" + TARGET_BACKEND + "dylib-llvm-aot" + TARGET_ARCHITECTURE + "CPU-ARM64-v8A" + TRANSLATION_FLAGS + # TODO: Merge this rule once we can use the same flags as the common one. 
+ "--iree-input-type=mhlo" + "--iree-llvm-target-triple=aarch64-none-linux-android29" + "--iree-flow-inline-constants-max-byte-length=2048" + DRIVER + "dylib" + RUNTIME_FLAGS + "--task_topology_group_count=3" +) + +# GPU, Vulkan, Mali, kernel-execution +iree_mlir_benchmark_suite( + MODULES + ${MOBILEBERT_FP16_MODULE} + + BENCHMARK_MODES + "kernel-execution" + TARGET_BACKEND + "vulkan-spirv" + TARGET_ARCHITECTURE + "GPU-Mali-Valhall" + TRANSLATION_FLAGS + "--iree-input-type=mhlo" + "--iree-flow-demote-f32-to-f16" + "--iree-vulkan-target-triple=valhall-unknown-android11" + "--iree-flow-inline-constants-max-byte-length=16" + "--iree-flow-dispatch-formation-enable-operand-fusion" + "--iree-enable-fusion-with-reduction-ops" + "--iree-hal-benchmark-dispatch-repeat-count=32" + DRIVER + "vulkan" + RUNTIME_FLAGS + "--batch_size=32" +)