diff --git a/.bazelversion b/.bazelversion
index 03f488b0..f22d756d 100644
--- a/.bazelversion
+++ b/.bazelversion
@@ -1 +1 @@
-5.3.0
+6.5.0
diff --git a/.github/tools/release_linux.sh b/.github/tools/release_linux.sh
index 2fd22064..b697de6f 100755
--- a/.github/tools/release_linux.sh
+++ b/.github/tools/release_linux.sh
@@ -3,13 +3,17 @@ set -e -x
 
 python configure.py
 
-# Build
-bazel build :build_pip_pkg \
+# Inside the docker container on github actions there is not
+# enough space for the bazel cache, but a larger disk is mounted at /github_disk
+# so we tell bazel to store everything there
+
+# `release_cpu_linux` will activate absolute paths to files that only exist in the tensorflow/build:2.16-pythonXX docker container
+bazel --output_user_root=/github_disk/bazel_root \
+    build :build_pip_pkg \
+    -c opt \
+    --config=release_cpu_linux \
     --copt=-fvisibility=hidden \
-    --copt=-mavx \
-    --distinct_host_configuration=false \
-    --verbose_failures \
-    --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain
+    --verbose_failures
 
 # Package Whl
 bazel-bin/build_pip_pkg artifacts
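The reworked release script above moves Bazel's entire output tree onto the larger disk that the manylinux job mounts at /github_disk (see the `-v /mnt:/github_disk` flag in release.yml further down). A quick illustrative check of the two filesystems, assuming the container layout described in the comments (not part of the repo; the paths only exist inside that container):

    # Illustrative only: compare free space on the container's small root
    # filesystem and on the /github_disk mount backed by the runner's /mnt disk.
    import shutil

    for path in ("/", "/github_disk"):
        total, used, free = shutil.disk_usage(path)
        print(f"{path}: {free / 2**30:.1f} GiB free of {total / 2**30:.1f} GiB")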
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 1241c867..be6ed26f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -36,7 +36,7 @@ jobs:
         if: steps.cache.outputs.cache-hit != 'true'
         run: ./third_party/install_android.sh
       - name: Configure Bazel
-        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_NDK_HOME="/tmp/lce_android/ndk/21.4.7075529" ./configure.py
+        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_API_LEVEL=30 ANDROID_NDK_HOME="/tmp/lce_android/ndk/25.2.9519653" ANDROID_NDK_API_LEVEL=30 ANDROID_BUILD_TOOLS_VERSION=31.0.0 ./configure.py
         shell: bash
       - run: mkdir benchmark-binaries
       - name: Build Benchmark utility for AArch64
@@ -108,7 +108,7 @@ jobs:
         if: steps.cache.outputs.cache-hit != 'true'
         run: ./third_party/install_android.sh
       - name: Configure Bazel
-        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_NDK_HOME="/tmp/lce_android/ndk/21.4.7075529" ./configure.py
+        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_API_LEVEL=30 ANDROID_NDK_HOME="/tmp/lce_android/ndk/25.2.9519653" ANDROID_NDK_API_LEVEL=30 ANDROID_BUILD_TOOLS_VERSION=31.0.0 ./configure.py
         shell: bash
       - name: Build LCE AAR
         run: BUILDER=bazelisk ./larq_compute_engine/tflite/java/build_lce_aar.sh
@@ -134,10 +134,10 @@ jobs:
   macos-release-wheel:
     name: Build release wheels for macOS
-    runs-on: macos-latest
+    runs-on: macos-13
     strategy:
       matrix:
-        python-version: [3.9, "3.10", 3.11]
+        python-version: ["3.10", 3.11]
       fail-fast: false
     steps:
       - uses: actions/checkout@v4
@@ -154,33 +154,35 @@ jobs:
           python -m pip install delocate wheel setuptools numpy six --no-cache-dir
 
           ./configure.py
 
-          export MACOSX_DEPLOYMENT_TARGET=10.14
+          # This matches `release_macos_x86` in .tensorflow.bazelrc
+          export MACOSX_DEPLOYMENT_TARGET=10.15
 
           if [[ -n $GOOGLE_APPLICATION_CREDENTIALS ]]; then
-            echo -e 'build --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-macos-python${{ matrix.python-version }}' >> .bazelrc.user
+            echo -e 'build --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-macos-python${{ matrix.python-version }}' >> .bazelrc.user
             echo -e 'build --google_default_credentials' >> .bazelrc.user
           fi
 
-          bazelisk build :build_pip_pkg --copt=-fvisibility=hidden --copt=-mavx --linkopt=-dead_strip --distinct_host_configuration=false
-          bazel-bin/build_pip_pkg artifacts --plat-name macosx_10_14_x86_64
+          bazelisk build :build_pip_pkg --config=release_macos_x86 --config=release_cpu_macos --copt=-fvisibility=hidden --linkopt=-dead_strip
+          bazel-bin/build_pip_pkg artifacts --plat-name macosx_10_15_x86_64
 
           for f in artifacts/*.whl; do
             delocate-wheel -w wheelhouse $f
           done
         env:
           LCE_RELEASE_VERSION: ${{ github.event.inputs.version }}
+          TF_PYTHON_VERSION: ${{ matrix.python-version }}
         shell: bash
       - uses: actions/upload-artifact@v4
         with:
-          name: ${{ runner.os }}-wheels
+          name: ${{ runner.os }}-wheels-${{ matrix.python-version }}
           path: wheelhouse
 
   macos-arm-release-wheel:
     name: Build release arm wheels for macOS
-    runs-on: macos-11
+    runs-on: macos-14
     strategy:
       matrix:
-        python-version: [3.9, "3.10", 3.11]
+        python-version: ["3.10", 3.11]
       fail-fast: false
     steps:
       - uses: actions/checkout@v4
@@ -197,25 +199,27 @@ jobs:
           python -m pip install delocate wheel setuptools numpy six --no-cache-dir
 
           ./configure.py
 
-          export MACOSX_DEPLOYMENT_TARGET=11.0
+          # This matches `release_macos_arm64` in .tensorflow.bazelrc
+          export MACOSX_DEPLOYMENT_TARGET=12.0
 
           if [[ -n $GOOGLE_APPLICATION_CREDENTIALS ]]; then
-            echo -e 'build --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-macos-arm-python${{ matrix.python-version }}' >> .bazelrc.user
+            echo -e 'build --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-macos-arm-python${{ matrix.python-version }}' >> .bazelrc.user
             echo -e 'build --google_default_credentials' >> .bazelrc.user
           fi
 
-          bazelisk build :build_pip_pkg --copt=-fvisibility=hidden --linkopt=-dead_strip --config=macos_arm64
-          bazel-bin/build_pip_pkg artifacts --plat-name macosx_11_0_arm64
+          bazelisk build :build_pip_pkg --config=release_macos_arm64 --copt=-fvisibility=hidden --linkopt=-dead_strip
+          bazel-bin/build_pip_pkg artifacts --plat-name macosx_12_0_arm64
 
           for f in artifacts/*.whl; do
             delocate-wheel -w wheelhouse $f
           done
         env:
           LCE_RELEASE_VERSION: ${{ github.event.inputs.version }}
+          TF_PYTHON_VERSION: ${{ matrix.python-version }}
         shell: bash
       - uses: actions/upload-artifact@v4
         with:
-          name: ${{ runner.os }}-arm-wheels
+          name: ${{ runner.os }}-arm-wheels-${{ matrix.python-version }}
           path: wheelhouse
 
   manylinux-release-wheel:
@@ -223,7 +227,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.9, "3.10", 3.11]
+        python-version: ["3.10", 3.11]
       fail-fast: false
     steps:
       - uses: actions/checkout@v4
@@ -237,15 +241,17 @@ jobs:
       - name: Build manylinux2014 wheels
         run: |
           if [[ -n $GOOGLE_APPLICATION_CREDENTIALS ]]; then
-            echo -e 'build --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-manylinux-python${{ matrix.python-version }}' >> .bazelrc.user
+            echo -e 'build --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-manylinux-python${{ matrix.python-version }}' >> .bazelrc.user
             echo -e 'build --google_default_credentials' >> .bazelrc.user
           fi
           docker run -e LCE_RELEASE_VERSION=${{ github.event.inputs.version }} \
             -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcloud-credentials.json \
+            -e TF_PYTHON_VERSION=${{ matrix.python-version }} \
            -v $GOOGLE_APPLICATION_CREDENTIALS:/tmp/gcloud-credentials.json:ro \
            -v ${PWD}:/compute-engine -w /compute-engine \
-            tensorflow/build:2.13-python${{ matrix.python-version }} \
+            -v /mnt:/github_disk \
+            tensorflow/build:2.16-python${{ matrix.python-version }} \
            .github/tools/release_linux.sh
 
           sudo apt-get -y -qq install patchelf --no-install-recommends
@@ -258,7 +264,7 @@ jobs:
           ls -al wheelhouse/
       - uses: actions/upload-artifact@v4
         with:
-          name: ${{ runner.os }}-wheels
+          name: ${{ runner.os }}-wheels-${{ matrix.python-version }}
           path: wheelhouse
 
   windows-release-wheel:
@@ -266,7 +272,7 @@ jobs:
     runs-on: windows-2019
     strategy:
       matrix:
-        python-version: [3.9, "3.10", 3.11]
+        python-version: ["3.10", 3.11]
       fail-fast: false
     steps:
       - name: Configure Pagefile
@@ -294,18 +300,24 @@ jobs:
           $Env:CC_OPT_FLAGS = "/O2"
 
           python --version
-          python -m pip install wheel setuptools numpy six --no-cache-dir
+          python -m pip install wheel setuptools numpy six pip-tools --no-cache-dir
+          # This is needed because the requirements on windows are different from those on other systems
+          pip-compile --strip-extras --no-emit-index-url --allow-unsafe larq_compute_engine/requirements.in
+
+          # Fix for path length limit: replace workspace name by 'lce'
+          (Get-Content WORKSPACE).Replace('workspace(name = "larq_compute_engine")', 'workspace(name = "lce")') | Set-Content WORKSPACE
 
           "" | python configure.py
 
-          bazelisk --output_base=C:\build_output build :build_pip_pkg --enable_runfiles --local_ram_resources=4096 --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-windows-python${{ matrix.python-version }} --google_default_credentials
+          bazelisk --output_base=C:\bzl build :build_pip_pkg --enable_runfiles --local_ram_resources=4096 --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-windows-python${{ matrix.python-version }} --google_default_credentials
           bazel-bin/build_pip_pkg wheelhouse
         env:
           LCE_RELEASE_VERSION: ${{ github.event.inputs.version }}
+          TF_PYTHON_VERSION: ${{ matrix.python-version }}
         shell: pwsh
       - uses: actions/upload-artifact@v4
         with:
-          name: ${{ runner.os }}-wheels
+          name: ${{ runner.os }}-wheels-${{ matrix.python-version }}
           path: wheelhouse
 
   upload-wheels:
@@ -322,31 +334,11 @@ jobs:
     steps:
       - uses: actions/download-artifact@v4
         with:
-          name: Linux-wheels
-          path: Linux-wheels
-        if: ${{ needs.manylinux-release-wheel.result == 'success' }}
-      - uses: actions/download-artifact@v4
-        with:
-          name: macOS-wheels
-          path: macOS-wheels
-        if: ${{ needs.macos-release-wheel.result == 'success' }}
-      - uses: actions/download-artifact@v4
-        with:
-          name: macOS-arm-wheels
-          path: macOS-arm-wheels
-        if: ${{ needs.macos-arm-release-wheel.result == 'success' }}
-      - uses: actions/download-artifact@v4
-        with:
-          name: Windows-wheels
-          path: Windows-wheels
-        if: ${{ needs.windows-release-wheel.result == 'success' }}
+          pattern: "*wheels*"
+          path: dist
+          merge-multiple: true
       - run: |
           set -x
-          mkdir -p dist
-          cp Linux-wheels/*.whl dist/ || true
-          cp macOS-wheels/*.whl dist/ || true
-          cp macOS-arm-wheels/*.whl dist/ || true
-          cp Windows-wheels/*.whl dist/ || true
           ls -la dist/
           sha256sum dist/*.whl
       - uses: pypa/gh-action-pypi-publish@master
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index 459f1022..15fd6394 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -10,6 +10,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+env:
+  TF_PYTHON_VERSION: "3.11"
+
 jobs:
   TFLite:
     runs-on: ubuntu-latest
@@ -19,16 +22,8 @@ jobs:
         with:
           submodules: true
           fetch-depth: 0
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
-      - name: Configure Bazel
-        run: ./configure.py
-        shell: bash
-      - name: Install pip dependencies
-        run: pip install numpy --no-cache-dir
       - name: Run C++ Unit Tests
-        run: bazelisk test larq_compute_engine/tests:cc_tests --distinct_host_configuration=false --test_output=all
+        run: bazelisk test larq_compute_engine/tests:cc_tests --test_output=all
       - name: Build TF Lite Static Library with CMake
         run: |
           mkdir build
@@ -45,14 +40,6 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y --no-install-recommends qemu-user
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
-      - name: Configure Bazel
-        run: ./configure.py
-        shell: bash
-      - name: Install pip dependencies
-        run: pip install numpy six --no-cache-dir
       - name: "TF Lite Arm32: Cross-compile and run unit tests in qemu"
         run: bazelisk test larq_compute_engine/tests:arm32_tests --config=rpi3 --test_output=all --test_filter="-*BigTest*" --copt=-O1
       - name: "TF Lite Aarch64: Cross-compile and run unit tests in qemu"
@@ -69,32 +56,21 @@ jobs:
         continue-on-error: true
         with:
           credentials_json: ${{ secrets.gcs_bazel_cache }}
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
       - name: Configure Bazel
         run: |
-          ./configure.py
-          echo -e 'build --distinct_host_configuration=false' >> .bazelrc.user
           if [[ -n $GOOGLE_APPLICATION_CREDENTIALS ]]; then
-            echo -e 'build --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-ubuntu' >> .bazelrc.user
+            echo -e 'build --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-ubuntu' >> .bazelrc.user
             echo -e 'build --google_default_credentials' >> .bazelrc.user
           fi
         shell: bash
-      - name: Set bazel cache
-        run: echo -e 'build --remote_upload_local_results=false' >> .bazelrc.user
-        if: github.ref != 'refs/heads/main'
-        shell: bash
-      - name: Install pip dependencies
-        run: pip install tensorflow-cpu~=2.13.0 larq~=0.13 pytest tensorflow_datasets~=4.9 flatbuffers==23.1.21 tqdm --no-cache-dir
       - name: Run Interpreter test
         run: bazelisk test larq_compute_engine/tflite/tests:interpreter_test --test_output=all
       - name: Run FileCheck tests
         run: bazelisk test larq_compute_engine/mlir/tests:all --test_output=all
       - name: Run End2End tests
-        run: bazelisk test larq_compute_engine/tests:end2end_test --test_output=all
+        run: bazelisk test larq_compute_engine/tests:end2end_test --test_output=all --test_env=TF_USE_LEGACY_KERAS=1
       - name: Run Strip dequantize op tests
-        run: bazelisk test larq_compute_engine/tests:strip_lcedequantize_test --test_output=all
+        run: bazelisk test larq_compute_engine/tests:strip_lcedequantize_test --test_output=all --test_env=TF_USE_LEGACY_KERAS=1
 
   ConverterPython:
     runs-on: ubuntu-latest
@@ -109,6 +85,10 @@
             python-version: 3.11
             flatbuffers-version: 23.1.21
             protobuf-version: 4.23.4
+          - tf-version: 2.16.1
+            python-version: 3.11
+            flatbuffers-version: 24.3.25
+            protobuf-version: 4.25.3
     if: "!contains(github.event.head_commit.message, 'ci-skip')"
     steps:
       - uses: actions/checkout@v4
@@ -117,6 +97,9 @@
           python-version: ${{ matrix.python-version }}
       - name: Install TensorFlow
         run: pip install tensorflow==${{matrix.tf-version}} --no-cache-dir
+      - name: Install legacy tf-keras
+        if: matrix.tf-version == '2.16.1'
+        run: pip install tf-keras==2.16.0
       - name: Install flatbuffers
         run: pip install flatbuffers==${{matrix.flatbuffers-version}} --no-cache-dir
       - name: Install protobuf
@@ -124,7 +107,7 @@
       - name: Install other dependencies
         run: pip install larq~=0.13.3 packaging tqdm --no-cache-dir
       - name: Run Converter test
-        run: PYTHONPATH=./ python larq_compute_engine/mlir/python/converter_test.py
+        run: TF_USE_LEGACY_KERAS=1 PYTHONPATH=./ python larq_compute_engine/mlir/python/converter_test.py
 
   Android_AAR:
     runs-on: ubuntu-latest
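Background on the TF_USE_LEGACY_KERAS settings above: TensorFlow 2.16 ships Keras 3 as `tf.keras`, while the converter and its tests still target Keras 2, so the jobs install `tf-keras` and opt back into it via the environment variable. A minimal sketch of the switch (assumes `tensorflow>=2.16` and `tf-keras` are installed):

    # The variable must be set before TensorFlow is imported, which is why the
    # CI passes it via --test_env / the step environment rather than in code.
    import os

    os.environ["TF_USE_LEGACY_KERAS"] = "1"

    import tensorflow as tf

    # With tf-keras installed, tf.keras now resolves to legacy Keras 2.
    print(tf.keras.__version__)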
@@ -142,15 +125,13 @@ jobs:
       with:
         path: /tmp/lce_android
         key: ${{ runner.os }}-${{ hashFiles('**/third_party/install_android.sh') }}
-      - name: Install pip dependencies
-        run: pip install numpy six --no-cache-dir
       - name: Set Java version
         run: echo "JAVA_HOME=${JAVA_HOME_8_X64}" >> $GITHUB_ENV
       - name: Download and install Android NDK/SDK
         if: steps.cache.outputs.cache-hit != 'true'
         run: ./third_party/install_android.sh
       - name: Configure Bazel
-        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_NDK_HOME="/tmp/lce_android/ndk/21.4.7075529" ./configure.py
+        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_API_LEVEL=30 ANDROID_NDK_HOME="/tmp/lce_android/ndk/25.2.9519653" ANDROID_NDK_API_LEVEL=30 ANDROID_BUILD_TOOLS_VERSION=31.0.0 ./configure.py
         shell: bash
       - name: Build LCE AAR
         run: BUILDER=bazelisk ./larq_compute_engine/tflite/java/build_lce_aar.sh
diff --git a/.tensorflow.bazelrc b/.tensorflow.bazelrc
index 98cf5a53..6a0e08e8 100644
--- a/.tensorflow.bazelrc
+++ b/.tensorflow.bazelrc
@@ -6,12 +6,11 @@
 # Default build options. These are applied first and unconditionally.
 
-# The following line opts in to modular op registration support by default.
+# For projects which use TensorFlow as part of a Bazel build process, putting
+# nothing in a bazelrc will default to a monolithic build. The following line
+# opts in to modular op registration support by default.
 build --define framework_shared_object=true
-
-# For workaround https://github.com/bazelbuild/bazel/issues/8772 with Bazel >= 0.29.1
-build --java_toolchain=@tf_toolchains//toolchains/java:tf_java_toolchain
-build --host_java_toolchain=@tf_toolchains//toolchains/java:tf_java_toolchain
+build --define tsl_protobuf_header_only=true
 
 build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true
@@ -19,15 +18,38 @@ build --define=allow_oversize_protos=true
 
 build --spawn_strategy=standalone
 build -c opt
 
+# Make Bazel print out all options from rc files.
+build --announce_rc
+
 build --define=grpc_no_ares=true
 
+build --noincompatible_remove_legacy_whole_archive
+build --features=-force_no_whole_archive
+
 build --enable_platform_specific_config
 
 # Enable XLA support by default.
 build --define=with_xla_support=true
 
+build --config=short_logs
+
 build --config=v2
 
+# Disable AWS/HDFS support by default
+build --define=no_aws_support=true
+build --define=no_hdfs_support=true
+
+# TF now has `cc_shared_library` targets, so it needs the experimental flag
+# TODO(rostam): Remove when `cc_shared_library` is enabled by default
+build --experimental_cc_shared_library
+
+# cc_shared_library ensures no library is linked statically more than once.
+build --experimental_link_static_libraries_once=false
+
+# Prevent regressions on those two incompatible changes
+# TODO: remove those flags when they are flipped in the default Bazel version TF uses.
+build --incompatible_enforce_config_setting_visibility
+
 # Default options should come above this line.
 
 # Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
@@ -48,20 +70,124 @@ build:android_x86_64 --config=android
 build:android_x86_64 --cpu=x86_64
 build:android_x86_64 --fat_apk_cpu=x86_64
 
+# Build everything statically for Android since all static libs are later
+# bundled together into a single .so for deployment.
+build:android --dynamic_mode=off
+
 # Sets the default Apple platform to macOS.
 build:macos --apple_platform_type=macos
 
 # gRPC on MacOS requires this #define
 build:macos --copt=-DGRPC_BAZEL_BUILD
 
+# Avoid hitting command line argument limit
+build:macos --features=archive_param_file
+
 # Settings for MacOS on ARM CPUs.
 build:macos_arm64 --cpu=darwin_arm64
+build:macos_arm64 --macos_minimum_os=11.0
 
 # Config to use a mostly-static build and disable modular op registration
 # support (this will revert to loading TensorFlow with RTLD_GLOBAL in Python).
 # By default, TensorFlow will build with a dependence on
 # //tensorflow:libtensorflow_framework.so.
 build:monolithic --define framework_shared_object=false
+build:monolithic --define tsl_protobuf_header_only=false
+build:monolithic --experimental_link_static_libraries_once=false # b/229868128
+
+# Please note that MKL on MacOS is still not supported.
+# If you would like to use a local MKL instead of downloading, please set the
+# environment variable "TF_MKL_ROOT" every time before build.
+build:mkl --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl --define=tensorflow_mkldnn_contraction_kernel=0
+build:mkl --define=build_with_openmp=true
+build:mkl -c opt
+
+# config to build OneDNN backend with a user specified threadpool.
+build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0
+build:mkl_threadpool --define=build_with_mkl_opensource=true
+build:mkl_threadpool -c opt
+
+# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
+build:mkl_aarch64 --define=build_with_mkl_aarch64=true
+build:mkl_aarch64 --define=build_with_openmp=true
+build:mkl_aarch64 --define=build_with_acl=true
+build:mkl_aarch64 -c opt
+
+# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
+# with Eigen threadpool support
+build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
+build:mkl_aarch64_threadpool -c opt
+
+# CUDA: This config refers to building CUDA op kernels with nvcc.
+build:cuda --repo_env TF_NEED_CUDA=1
+build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
+build:cuda --@local_config_cuda//:enable_cuda
+
+# CUDA: This config refers to building CUDA op kernels with clang.
+build:cuda_clang --config=cuda
+# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt
+build:cuda_clang --config=tensorrt
+build:cuda_clang --action_env=TF_CUDA_CLANG="1"
+build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
+# Select supported compute capabilities (supported graphics cards).
+# This is the same as the official TensorFlow builds.
+# See https://developer.nvidia.com/cuda-gpus#compute
+# `compute_XY` enables PTX embedding in addition to SASS. PTX
+# is forward compatible beyond the current compute capability major
+# release while SASS is only forward compatible inside the current
+# major release. Example: sm_80 kernels can run on sm_89 GPUs but
+# not on sm_90 GPUs. compute_80 kernels though can also run on sm_90 GPUs.
+build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_80,compute_90"
+
+# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
+build:cuda_clang_official --config=cuda_clang
+build:cuda_clang_official --action_env=TF_CUDA_VERSION="12"
+build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8"
+build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
+build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
+build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
+build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+build:cuda_clang_official --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
+
+# Build with nvcc for CUDA and clang for host
+build:nvcc_clang --config=cuda
+# Unfortunately, cuda_configure.bzl demands this for using nvcc + clang
+build:nvcc_clang --action_env=TF_CUDA_CLANG="1"
+build:nvcc_clang --action_env=TF_NVCC_CLANG="1"
+build:nvcc_clang --@local_config_cuda//:cuda_compiler=nvcc
+
+
+# Debug config
+build:dbg -c dbg
+# Only include debug info for files under tensorflow/, excluding kernels, to
+# reduce the size of the debug info in the binary. This is because if the debug
+# sections in the ELF binary are too large, errors can occur. See
+# https://github.com/tensorflow/tensorflow/issues/48919.
+# Users can still include debug info for a specific kernel, e.g. with:
+# --config=dbg --per_file_copt=+tensorflow/core/kernels/identity_op.*@-g
+# Since this .bazelrc file is synced between the tensorflow/tensorflow repo and
+# the openxla/xla repo, also include debug info for files under xla/.
+build:dbg --per_file_copt=+.*,-tensorflow.*,-xla.*@-g0
+build:dbg --per_file_copt=+tensorflow/core/kernels.*@-g0
+# for now, disable arm_neon. see: https://github.com/tensorflow/tensorflow/issues/33360
+build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON
+# AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498
+build:dbg --copt -DDEBUG_BUILD
+
+# Config to build TF TPU
+build:tpu --define=with_tpu_support=true
+build:tpu --define=framework_shared_object=true
+build:tpu --copt=-DLIBTPU_ON_GCE
+build:tpu --define=enable_mlir_bridge=true
+
+build:tensorrt --repo_env TF_NEED_TENSORRT=1
+
+build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain
+build:rocm --define=using_rocm_hipcc=true
+build:rocm --define=tensorflow_mkldnn_contraction_kernel=0
+build:rocm --repo_env TF_NEED_ROCM=1
 
 # Options to disable default on features
 build:noaws --define=no_aws_support=true
@@ -75,6 +201,37 @@ build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS
 
 # Don't trigger --config= when cross-compiling.
 build:android --noenable_platform_specific_config
+build:ios --noenable_platform_specific_config
+
+# Suppress all C++ compiler warnings, otherwise build logs become 10s of MBs.
+build:android --copt=-w
+build:ios --copt=-w
+build:linux --host_copt=-w
+build:macos --copt=-w
+build:windows --copt=/W0
+build:windows --host_copt=/W0
+
+# Suppress most C++ compiler warnings to reduce log size but allow
+# for specific warnings to still be present.
+build:linux --copt="-Wno-all"
+build:linux --copt="-Wno-extra"
+build:linux --copt="-Wno-deprecated"
+build:linux --copt="-Wno-deprecated-declarations"
+build:linux --copt="-Wno-ignored-attributes"
+build:linux --copt="-Wno-array-bounds"
+
+# Add unused-result as an error on Linux.
+build:linux --copt="-Wunused-result"
+build:linux --copt="-Werror=unused-result"
+# Add switch as an error on Linux.
+build:linux --copt="-Wswitch"
+build:linux --copt="-Werror=switch"
+# Required for building with clang
+build:linux --copt="-Wno-error=unused-but-set-variable"
+
+# Linux ARM64 specific options
+build:linux_arm64 --copt="-mtune=generic" --copt="-march=armv8-a" --copt="-O3"
+
 
 # On Windows, `__cplusplus` is wrongly defined without this switch
 # See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
@@ -89,8 +246,32 @@ build:windows --host_copt=/D_USE_MATH_DEFINES
 
 # Windows has a relatively short command line limit, which TF has begun to hit.
 # See https://docs.bazel.build/versions/main/windows.html
 build:windows --features=compiler_param_file
-
-# By default, build LCE in C++ 17 mode.
+build:windows --features=archive_param_file
+
+# Speed Windows compile times. Available in VS 16.4 (we are on 16.11). See
+# https://groups.google.com/a/tensorflow.org/d/topic/build/SsW98Eo7l3o/discussion
+build:windows --copt=/d2ReducedOptimizeHugeFunctions
+build:windows --host_copt=/d2ReducedOptimizeHugeFunctions
+
+# Enable the runfiles symlink tree on Windows. This makes it possible to build
+# the pip package on Windows without an intermediate data-file archive, as the
+# build_pip_package script in its current form (as of Aug 2023) uses the
+# runfiles symlink tree to decide what to put into the Python wheel.
+startup --windows_enable_symlinks
+build:windows --enable_runfiles
+
+# Default paths for TF_SYSTEM_LIBS
+build:linux --define=PREFIX=/usr
+build:linux --define=LIBDIR=$(PREFIX)/lib
+build:linux --define=INCLUDEDIR=$(PREFIX)/include
+build:linux --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include
+build:macos --define=PREFIX=/usr
+build:macos --define=LIBDIR=$(PREFIX)/lib
+build:macos --define=INCLUDEDIR=$(PREFIX)/include
+build:macos --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include
+# TF_SYSTEM_LIBS do not work on windows.
+
+# By default, build TF in C++ 17 mode.
 build:android --cxxopt=-std=c++17
 build:android --host_cxxopt=-std=c++17
 build:linux --cxxopt=-std=c++17
@@ -113,7 +294,7 @@ build:windows --copt=-DNOGDI
 build:windows --host_copt=-DNOGDI
 
 # MSVC (Windows): Standards-conformant preprocessor mode
-# See https://docs.microsoft.com/en-us/cpp/build/reference/zc-preprocessor
+# See https://docs.microsoft.com/en-us/cpp/preprocessor/preprocessor-experimental-overview
 build:windows --copt=/Zc:preprocessor
 build:windows --host_copt=/Zc:preprocessor
@@ -128,13 +309,45 @@ build:windows --host_linkopt=/OPT:ICF
 
 # Verbose failure logs when something goes wrong
 build:windows --verbose_failures
 
-# On windows, we never cross compile
-build:windows --distinct_host_configuration=false
+# Work around potential issues with large command lines on windows.
+# See: https://github.com/bazelbuild/bazel/issues/5163
+build:windows --features=compiler_param_file
+
+# Do not risk cache corruption. See:
+# https://github.com/bazelbuild/bazel/issues/3360
+build:linux --experimental_guard_against_concurrent_changes
+
+# Configure short or long logs
+build:short_logs --output_filter=DONT_MATCH_ANYTHING
+build:verbose_logs --output_filter=
+
+# Instruction set optimizations
+# TODO(gunan): Create a feature in toolchains for avx/avx2 to
+# avoid having to define linux/win separately.
+build:avx_linux --copt=-mavx
+build:avx_linux --host_copt=-mavx
+build:avx_win --copt=/arch:AVX
+
+# Use Clang-cl compiler on Windows
+build:win_clang --copt=/clang:-Weverything
+build:win_clang --extra_toolchains=@local_config_cc//:cc-toolchain-x64_windows-clang-cl
+build:win_clang --extra_execution_platforms=//tensorflow/tools/toolchains/win:x64_windows-clang-cl
+build:win_clang --host_platform=//tensorflow/tools/toolchains/win:x64_windows-clang-cl
+build:win_clang --compiler=clang-cl
+build:win_clang --linkopt=/FORCE:MULTIPLE
+build:win_clang --host_linkopt=/FORCE:MULTIPLE
+test:win_clang --linkopt=/FORCE:MULTIPLE
+test:win_clang --host_linkopt=/FORCE:MULTIPLE
 
 # Options to build TensorFlow 1.x or 2.x.
-build:v1 --define=tf_api_version=1 --action_env=TF2_BEHAVIOR=0
+# TODO(kanglan): Change v2's define to default behavior
 build:v2 --define=tf_api_version=2 --action_env=TF2_BEHAVIOR=1
 
+# Disable XLA on mobile.
+build:xla --define=with_xla_support=true # TODO: remove, it's on by default.
+build:android --define=with_xla_support=false
+build:ios --define=with_xla_support=false
+
 # Flag to enable remote config
 common --experimental_repo_remote_exec
@@ -147,48 +360,203 @@ build:elinux_armhf --config=elinux
 build:elinux_armhf --cpu=armhf
 build:elinux_armhf --copt -mfp16-format=ieee
 
-# Address sanitizer
-# CC=clang bazel build --config asan
-build:asan --strip=never
-build:asan --copt -fsanitize=address
-build:asan --copt -DADDRESS_SANITIZER
-build:asan --copt -g
-build:asan --copt -O3
-build:asan --copt -fno-omit-frame-pointer
-build:asan --linkopt -fsanitize=address
-
-# Memory sanitizer
-# CC=clang bazel build --config msan
-build:msan --strip=never
-build:msan --copt -fsanitize=memory
-build:msan --copt -DADDRESS_SANITIZER
-build:msan --copt -g
-build:msan --copt -O3
-build:msan --copt -fno-omit-frame-pointer
-build:msan --linkopt -fsanitize=memory
-
-# Undefined Behavior Sanitizer
-# CC=clang bazel build --config ubsan
-build:ubsan --strip=never
-build:ubsan --copt -fsanitize=undefined
-build:ubsan --copt -g
-build:ubsan --copt -O3
-build:ubsan --copt -fno-omit-frame-pointer
-build:ubsan --linkopt -fsanitize=undefined
-build:ubsan --linkopt -lubsan
-
-
-# Debug config
-build:dbg -c dbg
-# Only include debug info for files under tensorflow/, excluding kernels, to
-# reduce the size of the debug info in the binary. This is because if the debug
-# sections in the ELF binary are too large, errors can occur. See
-# https://github.com/tensorflow/tensorflow/issues/48919.
-# Users can still include debug info for a specific kernel, e.g. with:
-# --config=dbg --per_file_copt=+tensorflow/core/kernels/identity_op.*@-g
-build:dbg --per_file_copt=+.*,-tensorflow.*@-g0
-build:dbg --per_file_copt=+tensorflow/core/kernels.*@-g0
-# for now, disable arm_neon. see: https://github.com/tensorflow/tensorflow/issues/33360
-build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON
-# AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498
-build:dbg --copt -DDEBUG_BUILD
+# Config-specific options should come above this line.
+
+# Load rc file written by ./configure.
+try-import %workspace%/.tf_configure.bazelrc
+try-import %workspace%/xla_configure.bazelrc
+
+# Here are bazelrc configs for release builds
+# Build TensorFlow v2.
+test:release_base --test_size_filters=small,medium
+test:release_base --flaky_test_attempts=3
+
+# Target the AVX instruction set
+build:release_linux_base --config=avx_linux
+
+# Disable clang extension that rejects type definitions within offsetof.
+# This was added in clang-16 by https://reviews.llvm.org/D133574.
+# Can be removed once upb is updated, since a type definition is used within
+# offsetof in the current version of upb.
+# See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
+build:release_linux_base --copt=-Wno-gnu-offsetof-extensions
+build:release_linux_base --copt=-Wno-error=array-parameter
+build:release_linux_base --copt=-Wno-error=unused-command-line-argument
+# Set lld as the linker.
+build:release_linux_base --linkopt="-fuse-ld=lld"
+build:release_linux_base --linkopt="-lm"
+
+# We have some invalid linker scripts in the build,
+# so we need to disable this check
+build:release_linux_base --linkopt=-Wl,--undefined-version
+
+# Container environment settings below this point.
+# Use Python 3.X as installed in container image
+build:release_linux_base --action_env PYTHON_BIN_PATH="/usr/bin/python3"
+build:release_linux_base --action_env PYTHON_LIB_PATH="/usr/lib/tf_python"
+build:release_linux_base --python_path="/usr/bin/python3"
+# Set Clang as compiler. Use the actual path to clang installed in container.
+build:release_cpu_linux_base --repo_env=CC="/usr/lib/llvm-17/bin/clang"
+build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/clang"
+# Test-related settings below this point.
+test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
+test:release_linux_base --local_test_jobs=HOST_CPUS
+test:release_linux_base --test_env=LD_LIBRARY_PATH
+# Give only the list of failed tests at the end of the log
+test:release_linux_base --test_summary=short
+
+# Use the Clang toolchain to compile
+build:release_cpu_linux --config=release_linux_base
+build:release_cpu_linux --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
+
+build:release_gpu_linux --config=release_cpu_linux
+# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
+# Note that linux cpu and cuda builds share the same toolchain now.
+build:release_gpu_linux --config=cuda_clang_official
+test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+# Local test jobs have to be 4 because parallel_gpu_execute is fragile, I think
+test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute
+
+build:release_arm64_linux --config=release_linux_base
+build:release_arm64_linux --config=linux_arm64
+build:release_arm64_linux --crosstool_top="@ml2014_clang_aarch64_config_aarch64//crosstool:toolchain"
+build:release_arm64_linux --config=mkl_aarch64_threadpool
+build:release_arm64_linux --copt=-flax-vector-conversions
+test:release_arm64_linux --flaky_test_attempts=3
+
+# The old gcc linux build options are preserved in the unsupported_*_linux
+# configs. If your project fails to build with Clang, you can use these
+# unsupported flags to replace the release flags in your build command.
+# However, please note that the old toolchain is no longer officially supported
+# by TensorFlow and the unsupported configs will be removed soon b/299962977. We
+# strongly recommend that you migrate to Clang as your compiler for TensorFlow
+# Linux builds. Instructions are available in the official documentation:
+# https://www.tensorflow.org/install/source#install_clang_recommended_linux_only
+# Another good option is to use our Docker containers to build and test TF:
+# https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/tf_sig_build_dockerfiles.
+build:unsupported_cpu_linux --config=avx_linux
+build:unsupported_cpu_linux --crosstool_top="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain"
+test:unsupported_cpu_linux --test_env=LD_LIBRARY_PATH
+test:unsupported_cpu_linux --config=release_base
+
+build:unsupported_gpu_linux --config=cuda
+build:unsupported_gpu_linux --config=unsupported_cpu_linux
+build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11"
+build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8"
+build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80"
+build:unsupported_gpu_linux --config=tensorrt
+build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2"
+build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib"
+build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
+build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain
+
+build:release_cpu_macos --config=avx_linux
+test:release_cpu_macos --config=release_base
+
+# Base build configs for macOS
+build:release_macos_base --action_env DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer
+build:release_macos_base --define=no_nccl_support=true --output_filter=^$
+
+# Build configs for macOS x86
+build:release_macos_x86 --config=release_macos_base
+# Build with the AVX instruction set when on macOS x86
+build:release_macos_x86 --config=avx_linux
+build:release_macos_x86 --cpu=darwin
+# Target Catalina as the minimum compatible OS version
+build:release_macos_x86 --macos_minimum_os=10.15
+build:release_macos_x86 --action_env MACOSX_DEPLOYMENT_TARGET=10.15
+
+# Build configs for macOS Arm64
+build:release_macos_arm64 --config=release_macos_base
+build:release_macos_arm64 --cpu=darwin_arm64
+build:release_macos_arm64 --define=tensorflow_mkldnn_contraction_kernel=0
+# Target Monterey as the minimum compatible OS version
+build:release_macos_arm64 --macos_minimum_os=12.0
+build:release_macos_arm64 --action_env MACOSX_DEPLOYMENT_TARGET=12.0
+
+# Base test configs for macOS
+test:release_macos_base --verbose_failures=true --local_test_jobs=HOST_CPUS
+test:release_macos_base --test_timeout=300,450,1200,3600 --test_output=errors
+test:release_macos_base --build_tests_only --keep_going
+test:release_macos_base --flaky_test_attempts=3
+
+# Test configs for macOS x86
+test:release_macos_x86 --config=release_macos_base
+
+# Test configs for macOS Arm64
+test:release_macos_arm64 --config=release_macos_base
+
+# TODO(kanglan): Update windows configs after b/289091160 is fixed
+build:release_cpu_windows --config=avx_win
+build:release_cpu_windows --define=no_tensorflow_py_deps=true
+test:release_cpu_windows --config=release_base
+
+# Exclude TFRT integration for anything but Linux.
+build:android --config=no_tfrt
+build:macos --config=no_tfrt
+build:windows --config=no_tfrt
+build:rocm --config=no_tfrt
+build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/ir,tensorflow/compiler/mlir/tfrt/ir/mlrt,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ifrt,tensorflow/compiler/mlir/tfrt/tests/mlrt,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/compiler/mlir/tfrt/transforms/mlrt,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/runtime_fallback/test,tensorflow/core/runtime_fallback/test/gpu,tensorflow/core/runtime_fallback/test/saved_model,tensorflow/core/runtime_fallback/test/testdata,tensorflow/core/tfrt/stubs,tensorflow/core/tfrt/tfrt_session,tensorflow/core/tfrt/mlrt,tensorflow/core/tfrt/mlrt/attribute,tensorflow/core/tfrt/mlrt/kernel,tensorflow/core/tfrt/mlrt/bytecode,tensorflow/core/tfrt/mlrt/interpreter,tensorflow/compiler/mlir/tfrt/translate/mlrt,tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug,tensorflow/core/tfrt/saved_model/python,tensorflow/core/tfrt/graph_executor/python,tensorflow/core/tfrt/saved_model/utils
+
+# START CROSS-COMPILE CONFIGS
+# Set execution platform to Linux x86
+# Note: Lot of the "host_" flags such as "host_cpu" and "host_crosstool_top"
+# flags seem to be actually used to specify the execution platform details. It
+# seems it is this way because these flags are old and predate the distinction
+# between host and execution platform.
+build:cross_compile_base --host_cpu=k8
+build:cross_compile_base --host_crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite
+build:cross_compile_base --extra_execution_platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_x86_64
+
+build:rbe_cross_compile_base --config=rbe_base
+build:rbe_cross_compile_base --remote_instance_name=projects/tensorflow-testing/instances/default_instance
+
+# Test-related settings below this point
+# We cannot run cross-compiled tests on the remote Linux x86 VMs so we need to
+# force all tests to run locally on the Aarch64 host.
+test:rbe_cross_compile_base --strategy=TestRunner=local --build_tests_only
+test:rbe_cross_compile_base --verbose_failures=true --local_test_jobs=HOST_CPUS --test_output=errors
+
+# START LINUX AARCH64 CROSS-COMPILE CONFIGS
+build:cross_compile_linux_arm64 --config=cross_compile_base
+
+# Set the target CPU to Aarch64
+build:cross_compile_linux_arm64 --platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_aarch64
+build:cross_compile_linux_arm64 --cpu=aarch64
+build:cross_compile_linux_arm64 --crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite
+
+# RBE cross-compile configs for Linux Aarch64
+build:rbe_cross_compile_linux_arm64 --config=cross_compile_linux_arm64
+build:rbe_cross_compile_linux_arm64 --config=rbe_cross_compile_base
+test:rbe_cross_compile_linux_arm64 --config=rbe_cross_compile_base
+# END LINUX AARCH64 CROSS-COMPILE CONFIGS
+
+# START MACOS CROSS-COMPILE CONFIGS
+build:cross_compile_macos_x86 --config=cross_compile_base
+build:cross_compile_macos_x86 --config=nonccl
+# Target Catalina (10.15) as the minimum supported OS
+build:cross_compile_macos_x86 --action_env MACOSX_DEPLOYMENT_TARGET=10.15
+
+# Set the target CPU to Darwin x86
+build:cross_compile_macos_x86 --platforms=//tensorflow/tools/toolchains/cross_compile/config:darwin_x86_64
+build:cross_compile_macos_x86 --cpu=darwin
+build:cross_compile_macos_x86 --crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite
+# When RBE cross-compiling for macOS, we need to explicitly register the
+# toolchain. Otherwise, oddly, RBE complains that a "docker container must be
+# specified".
+build:cross_compile_macos_x86 --extra_toolchains=//tensorflow/tools/toolchains/cross_compile/config:macos-x86-cross-compile-cc-toolchain
+# Map --platforms=darwin_x86_64 to --cpu=darwin and vice-versa to make selects()
+# and transitions that use these flags work.
+build:cross_compile_macos_x86 --platform_mappings=tensorflow/tools/toolchains/cross_compile/config/platform_mappings
+
+# RBE cross-compile configs for Darwin x86
+build:rbe_cross_compile_macos_x86 --config=cross_compile_macos_x86
+build:rbe_cross_compile_macos_x86 --config=rbe_cross_compile_base
+test:rbe_cross_compile_macos_x86 --config=rbe_cross_compile_base
+# Increase the test timeout as tests often take longer on mac.
+test:rbe_cross_compile_macos_x86 --test_timeout=300,450,1200,3600
+# Limit jobs to 100 to avoid running into "out of memory" issues (b/316266643)
+build:rbe_cross_compile_macos_x86 --jobs=100
+test:rbe_cross_compile_macos_x86 --jobs=100
+# END MACOS CROSS-COMPILE CONFIGS
+# END CROSS-COMPILE CONFIGS
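The release workflows lean on the chained --config groups defined above: `release_cpu_linux` pulls in `release_linux_base`, which pulls in `avx_linux`, and so on. The hypothetical helper below (not part of the repo; flag lists abbreviated from this file) mimics how Bazel expands `--config` recursively, in order:

    # Hypothetical sketch of Bazel's recursive --config expansion, using an
    # abbreviated copy of the configs defined in this .tensorflow.bazelrc.
    CONFIGS = {
        "avx_linux": ["--copt=-mavx", "--host_copt=-mavx"],
        "release_linux_base": ["--config=avx_linux", "--linkopt=-fuse-ld=lld"],
        "release_cpu_linux": [
            "--config=release_linux_base",
            "--crosstool_top=@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain",
        ],
    }

    def expand(config):
        flags = []
        for flag in CONFIGS.get(config, []):
            if flag.startswith("--config="):
                flags += expand(flag[len("--config="):])
            else:
                flags.append(flag)
        return flags

    print(expand("release_cpu_linux"))
    # ['--copt=-mavx', '--host_copt=-mavx', '--linkopt=-fuse-ld=lld',
    #  '--crosstool_top=@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain']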
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b4a67876..36213e39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,6 +81,11 @@ if (COMPILE_BENCHMARK)
   get_directory_property(TFLITE_BENCHMARK_SRCS DIRECTORY ${TFLITE_SOURCE_DIR}/tools/benchmark DEFINITION TFLITE_BENCHMARK_SRCS)
   list(FILTER TFLITE_BENCHMARK_SRCS EXCLUDE REGEX benchmark_main.cc)
 
+  # The TSL dir is included in the tensorflow CMakeLists.txt but because we manually refer to those source files here we have to explicitly list this include directory again.
+  set(TSL_SOURCE_DIR "${TENSORFLOW_SOURCE_DIR}/third_party/xla/third_party/tsl")
+  include_directories(
+    ${TSL_SOURCE_DIR}
+  )
   add_executable(lce_benchmark_model ${TFLITE_BENCHMARK_SRCS}
     ${LCE_BENCHMARK_SRCS} ${LCE_BENCHMARK_HRDS}
diff --git a/WORKSPACE b/WORKSPACE
index c0b27150..42688952 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -16,13 +16,89 @@ http_archive(
     patches = [
         "//third_party/tensorflow_patches:disable_forced_mkl.patch",
     ],
-    sha256 = "e58c939079588623e6fa1d054aec2f90f95018266e0a970fd353a5244f5173dc",
-    strip_prefix = "tensorflow-2.13.0",
+    sha256 = "c729e56efc945c6df08efe5c9f5b8b89329c7c91b8f40ad2bb3e13900bd4876d",
+    strip_prefix = "tensorflow-2.16.1",
     urls = [
-        "https://github.com/tensorflow/tensorflow/archive/v2.13.0.tar.gz",
+        "https://github.com/tensorflow/tensorflow/archive/v2.16.1.tar.gz",
     ],
 )
 
+# We must initialize hermetic python first.
+http_archive(
+    name = "bazel_skylib",
+    sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506",
+    urls = [
+        "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+    ],
+)
+
+http_archive(
+    name = "rules_python",
+    sha256 = "9d04041ac92a0985e344235f5d946f71ac543f1b1565f2cdbc9a2aaee8adf55b",
+    strip_prefix = "rules_python-0.26.0",
+    url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz",
+)
+
+load("@rules_python//python:repositories.bzl", "py_repositories", "python_register_toolchains")
+
+py_repositories()
+
+load(
+    "@org_tensorflow//tensorflow/tools/toolchains/python:python_repo.bzl",
+    "python_repository",
+)
+
+python_repository(name = "python_version_repo")
+
+load("@python_version_repo//:py_version.bzl", "TF_PYTHON_VERSION")
+
+python_register_toolchains(
+    name = "python",
+    ignore_root_user_error = True,
+    python_version = TF_PYTHON_VERSION,
+)
+
+load("@python//:defs.bzl", "interpreter")
+load("@rules_python//python:pip.bzl", "package_annotation", "pip_parse")
+
+NUMPY_ANNOTATIONS = {
+    "numpy": package_annotation(
+        additive_build_content = """\
+filegroup(
+    name = "includes",
+    srcs = glob(["site-packages/numpy/core/include/**/*.h"]),
+)
+cc_library(
+    name = "numpy_headers",
+    hdrs = [":includes"],
+    strip_include_prefix="site-packages/numpy/core/include/",
+)
+""",
+    ),
+}
+
+pip_parse(
+    name = "pypi",
+    annotations = NUMPY_ANNOTATIONS,
+    python_interpreter_target = interpreter,
+    requirements = "@org_tensorflow//:requirements_lock_" + TF_PYTHON_VERSION.replace(".", "_") + ".txt",
+)
+
+load("@pypi//:requirements.bzl", tf_install_deps = "install_deps")
+
+tf_install_deps()
+
+pip_parse(
+    name = "pypi_lce",
+    python_interpreter_target = interpreter,
+    requirements = "//larq_compute_engine:requirements.txt",
+)
+
+load("@pypi_lce//:requirements.bzl", lce_install_deps = "install_deps")
+
+lce_install_deps()
+
 load("@org_tensorflow//tensorflow:workspace3.bzl", "tf_workspace3")
 
 tf_workspace3()
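With the hermetic-Python setup above, the interpreter and pip packages are resolved inside Bazel, and the TF_PYTHON_VERSION variable (exported by the workflows earlier in this diff) selects both the toolchain and the matching requirements lock file in the TensorFlow workspace. The name derivation mirrors the `pip_parse()` expression:

    # Sketch: how the pip_parse() call above derives the TensorFlow lock-file
    # label from TF_PYTHON_VERSION.
    TF_PYTHON_VERSION = "3.11"  # e.g. as exported by the release workflows

    lock_file = "requirements_lock_" + TF_PYTHON_VERSION.replace(".", "_") + ".txt"
    print("@org_tensorflow//:" + lock_file)
    # @org_tensorflow//:requirements_lock_3_11.txt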
PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.runfiles/larq_compute_engine/" fi diff --git a/configure.py b/configure.py index fd53129e..1aae365b 100755 --- a/configure.py +++ b/configure.py @@ -16,6 +16,7 @@ # limitations under the License. # ============================================================================== +import json import os import platform import re @@ -24,7 +25,7 @@ _LCE_BAZELRC = ".lce_configure.bazelrc" -_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] +_SUPPORTED_ANDROID_NDK_VERSIONS = [19, 20, 21, 25] _DEFAULT_PROMPT_ASK_ATTEMPTS = 10 @@ -570,29 +571,27 @@ def get_ndk_api_level(environ_cp, android_ndk_home_path): "errors.\n" % (android_ndk_home_path, ndk_version, _SUPPORTED_ANDROID_NDK_VERSIONS) ) + write_action_env_to_bazelrc("ANDROID_NDK_VERSION", ndk_version) # Now grab the NDK API level to use. Note that this is different from the # SDK API level, as the NDK API level is effectively the *min* target SDK # version. - platforms = os.path.join(android_ndk_home_path, "platforms") - api_levels = sorted(os.listdir(platforms)) - api_levels = [x.replace("android-", "") for x in api_levels if "android-" in x] - - def valid_api_level(api_level): - return os.path.exists( - os.path.join(android_ndk_home_path, "platforms", "android-" + api_level) - ) + meta = open(os.path.join(android_ndk_home_path, "meta/platforms.json")) + platforms = json.load(meta) + meta.close() + aliases = platforms["aliases"] + api_levels = sorted(list(set([aliases[i] for i in aliases]))) android_ndk_api_level = prompt_loop_or_load_from_env( environ_cp, var_name="ANDROID_NDK_API_LEVEL", - var_default='26', # 26 is required to support AHardwareBuffer. + var_default="26", # 26 is required to support AHardwareBuffer. ask_for_var=( "Please specify the (min) Android NDK API level to use. 
" "[Available levels: %s]" ) % api_levels, - check_success=valid_api_level, + check_success=(lambda *_: True), error_msg="Android-%s is not present in the NDK path.", ) diff --git a/larq_compute_engine/mlir/BUILD b/larq_compute_engine/mlir/BUILD index d174313b..c74cc190 100644 --- a/larq_compute_engine/mlir/BUILD +++ b/larq_compute_engine/mlir/BUILD @@ -1,5 +1,6 @@ -load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension", "tf_cc_binary") load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") +load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension", "tf_cc_binary") +load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement") package( default_visibility = ["//visibility:public"], @@ -470,8 +471,8 @@ cc_library( "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_config", "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_passes", "@org_tensorflow//tensorflow/compiler/mlir/tensorflow", - "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", - "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", + "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", ], ) @@ -486,13 +487,16 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@local_tsl//tsl/platform:statusor", "@org_tensorflow//tensorflow/compiler/mlir:op_or_arg_name_mapper", "@org_tensorflow//tensorflow/compiler/mlir/lite:flatbuffer_export", + "@org_tensorflow//tensorflow/compiler/mlir/lite/debug", "@org_tensorflow//tensorflow/compiler/mlir/lite/metrics:error_collector", "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "@org_tensorflow//tensorflow/compiler/mlir/lite/stablehlo:op_stat_pass", + "@org_tensorflow//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_util", "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:error_util", - "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_freeze_variables", - "@org_tensorflow//tensorflow/tsl/platform:statusor", + "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables", ], ) @@ -535,7 +539,7 @@ pybind_extension( "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:export_graphdef", "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:import_utils", "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", - "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", + "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", "@org_tensorflow//tensorflow/core:ops", "@pybind11", ], @@ -551,12 +555,15 @@ genrule( py_library( name = "converter", srcs = [ + "python/__init__.py", "python/converter.py", "python/util.py", ":tflite_schema_py", ], deps = [ ":_tf_tfl_flatbuffer", + lce_requirement("tensorflow"), + lce_requirement("flatbuffers"), ], ) diff --git a/larq_compute_engine/mlir/ir/lce_ops.h b/larq_compute_engine/mlir/ir/lce_ops.h index f19dd81b..0293e181 100644 --- a/larq_compute_engine/mlir/ir/lce_ops.h +++ b/larq_compute_engine/mlir/ir/lce_ops.h @@ -1,6 +1,7 @@ #ifndef LARQ_COMPUTE_ENGINE_MLIR_IR_LCE_OPS_H_ #define LARQ_COMPUTE_ENGINE_MLIR_IR_LCE_OPS_H_ +#include "mlir/Bytecode/BytecodeOpInterface.h" #include "mlir/Dialect/Quant/QuantTypes.h" #include "mlir/Interfaces/SideEffectInterfaces.h" diff --git 
diff --git a/larq_compute_engine/mlir/BUILD b/larq_compute_engine/mlir/BUILD
index d174313b..c74cc190 100644
--- a/larq_compute_engine/mlir/BUILD
+++ b/larq_compute_engine/mlir/BUILD
@@ -1,5 +1,6 @@
-load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension", "tf_cc_binary")
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension", "tf_cc_binary")
+load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -470,8 +471,8 @@ cc_library(
         "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_config",
         "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_passes",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow",
-        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tensorflow_passes",
-        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes",
     ],
 )
 
@@ -486,13 +487,16 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@local_tsl//tsl/platform:statusor",
         "@org_tensorflow//tensorflow/compiler/mlir:op_or_arg_name_mapper",
         "@org_tensorflow//tensorflow/compiler/mlir/lite:flatbuffer_export",
+        "@org_tensorflow//tensorflow/compiler/mlir/lite/debug",
         "@org_tensorflow//tensorflow/compiler/mlir/lite/metrics:error_collector",
         "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_config",
+        "@org_tensorflow//tensorflow/compiler/mlir/lite/stablehlo:op_stat_pass",
+        "@org_tensorflow//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_util",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:error_util",
-        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_freeze_variables",
-        "@org_tensorflow//tensorflow/tsl/platform:statusor",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables",
     ],
 )
 
@@ -535,7 +539,7 @@ pybind_extension(
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:export_graphdef",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:import_utils",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags",
-        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes",
         "@org_tensorflow//tensorflow/core:ops",
         "@pybind11",
     ],
@@ -551,12 +555,15 @@ genrule(
 py_library(
     name = "converter",
     srcs = [
+        "python/__init__.py",
         "python/converter.py",
         "python/util.py",
         ":tflite_schema_py",
     ],
     deps = [
         ":_tf_tfl_flatbuffer",
+        lce_requirement("tensorflow"),
+        lce_requirement("flatbuffers"),
     ],
 )
diff --git a/larq_compute_engine/mlir/ir/lce_ops.h b/larq_compute_engine/mlir/ir/lce_ops.h
index f19dd81b..0293e181 100644
--- a/larq_compute_engine/mlir/ir/lce_ops.h
+++ b/larq_compute_engine/mlir/ir/lce_ops.h
@@ -1,6 +1,7 @@
 #ifndef LARQ_COMPUTE_ENGINE_MLIR_IR_LCE_OPS_H_
 #define LARQ_COMPUTE_ENGINE_MLIR_IR_LCE_OPS_H_
 
+#include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/Dialect/Quant/QuantTypes.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
diff --git a/larq_compute_engine/mlir/python/common.cc b/larq_compute_engine/mlir/python/common.cc
index 83eb2f3c..a380a747 100644
--- a/larq_compute_engine/mlir/python/common.cc
+++ b/larq_compute_engine/mlir/python/common.cc
@@ -76,7 +76,7 @@ pybind11::bytes ConvertMLIRModuleToTFLiteFlatBuffer(
     const LCETarget target, const pybind11::object& default_ranges,
     const std::unordered_set<std::string>& saved_model_tags,
     llvm::StringRef saved_model_dir,
-    llvm::Optional<tensorflow::Session*> session, const int num_inputs,
+    std::optional<tensorflow::Session*> session, const int num_inputs,
     const bool should_quantize, const bool mark_as_post_training_quant) {
   mlir::quant::QuantizationSpecs quant_specs;
   if (should_quantize) {
@@ -86,9 +86,9 @@ pybind11::bytes ConvertMLIRModuleToTFLiteFlatBuffer(
     // we do that by default.
     quant_specs.inference_type = tensorflow::DT_QINT8;
     for (int i = 0; i < num_inputs; ++i) {
-      // Input inference type is DT_FLOAT, so set the default input ranges
-      // to llvm::None.
-      quant_specs.input_ranges.push_back({llvm::None, llvm::None});
+      // Input inference type is DT_FLOAT, so set the default input range to
+      // None.
+      quant_specs.input_ranges.push_back({std::nullopt, std::nullopt});
     }
     if (!default_ranges.is_none()) {
       // When there are no Quantize nodes in the graph then in the
diff --git a/larq_compute_engine/mlir/python/common.h b/larq_compute_engine/mlir/python/common.h
index 31b34e7f..e1f3c433 100644
--- a/larq_compute_engine/mlir/python/common.h
+++ b/larq_compute_engine/mlir/python/common.h
@@ -1,3 +1,5 @@
+#include <optional>
+
 #include "larq_compute_engine/mlir/transforms/passes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
@@ -17,7 +19,7 @@ pybind11::bytes ConvertMLIRModuleToTFLiteFlatBuffer(
     const LCETarget target, const pybind11::object& default_ranges,
     const std::unordered_set<std::string>& saved_model_tags,
     llvm::StringRef saved_model_dir,
-    llvm::Optional<tensorflow::Session*> session, const int num_inputs,
+    std::optional<tensorflow::Session*> session, const int num_inputs,
     const bool should_quantize, const bool mark_as_post_training_quant);
 
 }  // namespace tensorflow
diff --git a/larq_compute_engine/mlir/python/graphdef_tfl_flatbuffer.cc b/larq_compute_engine/mlir/python/graphdef_tfl_flatbuffer.cc
index 220da098..6ada05a5 100644
--- a/larq_compute_engine/mlir/python/graphdef_tfl_flatbuffer.cc
+++ b/larq_compute_engine/mlir/python/graphdef_tfl_flatbuffer.cc
@@ -27,14 +27,13 @@ pybind11::bytes ConvertGraphDefToTFLiteFlatBuffer(
 
   auto target = GetLCETarget(target_str);
 
-  // `ParseInputArrayInfo` requires a type that isn't pybind compatible, so
-  // translate here.
-  std::vector<llvm::Optional<std::vector<int>>> translated_input_shapes;
+  // Convert empty shapes to `None`. We could also do that on the python side.
+  std::vector<std::optional<std::vector<int>>> translated_input_shapes;
   for (auto x : input_shapes) {
     if (x.size() > 0) {
       translated_input_shapes.push_back(x);
     } else {
-      translated_input_shapes.push_back(llvm::None);
+      translated_input_shapes.push_back(std::nullopt);
     }
   }
 
@@ -65,7 +64,7 @@ pybind11::bytes ConvertGraphDefToTFLiteFlatBuffer(
   return ConvertMLIRModuleToTFLiteFlatBuffer(
       &module.value(), context, target, default_ranges,
       /*saved_model_tags=*/{},
-      /*saved_model_dir=*/"", /*session=*/llvm::None, input_arrays.size(),
+      /*saved_model_dir=*/"", /*session=*/std::nullopt, input_arrays.size(),
       should_quantize, /*mark_as_post_training_quant=*/false);
 }
diff --git a/larq_compute_engine/mlir/tests/bitpack-weights.mlir b/larq_compute_engine/mlir/tests/bitpack-weights.mlir
index bb419438..96905736 100644
--- a/larq_compute_engine/mlir/tests/bitpack-weights.mlir
+++ b/larq_compute_engine/mlir/tests/bitpack-weights.mlir
@@ -7,6 +7,6 @@ func.func @bitpack_bconv2d_filters(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor
   return %0 : tensor<256x30x30x16xf32>
 
   // CHECK: %cst = arith.constant dense<0> : tensor<16x3x3x1xi32>
-  // CHECK: %0 = "lq.Bconv2d"(%arg0, %cst, %arg1, %arg2, %arg3) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<16x3x3x1xi32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32>
+  // CHECK: %0 = "lq.Bconv2d"(%arg0, %cst, %arg1, %arg2, %arg3) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<16x3x3x1xi32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32>
   // CHECK-NEXT: return %0
 }
diff --git a/larq_compute_engine/mlir/tests/const-fold.mlir b/larq_compute_engine/mlir/tests/const-fold.mlir
index db8ded4e..984faa12 100644
--- a/larq_compute_engine/mlir/tests/const-fold.mlir
+++ b/larq_compute_engine/mlir/tests/const-fold.mlir
@@ -8,8 +8,8 @@ func.func @quantize() -> (tensor<1x1x2x1xi32>, tensor<1x1x2x1xi32>) {
   %1 = "lq.Quantize"(%neg) {} : (tensor<1x1x2x32xf32>) -> tensor<1x1x2x1xi32>
   return %0, %1 : tensor<1x1x2x1xi32>, tensor<1x1x2x1xi32>
 
-  // CHECK: %[[neg:.*]] = arith.constant dense<-1> : tensor<1x1x2x1xi32>
   // CHECK: %[[pos:.*]] = arith.constant dense<0> : tensor<1x1x2x1xi32>
+  // CHECK: %[[neg:.*]] = arith.constant dense<-1> : tensor<1x1x2x1xi32>
   // CHECK: return %[[pos]], %[[neg]] : tensor<1x1x2x1xi32>, tensor<1x1x2x1xi32>
 }
 
@@ -21,7 +21,7 @@ func.func @dequantize() -> (tensor<1x1x2x32xf32>, tensor<1x1x2x32xf32>) {
   %1 = "lq.Dequantize"(%neg) {} : (tensor<1x1x2x1xi32>) -> tensor<1x1x2x32xf32>
   return %0, %1 : tensor<1x1x2x32xf32>, tensor<1x1x2x32xf32>
 
-  // CHECK: %[[neg:.*]] = arith.constant dense<-1.000000e+00> : tensor<1x1x2x32xf32>
   // CHECK: %[[pos:.*]] = arith.constant dense<1.000000e+00> : tensor<1x1x2x32xf32>
+  // CHECK: %[[neg:.*]] = arith.constant dense<-1.000000e+00> : tensor<1x1x2x32xf32>
   // CHECK: return %[[pos]], %[[neg]] : tensor<1x1x2x32xf32>, tensor<1x1x2x32xf32>
 }
diff --git a/larq_compute_engine/mlir/tests/lce_ops_options_test.cc b/larq_compute_engine/mlir/tests/lce_ops_options_test.cc
index 381435b1..c4cd0853 100644
--- a/larq_compute_engine/mlir/tests/lce_ops_options_test.cc
+++ b/larq_compute_engine/mlir/tests/lce_ops_options_test.cc
IntegerAttr getIntegerAttr(Builder builder, int value) { TEST(LCEOpsSerializationTest, QuantizeTest) { MLIRContext context; context.getOrLoadDialect<lq::LarqDialect>(); - auto* op = Operation::create( - UnknownLoc::get(&context), OperationName("lq.Quantize", &context), - llvm::None, llvm::None, llvm::None, llvm::None, 0); + OperationState state(UnknownLoc::get(&context), + OperationName("lq.Quantize", &context)); + mlir::Operation* op = Operation::create(state); ASSERT_EQ(cast<lq::QuantizeOp>(op).buildCustomOptions().size(), 0); } @@ -26,9 +26,9 @@ TEST(LCEOpsSerializationTest, QuantizeTest) { TEST(LCEOpsSerializationTest, DequantizeTest) { MLIRContext context; context.getOrLoadDialect<lq::LarqDialect>(); - auto* op = Operation::create( - UnknownLoc::get(&context), OperationName("lq.Dequantize", &context), - llvm::None, llvm::None, llvm::None, llvm::None, 0); + OperationState state(UnknownLoc::get(&context), + OperationName("lq.Dequantize", &context)); + mlir::Operation* op = Operation::create(state); ASSERT_EQ(cast<lq::DequantizeOp>(op).buildCustomOptions().size(), 0); } @@ -37,9 +37,9 @@ TEST(LCEOpsSerializationTest, BConv2dTest) { MLIRContext context; context.getOrLoadDialect<lq::LarqDialect>(); Builder builder(&context); - auto op = Operation::create(UnknownLoc::get(&context), - OperationName("lq.Bconv2d", &context), llvm::None, - llvm::None, llvm::None, llvm::None, 0); + OperationState state(UnknownLoc::get(&context), + OperationName("lq.Bconv2d", &context)); + mlir::Operation* op = Operation::create(state); op->setAttr("channels_in", getIntegerAttr(builder, 64)); op->setAttr("dilation_height_factor", getIntegerAttr(builder, 3)); @@ -69,9 +69,9 @@ TEST(LCEOpsSerializationTest, BMaxPool2dTest) { MLIRContext context; context.getOrLoadDialect<lq::LarqDialect>(); Builder builder(&context); - auto op = Operation::create( - UnknownLoc::get(&context), OperationName("lq.BMaxPool2d", &context), - llvm::None, llvm::None, llvm::None, llvm::None, 0); + OperationState state(UnknownLoc::get(&context), + OperationName("lq.BMaxPool2d", &context)); + mlir::Operation* op = Operation::create(state); op->setAttr("padding", builder.getStringAttr("SAME")); op->setAttr("stride_width", getIntegerAttr(builder, 2)); diff --git a/larq_compute_engine/mlir/tests/legalize-lce.mlir b/larq_compute_engine/mlir/tests/legalize-lce.mlir index 21767a41..cb6a62ee 100644 --- a/larq_compute_engine/mlir/tests/legalize-lce.mlir +++ b/larq_compute_engine/mlir/tests/legalize-lce.mlir @@ -9,7 +9,7 @@ func.func @legalize_bconv2d(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor<16x3x3 // CHECK: %0 = "tfl.custom"(%arg0, %arg1, %arg2, %arg3, %arg4) {custom_code = "LceBconv2d", custom_option = #tfl} : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32> // CHECK-NEXT: return %0 - // TRANSLATE: %0 = "lq.Bconv2d"(%arg0, %arg1, %arg2, %arg3, %arg4) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32> + // TRANSLATE: %0 = "lq.Bconv2d"(%arg0, %arg1, %arg2, %arg3, %arg4) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32> //
TRANSLATE-NEXT: return %0 : tensor<256x30x30x16xf32> } @@ -21,7 +21,7 @@ func.func @legalize_bmax_pool2d(%arg0: tensor<256x32x32x3xi32>) -> tensor<256x16 // CHECK: %0 = "tfl.custom"(%arg0) {custom_code = "LceBMaxPool2d", custom_option = #tfl} : (tensor<256x32x32x3xi32>) -> tensor<256x16x16x3xi32> // CHECK-NEXT: return %0 - // TRANSLATE: %0 = "lq.BMaxPool2d"(%arg0) {filter_height = 2 : i32, filter_width = 2 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 2 : i32} : (tensor<256x32x32x3xi32>) -> tensor<256x16x16x3xi32> + // TRANSLATE: %0 = "lq.BMaxPool2d"(%arg0) <{filter_height = 2 : i32, filter_width = 2 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 2 : i32}> : (tensor<256x32x32x3xi32>) -> tensor<256x16x16x3xi32> // TRANSLATE-NEXT: return %0 : tensor<256x16x16x3xi32> } diff --git a/larq_compute_engine/mlir/tests/optimize.mlir b/larq_compute_engine/mlir/tests/optimize.mlir index c1f0efda..6b2b06a7 100644 --- a/larq_compute_engine/mlir/tests/optimize.mlir +++ b/larq_compute_engine/mlir/tests/optimize.mlir @@ -109,7 +109,7 @@ func.func @fuse_relu_into_bconv2d(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor< %1 = "tfl.relu"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: return %0 } @@ -121,7 +121,7 @@ func.func @fuse_relu6_into_bconv2d(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor %1 = "tfl.relu6"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU6", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU6", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: return %0 } @@ -133,7 +133,7 @@ func.func @fuse_relu1_into_bconv2d(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor %1 = "tfl.relu_n1_to_1"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU_N1_TO_1", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU_N1_TO_1", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: return %0 } @@ -145,7 +145,7 @@ func.func 
@fuse_relu_into_bconv2d_padding_same_one(%arg0: tensor<256x32x32x1xi32 %1 = "tfl.relu"(%0) : (tensor<256x32x32x16xf32>) -> tensor<256x32x32x16xf32> return %1 : tensor<256x32x32x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: return %0 } @@ -157,7 +157,7 @@ func.func @do_not_fuse_relu_into_bconv2d_padding_same_zero(%arg0: tensor<256x32x %1 = "tfl.relu"(%0) : (tensor<256x32x32x16xf32>) -> tensor<256x32x32x16xf32> return %1 : tensor<256x32x32x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: %1 = "tfl.relu"(%0) // CHECK-NEXT: return %1 } @@ -170,7 +170,7 @@ func.func @do_not_fuse_relu_into_bconv2d_no_post_activation_bias(%arg0: tensor<2 %1 = "tfl.relu"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: %1 = "tfl.relu"(%0) // CHECK-NEXT: return %1 } @@ -183,7 +183,7 @@ func.func @do_not_fuse_relu_into_bconv2d_no_post_activation_multiplier(%arg0: te %1 = "tfl.relu"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: %1 = "tfl.relu"(%0) // CHECK-NEXT: return %1 } @@ -195,7 +195,7 @@ func.func @target_specific_reorder_maxpool_2d_quantize(%arg0: tensor<256x32x32x6 return %1 : tensor<256x16x8x3xi32> // CHECK-ARM-NEXT: %0 = "lq.Quantize"(%arg0) : (tensor<256x32x32x65xf32>) -> tensor<256x32x32x3xi32> - // 
CHECK-ARM-NEXT: %1 = "lq.BMaxPool2d"(%0) {filter_height = 3 : i32, filter_width = 2 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 4 : i32} : (tensor<256x32x32x3xi32>) -> tensor<256x16x8x3xi32> + // CHECK-ARM-NEXT: %1 = "lq.BMaxPool2d"(%0) <{filter_height = 3 : i32, filter_width = 2 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 4 : i32}> : (tensor<256x32x32x3xi32>) -> tensor<256x16x8x3xi32> // CHECK-ARM-NEXT: return %1 // CHECK-XCORE-NEXT: %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 2 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 4 : i32} : (tensor<256x32x32x65xf32>) -> tensor<256x16x8x65xf32> @@ -236,7 +236,7 @@ func.func @bitpack_activation_thresholds_with_negative_post_multipliers(%arg0: t // Verify correct thresholds. These have been manually computed. // CHECK-NEXT: %cst_0 = arith.constant dense<[0, 3, 2, 2, -2147483648, 2, 1, 2]> : tensor<8xi32> - // CHECK-NEXT: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) {channels_in = 1 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<8x2x2x1xf32>, none, none, tensor<8xi32>) -> tensor<256x32x32x1xi32> + // CHECK-NEXT: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) <{channels_in = 1 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<8x2x2x1xf32>, none, none, tensor<8xi32>) -> tensor<256x32x32x1xi32> // CHECK-NEXT: return %1 } @@ -250,7 +250,7 @@ func.func @bitpack_activations_valid_padding(%arg0: tensor<256x32x32x1xi32>) -> %2 = "lq.Quantize"(%1) : (tensor<256x30x30x65xf32>) -> tensor<256x30x30x3xi32> return %2 : tensor<256x30x30x3xi32> - // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, none, none, tensor<65xi32>) -> tensor<256x30x30x3xi32> + // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, none, none, tensor<65xi32>) -> tensor<256x30x30x3xi32> // CHECK-NEXT: return %1 } @@ -264,7 +264,7 @@ func.func @bitpack_activations_same_one_padding(%arg0: tensor<256x32x32x1xi32>) %2 = "lq.Quantize"(%1) : (tensor<256x32x32x65xf32>) -> tensor<256x32x32x3xi32> return %2 : tensor<256x32x32x3xi32> - // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, none, none, tensor<65xi32>) -> tensor<256x32x32x3xi32> + // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = 
"NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, none, none, tensor<65xi32>) -> tensor<256x32x32x3xi32> // CHECK-NEXT: return %1 } @@ -278,7 +278,7 @@ func.func @do_not_bitpack_activations_same_zero_padding(%arg0: tensor<256x32x32x %2 = "lq.Quantize"(%1) : (tensor<256x32x32x65xf32>) -> tensor<256x32x32x3xi32> return %2 : tensor<256x32x32x3xi32> - // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %cst_0, %cst_1, %0) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, tensor<65xf32>, tensor<65xf32>, none) -> tensor<256x32x32x65xf32> + // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %cst_0, %cst_1, %0) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, tensor<65xf32>, tensor<65xf32>, none) -> tensor<256x32x32x65xf32> // CHECK-NEXT: %2 = "lq.Quantize"(%1) : (tensor<256x32x32x65xf32>) -> tensor<256x32x32x3xi32> // CHECK-NEXT: return %2 } @@ -293,7 +293,7 @@ func.func @do_not_bitpack_activations_multiple_uses(%arg0: tensor<256x32x32x1xi3 %2 = "lq.Quantize"(%1) : (tensor<256x30x30x65xf32>) -> tensor<256x30x30x3xi32> return %1, %2: tensor<256x30x30x65xf32>, tensor<256x30x30x3xi32> - // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %cst_0, %cst_1, %0) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, tensor<65xf32>, tensor<65xf32>, none) -> tensor<256x30x30x65xf32> + // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %cst_0, %cst_1, %0) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, tensor<65xf32>, tensor<65xf32>, none) -> tensor<256x30x30x65xf32> // CHECK-NEXT: %2 = "lq.Quantize"(%1) : (tensor<256x30x30x65xf32>) -> tensor<256x30x30x3xi32> // CHECK-NEXT: return %1, %2 } diff --git a/larq_compute_engine/mlir/tests/prepare-tf.mlir b/larq_compute_engine/mlir/tests/prepare-tf.mlir index 22fa7194..31a54106 100644 --- a/larq_compute_engine/mlir/tests/prepare-tf.mlir +++ b/larq_compute_engine/mlir/tests/prepare-tf.mlir @@ -84,12 +84,13 @@ func.func @fuse_bconv2d_valid_padding(%arg0: tensor<1x112x112x1xi32>) -> tensor< %1 = "tf.Conv2D"(%0, %cst) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x112x112x2xf32>, tensor<1x3x2x2xf32>) -> tensor<1x112x110x2xf32> return %1 : tensor<1x112x110x2xf32> - // CHECK: %cst = arith.constant - // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<2xf32> - // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> // CHECK: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[transpose:.*]] = "tf.Transpose" - // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], 
%[[output_threshold:.*]]) {channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x112x112x1xi32>, tensor<2x1x3x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x110x2xf32> + // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> + // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<2xf32> + // CHECK: %[[weights:.*]] = arith.constant + // CHECK: %[[transpose_idx:.*]] = arith.constant {{.*}} tensor<4xi32> + // CHECK: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) <{channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x112x112x1xi32>, tensor<2x1x3x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x110x2xf32> // CHECK-NEXT: return %[[conv]] } @@ -100,12 +101,13 @@ func.func @target_specific_fuse_bconv2d_same_zero_padding(%arg0: tensor<1x112x11 %1 = "tf.Conv2D"(%0, %cst) {padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x112x112x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x112x112x2xf32> return %1 : tensor<1x112x112x2xf32> - // CHECK-ARM: %cst = arith.constant - // CHECK-ARM: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<2xf32> - // CHECK-ARM: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> // CHECK-ARM: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK-ARM: %[[transpose:.*]] = "tf.Transpose" - // CHECK-ARM-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) {channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x112x112x1xi32>, tensor<2x1x2x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x112x2xf32> + // CHECK-ARM: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> + // CHECK-ARM: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<2xf32> + // CHECK-ARM: %[[weights:.*]] = arith.constant + // CHECK-ARM: %[[transpose_idx:.*]] = arith.constant {{.*}} tensor<4xi32> + // CHECK-ARM: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-ARM-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) <{channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x112x112x1xi32>, tensor<2x1x2x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x112x2xf32> // CHECK-ARM-NEXT: return %[[conv]] // CHECK-XCORE: %0 = "lq.Dequantize" @@ -120,12 +122,13 @@ func.func @fuse_bconv2d_grouped_convolution(%arg0: tensor<1x112x112x4xi32>) -> t %1 = "tf.Conv2D"(%0, %cst) {padding = "VALID", strides = [1, 1, 1, 1]} 
: (tensor<1x112x112x128xf32>, tensor<3x3x64x16xf32>) -> tensor<1x110x110x16xf32> return %1 : tensor<1x110x110x16xf32> - // CHECK: %cst = arith.constant - // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<16xf32> - // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> // CHECK: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[transpose:.*]] = "tf.Transpose" - // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) {channels_in = 128 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x112x112x4xi32>, tensor<16x3x3x64xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<1x110x110x16xf32> + // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> + // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<16xf32> + // CHECK: %[[weights:.*]] = arith.constant + // CHECK: %[[transpose_idx:.*]] = arith.constant {{.*}} tensor<4xi32> + // CHECK: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) <{channels_in = 128 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x112x112x4xi32>, tensor<16x3x3x64xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<1x110x110x16xf32> // CHECK-NEXT: return %[[conv]] } @@ -148,12 +151,13 @@ func.func @fuse_scaled_bconv2d(%arg0: tensor<1x112x112x1xi32>) -> tensor<1x112x1 %1 = "tf.Conv2D"(%0, %cst) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x112x112x2xf32>, tensor<1x3x2x2xf32>) -> tensor<1x112x110x2xf32> return %1 : tensor<1x112x110x2xf32> - // CHECK: %cst = arith.constant - // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<[3.000000e-01, 1.000000e-01]> : tensor<2xf32> - // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> // CHECK: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[transpose:.*]] = "tf.Transpose" - // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) {channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x112x112x1xi32>, tensor<2x1x3x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x110x2xf32> + // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> + // CHECK: %[[transpose_idx:.*]] = arith.constant {{.*}} tensor<4xi32> + // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<[3.000000e-01, 1.000000e-01]> : tensor<2xf32> + // CHECK: %[[weights:.*]] = "tf.Div" + // CHECK: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], 
%[[output_threshold:.*]]) <{channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x112x112x1xi32>, tensor<2x1x3x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x110x2xf32> // CHECK-NEXT: return %[[conv]] } @@ -169,11 +173,13 @@ func.func @fuse_dilated_bconv(%arg0: tensor<1x128x128x1xi32>) -> tensor<1x128x12 %3 = "tf.BatchToSpaceND"(%2, %cst, %cst_0) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> return %3 : tensor<1x128x128x8xf32> - // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<8xf32> - // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> // CHECK: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[transpose:.*]] = "tf.Transpose" - // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) {channels_in = 3 : i32, dilation_height_factor = 2 : i32, dilation_width_factor = 2 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x128x128x1xi32>, tensor<8x5x5x3xf32>, tensor<8xf32>, tensor<8xf32>, none) -> tensor<1x128x128x8xf32> + // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> + // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<8xf32> + // CHECK: %[[weights:.*]] = arith.constant + // CHECK: %[[transpose_idx:.*]] = arith.constant + // CHECK: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) <{channels_in = 3 : i32, dilation_height_factor = 2 : i32, dilation_width_factor = 2 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x128x128x1xi32>, tensor<8x5x5x3xf32>, tensor<8xf32>, tensor<8xf32>, none) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return %[[conv]] : tensor<1x128x128x8xf32> } @@ -213,11 +219,11 @@ func.func @fuse_bconv2d_same_one_padding(%arg0: tensor<256x32x32x1xi32>) -> tens %2 = "tf.Conv2D"(%1, %cst) {padding = "VALID", strides = [1, 2, 2, 1]} : (tensor<256x34x34x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x16x16x16xf32> return %2 : tensor<256x16x16x16xf32> - // CHECK: %[[CST1:.*]] = arith.constant dense<1.000000e+00> : tensor<16xf32> - // CHECK: %[[CST2:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> // CHECK: %[[CST3:.*]] = "tfl.no_value"() {value} : () -> none + // CHECK: %[[CST2:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> + // CHECK: %[[CST1:.*]] = arith.constant dense<1.000000e+00> : tensor<16xf32> // CHECK: %[[TRP:.*]] = "tf.Transpose" - // CHECK: %[[CONV:.*]] = "lq.Bconv2d"(%arg0, %[[TRP]], %[[CST1]], %[[CST2]], %[[CST3:.*]]) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 2 : i32} : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x16x16x16xf32> + // CHECK: %[[CONV:.*]] = "lq.Bconv2d"(%arg0, 
%[[TRP]], %[[CST1]], %[[CST2]], %[[CST3:.*]]) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 2 : i32}> : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x16x16x16xf32> } // CHECK-LABEL: @do_not_fuse_bconv2d_padding_same_twice diff --git a/larq_compute_engine/mlir/tf_tfl_passes.cc b/larq_compute_engine/mlir/tf_tfl_passes.cc index 5cc1dbf5..7139955a 100644 --- a/larq_compute_engine/mlir/tf_tfl_passes.cc +++ b/larq_compute_engine/mlir/tf_tfl_passes.cc @@ -108,7 +108,7 @@ void AddPreVariableFreezingTFToLCETFLConversionPasses( // This decomposes resource ops like ResourceGather into read-variable op // followed by gather. This is used when the saved model import path is used - // during which resources dont get frozen in the python layer. + // during which resources don't get frozen in the python layer. pass_manager->addNestedPass<mlir::func::FuncOp>( mlir::TFDevice::CreateDecomposeResourceOpsPass()); @@ -257,7 +257,9 @@ void AddPostVariableFreezingTFToLCETFLConversionPasses( // Run quantization after all the floating point model conversion is // completed. - if (quant_specs.RunPropagationAndRewriteQuantizationPasses()) { + if (quant_specs.RunPropagationAndRewriteQuantizationPasses() || + quant_specs.qdq_conversion_mode != + mlir::quant::QDQConversionMode::kQDQNone) { AddQuantizationPasses(quant_specs, *pass_manager); // Remove unnecessary QDQs while handling QAT models. pass_manager->addNestedPass<mlir::func::FuncOp>( diff --git a/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.cc b/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.cc index 90141a73..be080cef 100644 --- a/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.cc +++ b/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.cc @@ -3,15 +3,19 @@ #include "larq_compute_engine/mlir/tf_tfl_passes.h" #include "larq_compute_engine/mlir/transforms/passes.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Pass/PassManager.h" +#include "tensorflow/compiler/mlir/lite/debug/debug.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/op.h" -#include "tensorflow/tsl/platform/statusor.h" +#include "tsl/platform/statusor.h" namespace tensorflow { namespace { @@ -55,79 +59,68 @@ class TruncateOpOrArgLocNameMapper : public OpOrArgLocNameMapper { }; } // namespace -Status ConvertTFExecutorToTFLOrFlatbuffer( +absl::Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, const LCETarget target, mlir::quant::QuantizationSpecs quant_specs, const std::unordered_set<std::string>& saved_model_tags, llvm::StringRef saved_model_dir, - llvm::Optional<tensorflow::Session*> session, std::string* result) { + std::optional<tensorflow::Session*> session, std::string* result) { // Explicitly disable dumping Op details on failures. module.getContext()->printOpOnDiagnostic(false); - // Register a warning handler only log to std out.
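Editorial aside — the hunk continues below: the custom warning-logging diagnostic handler is deleted, and the converter instead registers the func-dialect extensions up front (newer MLIR moves, e.g., the inliner hooks for the func dialect into an extension that must be registered explicitly). In isolation, the registration pattern looks like this sketch; the helper name is hypothetical:

#include "mlir/Dialect/Func/Extensions/AllExtensions.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"

void RegisterFuncExtensions(mlir::MLIRContext& context) {
  mlir::DialectRegistry registry;
  mlir::func::registerAllExtensions(registry);
  // appendDialectRegistry may be called on a context that is already in
  // use; already-loaded dialects pick up the newly registered extensions.
  context.appendDialectRegistry(registry);
}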
- mlir::ScopedDiagnosticHandler s( - module.getContext(), [](mlir::Diagnostic& diag) { - if (diag.getSeverity() == mlir::DiagnosticSeverity::Warning) { - for (auto& note : diag.getNotes()) { - std::cout << note.str() << "\n"; - LOG(WARNING) << note.str() << "\n"; - } - } - return mlir::failure(); - }); + mlir::DialectRegistry registry; + mlir::func::registerAllExtensions(registry); + module.getContext()->appendDialectRegistry(registry); mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), /*propagate=*/true); - if (failed(IsValidGraph(module))) { - return statusHandler.ConsumeStatus(); - } - mlir::PassManager pass_manager(module.getContext()); + mlir::registerPassManagerCLOptions(); if (mlir::failed(mlir::applyPassManagerCLOptions(pass_manager))) { - // We don't return here as in the normal TF converter, since apparently this - // actually fails in our case, but the failure isn't terminal. - // return tensorflow::FromAbslStatus( - // absl::UnknownError("failed to apply MLIR pass manager CL options")); + return absl::InternalError("Failed to apply MLIR pass manager CL options."); } + // DebugOptions::ir_dump_dir can be set for debugging + converter::DebugOptions debug_options; + InitPassManager(pass_manager, debug_options); + pass_manager.addInstrumentation( std::make_unique<mlir::TFL::ErrorCollectorInstrumentation>( pass_manager.getContext())); + if (mlir::failed(IsValidGraph(module))) { + return statusHandler.ConsumeStatus(); + } tensorflow::AddPreVariableFreezingTFToLCETFLConversionPasses(&pass_manager); - if (failed(pass_manager.run(module))) { + if (mlir::failed(pass_manager.run(module))) { return statusHandler.ConsumeStatus(); } // Freeze variables if a session is provided. - if (session.has_value()) { - mlir::TFL::ErrorCollectorInstrumentation collector(module.getContext()); - if (mlir::failed( - mlir::tf_saved_model::FreezeVariables(module, session.value()))) { - auto status = statusHandler.ConsumeStatus(); - mlir::TFL::ErrorCollector* collector = - mlir::TFL::ErrorCollector::GetErrorCollector(); - if (!collector->CollectedErrors().empty()) { - return errors::InvalidArgument("Variable constant folding has failed."); - } - return status; - } + if (session.has_value() && mlir::failed(mlir::tf_saved_model::FreezeVariables( module, session.value_or(nullptr)))) { + return statusHandler.Combine( + absl::InvalidArgumentError("Variable constant folding failed.")); } + pass_manager.clear(); + tensorflow::AddPostVariableFreezingTFToLCETFLConversionPasses( saved_model_dir, quant_specs, &pass_manager, target); - if (failed(pass_manager.run(module))) { - auto status = statusHandler.ConsumeStatus(); - mlir::TFL::ErrorCollector* collector = - mlir::TFL::ErrorCollector::GetErrorCollector(); - for (const auto& error_data : collector->CollectedErrors()) { - if (error_data.subcomponent() == "FreezeGlobalTensorsPass") { - return errors::InvalidArgument("Variable constant folding is failed."); - } - } - return status; + if (mlir::failed(pass_manager.run(module))) { + return statusHandler.Combine( + absl::InvalidArgumentError("Variable constant folding failed.")); } if (export_to_mlir) { + pass_manager.clear(); + // Print out a detailed report of ops that are not converted to TFL ops.
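Editorial aside — alongside the op-statistics report added just below for the export_to_mlir path, the conversion function now returns absl::Status, with statusHandler.Combine(...) folding collected MLIR diagnostics into the failure status. A hedged sketch of only the absl side of that contract; CheckPassResult is a hypothetical name:

#include "absl/status/status.h"

absl::Status CheckPassResult(bool pass_succeeded) {
  if (!pass_succeeded) {
    // In the patch, statusHandler.Combine(absl::InvalidArgumentError(...))
    // additionally merges any diagnostics emitted by the failing passes.
    return absl::InvalidArgumentError("Variable constant folding failed.");
  }
  return absl::OkStatus();
}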
+ pass_manager.addPass(mlir::odml::createPrintOpStatsPass( + mlir::odml::GetAcceptedTFLiteDialects())); + if (mlir::failed(pass_manager.run(module))) { + return statusHandler.ConsumeStatus(); + } + llvm::raw_string_ostream os(*result); module.print(os); return statusHandler.ConsumeStatus(); @@ -142,14 +135,18 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( options.toco_flags = toco_flags; options.saved_model_tags = saved_model_tags; options.op_or_arg_name_mapper = &op_or_arg_name_mapper; - if (!tflite::MlirToFlatBufferTranslateFunction(module, options, result)) { - return statusHandler.ConsumeStatus(); + const bool serialize_stablehlo_ops = false; + if (!tflite::MlirToFlatBufferTranslateFunction(module, options, result, + serialize_stablehlo_ops)) { + return statusHandler.Combine( + absl::InternalError("Could not translate MLIR to FlatBuffer.")); } - if (mlir::failed(module.verify())) { - return tensorflow::errors::Unknown("Final module is invalid"); + if (mlir::failed(module.verifyInvariants())) { + return statusHandler.Combine( + absl::InternalError("Final module is invalid.")); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.h b/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.h index cf89d353..e40eec8b 100644 --- a/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.h +++ b/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.h @@ -1,24 +1,26 @@ #ifndef LARQ_COMPUTE_ENGINE_MLIR_TF_TO_TFL_FLATBUFFER_H_ #define LARQ_COMPUTE_ENGINE_MLIR_TF_TO_TFL_FLATBUFFER_H_ +#include <optional> #include <unordered_set> #include "larq_compute_engine/mlir/transforms/passes.h" #include "mlir/IR/BuiltinOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/core/public/session.h" -#include "tensorflow/tsl/platform/statusor.h" +#include "tsl/platform/statusor.h" + namespace tensorflow { // This is a fork of ConvertTFExecutorToTFLOrFlatbuffer to enable custom // OpOrArgLocNameMapper // https://github.com/tensorflow/tensorflow/blob/v2.8.0/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h#L60-L78 -Status ConvertTFExecutorToTFLOrFlatbuffer( +absl::Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, const LCETarget target, mlir::quant::QuantizationSpecs quant_specs, const std::unordered_set<std::string>& saved_model_tags, llvm::StringRef saved_model_dir, - llvm::Optional<tensorflow::Session*> session, std::string* result); + std::optional<tensorflow::Session*> session, std::string* result); } // namespace tensorflow #endif // LARQ_COMPUTE_ENGINE_MLIR_TF_TO_TFL_FLATBUFFER_H_ diff --git a/larq_compute_engine/mlir/transforms/bitpack_activations_patterns.td b/larq_compute_engine/mlir/transforms/bitpack_activations_patterns.td index c8dda3c2..f6beab39 100644 --- a/larq_compute_engine/mlir/transforms/bitpack_activations_patterns.td +++ b/larq_compute_engine/mlir/transforms/bitpack_activations_patterns.td @@ -55,6 +55,6 @@ class WriteBitpackedActivationsPat; - [(HasOneUse $output)], (addBenefit 100)>; + [(HasOneUse $output)]>; def : WriteBitpackedActivationsPat; def : WriteBitpackedActivationsPat; diff --git a/larq_compute_engine/mlir/transforms/fuse_padding.td b/larq_compute_engine/mlir/transforms/fuse_padding.td index 0aab22ae..57d7be0a 100644 --- a/larq_compute_engine/mlir/transforms/fuse_padding.td +++ b/larq_compute_engine/mlir/transforms/fuse_padding.td @@ -43,8 +43,7 @@ def : Pat<(TFL_Conv2DOp:$conv_output [(HasOneUse $pad_output), (NoBatchAndChannelPadding $paddings), (SamePaddingHeight $paddings, $input, $conv_output, $stride_h), - (SamePaddingWidth $paddings, $input, $conv_output, 
$stride_w)], - (addBenefit 100)>; + (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)]>; // PadV2 > Conv2D @@ -74,8 +73,7 @@ def : Pat<(TFL_Conv2DOp:$conv_output (ConstFloatValueIs<"0.0"> $pad_values), (NoBatchAndChannelPadding $paddings), (SamePaddingHeight $paddings, $input, $conv_output, $stride_h), - (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)], - (addBenefit 100)>; + (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)]>; // Pad > DepthwiseConv2D def : Pat<(TFL_DepthwiseConv2DOp:$conv_output @@ -104,8 +102,7 @@ def : Pat<(TFL_DepthwiseConv2DOp:$conv_output [(HasOneUse $pad_output), (NoBatchAndChannelPadding $paddings), (SamePaddingHeight $paddings, $input, $conv_output, $stride_h), - (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)], - (addBenefit 100)>; + (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)]>; // PadV2 > DepthwiseConv2D def : Pat<(TFL_DepthwiseConv2DOp:$conv_output @@ -136,5 +133,4 @@ def : Pat<(TFL_DepthwiseConv2DOp:$conv_output (ConstFloatValueIs<"0.0"> $pad_values), (NoBatchAndChannelPadding $paddings), (SamePaddingHeight $paddings, $input, $conv_output, $stride_h), - (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)], - (addBenefit 100)>; + (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)]>; diff --git a/larq_compute_engine/mlir/transforms/optimize.cc b/larq_compute_engine/mlir/transforms/optimize.cc index 8b43a790..9646a3d0 100644 --- a/larq_compute_engine/mlir/transforms/optimize.cc +++ b/larq_compute_engine/mlir/transforms/optimize.cc @@ -4,7 +4,6 @@ #include "larq_compute_engine/mlir/ir/lce_ops.h" #include "larq_compute_engine/mlir/transforms/common.h" #include "larq_compute_engine/mlir/transforms/passes.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" #include "mlir/Dialect/Func/IR/FuncOps.h" diff --git a/larq_compute_engine/mlir/transforms/optimize_patterns_common.td b/larq_compute_engine/mlir/transforms/optimize_patterns_common.td index 27c8de45..9bc11fa0 100644 --- a/larq_compute_engine/mlir/transforms/optimize_patterns_common.td +++ b/larq_compute_engine/mlir/transforms/optimize_patterns_common.td @@ -13,13 +13,13 @@ def HasOneUse : Constraint>; class ConstantValue : AttrConstraint>; +// This pattern has priority (addBenefit) over the more generic pattern below def : Pat<(LQ_QuantizeOp (TFL_GreaterEqualOp:$ge_op $input, (Arith_ConstantOp ConstantValue<"0.0f">))), (LQ_QuantizeOp $input), - [(HasOneUse $ge_op)], - (addBenefit 150)>; + [(HasOneUse $ge_op)], [], (addBenefit 100)>; def : Pat<(LQ_QuantizeOp (TFL_GreaterEqualOp:$ge_op @@ -27,15 +27,13 @@ def : Pat<(LQ_QuantizeOp $threshold)), (LQ_QuantizeOp (TFL_SubOp $input, $threshold, TFL_AF_None)), - [(HasOneUse $ge_op)], - (addBenefit 100)>; + [(HasOneUse $ge_op)]>; def : Pat<(LQ_QuantizeOp (TFL_LessEqualOp:$ge_op $lhs, $rhs)), (LQ_QuantizeOp (TFL_GreaterEqualOp $rhs, $lhs)), - [(HasOneUse $ge_op)], - (addBenefit 100)>; + [(HasOneUse $ge_op)]>; // TODO: Check shapes before fusing multiclass FuseAddOrSubWithBConv2D { @@ -70,7 +68,7 @@ multiclass FuseAddOrSubWithBConv2D { $padding, $stride_height, $stride_width), - [(HasOneUse $output)], (addBenefit 100)>; + [(HasOneUse $output)]>; } foreach binaryOp = [TFL_AddOp, TFL_SubOp] in defm : FuseAddOrSubWithBConv2D; @@ -109,7 +107,7 @@ multiclass FuseMulOrDivWithBConv2D { $padding, $stride_height, $stride_width), - [(HasOneUse $conv_output)], (addBenefit 100)>; + [(HasOneUse $conv_output)]>; } foreach binaryOp = [TFL_DivOp, 
TFL_MulOp] in defm : FuseMulOrDivWithBConv2D; @@ -146,7 +144,7 @@ multiclass FuseActFnIntoConvOpPat { $padding, $stride_height, $stride_width), - [(HasOneUse $conv_output)], (addBenefit 100)>; + [(HasOneUse $conv_output)]>; def : Pat<(ActFnOp (LQ_Bconv2dOp:$conv_output $input, @@ -176,7 +174,7 @@ multiclass FuseActFnIntoConvOpPat { $padding, $stride_height, $stride_width), - [(HasOneUse $conv_output)], (addBenefit 100)>; + [(HasOneUse $conv_output)]>; } foreach actFnPair = [[TFL_ReluOp, TFL_AF_Relu], [TFL_Relu1Op, TFL_AF_Relu1], diff --git a/larq_compute_engine/mlir/transforms/prepare_patterns_common.td b/larq_compute_engine/mlir/transforms/prepare_patterns_common.td index 430c0e49..3eed4cf6 100644 --- a/larq_compute_engine/mlir/transforms/prepare_patterns_common.td +++ b/larq_compute_engine/mlir/transforms/prepare_patterns_common.td @@ -37,7 +37,7 @@ multiclass QuantDequantPatterns { $select_op, $select_op, /*use 32bit*/ConstBoolAttrFalse)))), - [], (addBenefit 100)>; + []>; def : Pat<(SelectOp:$select_op $cond, (Arith_ConstantOp ConstantValue<"-1.0f">), @@ -51,7 +51,7 @@ multiclass QuantDequantPatterns { $select_op, $select_op, /*use 32bit*/ConstBoolAttrFalse)))), - [], (addBenefit 100)>; + []>; } foreach SelectOp = [TF_SelectOp, TF_SelectV2Op] in defm : QuantDequantPatterns; @@ -59,9 +59,9 @@ foreach SelectOp = [TF_SelectOp, TF_SelectV2Op] in // A fallback for the old version of `ste_sign` that uses a specific `tf.sign` // based implementation of `larq.math.sign`. def : Pat<(TF_SignOp (TF_AddV2Op (TF_SignOp $arg), $c)), - (LQ_DequantizeOp (LQ_QuantizeOp $arg)), [], (addBenefit 100)>; + (LQ_DequantizeOp (LQ_QuantizeOp $arg)), []>; def : Pat<(TF_SignOp (TF_AddV2Op $c, (TF_SignOp $arg))), - (LQ_DequantizeOp (LQ_QuantizeOp $arg)), [], (addBenefit 100)>; + (LQ_DequantizeOp (LQ_QuantizeOp $arg)), []>; // Copied from legalize_patterns.td class I32VectorElementsAttr : ElementsAttrBase< @@ -123,8 +123,7 @@ class PrepareBConvPadValue0Pat : ExtractI32At<1>:$strides, ExtractI32At<2>:$strides), [(BinaryFilter $filter), - (ValidFilterShape $dequantized_input, $filter_op)], - (addBenefit 90)>; + (ValidFilterShape $dequantized_input, $filter_op)]>; def : PrepareBConvPadValue0Pat; def ConstFloatValueIsOne : Constraint< @@ -166,5 +165,4 @@ def : Pat<(TF_Conv2DOp:$output [(BinaryFilter $filter), (ConstFloatValueIsOne $pad_values), (SamePadding $paddings, $input, $output, $strides), - (ValidFilterShape $dequantized_input, $filter_op)], - (addBenefit 90)>; + (ValidFilterShape $dequantized_input, $filter_op)]>; diff --git a/larq_compute_engine/requirements.in b/larq_compute_engine/requirements.in new file mode 100644 index 00000000..5ab513f8 --- /dev/null +++ b/larq_compute_engine/requirements.in @@ -0,0 +1,7 @@ +tensorflow==2.16.1 +tf-keras==2.16.0 +tensorflow-datasets +larq +tqdm +pytest +googleapis-common-protos<2,>=1.52.0 # dependency of tensorflow-datasets, somehow not picked up by pip-compile diff --git a/larq_compute_engine/requirements.txt b/larq_compute_engine/requirements.txt new file mode 100644 index 00000000..2da7226b --- /dev/null +++ b/larq_compute_engine/requirements.txt @@ -0,0 +1,191 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-emit-index-url --strip-extras larq_compute_engine/requirements.in +# +absl-py==2.1.0 + # via + # array-record + # etils + # keras + # tensorboard + # tensorflow + # tensorflow-datasets + # tensorflow-metadata +array-record==0.5.1 + # via tensorflow-datasets +astunparse==1.6.3 + 
# via tensorflow +certifi==2024.6.2 + # via requests +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via tensorflow-datasets +dm-tree==0.1.8 + # via tensorflow-datasets +docstring-parser==0.16 + # via simple-parsing +etils==1.7.0 + # via + # array-record + # tensorflow-datasets +exceptiongroup==1.2.1 + # via pytest +flatbuffers==24.3.25 + # via tensorflow +fsspec==2024.6.0 + # via etils +gast==0.5.4 + # via tensorflow +google-pasta==0.2.0 + # via tensorflow +googleapis-common-protos==1.63.1 + # via -r larq_compute_engine/requirements.in +grpcio==1.64.1 + # via + # tensorboard + # tensorflow +h5py==3.11.0 + # via + # keras + # tensorflow +idna==3.7 + # via requests +immutabledict==4.2.0 + # via tensorflow-datasets +importlib-resources==6.4.0 + # via etils +iniconfig==2.0.0 + # via pytest +keras==3.3.3 + # via tensorflow +larq==0.13.3 + # via -r larq_compute_engine/requirements.in +libclang==18.1.1 + # via tensorflow +markdown==3.6 + # via tensorboard +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via werkzeug +mdurl==0.1.2 + # via markdown-it-py +ml-dtypes==0.3.2 + # via + # keras + # tensorflow +namex==0.0.8 + # via keras +numpy==1.26.4 + # via + # etils + # h5py + # keras + # larq + # ml-dtypes + # opt-einsum + # pyarrow + # tensorboard + # tensorflow + # tensorflow-datasets +opt-einsum==3.3.0 + # via tensorflow +optree==0.11.0 + # via keras +packaging==24.1 + # via + # larq + # pytest + # tensorflow +pluggy==1.5.0 + # via pytest +promise==2.3 + # via tensorflow-datasets +protobuf==3.20.3 + # via + # googleapis-common-protos + # tensorboard + # tensorflow + # tensorflow-datasets + # tensorflow-metadata +psutil==5.9.8 + # via tensorflow-datasets +pyarrow==16.1.0 + # via tensorflow-datasets +pygments==2.18.0 + # via rich +pytest==8.2.2 + # via -r larq_compute_engine/requirements.in +requests==2.32.3 + # via + # tensorflow + # tensorflow-datasets +rich==13.7.1 + # via keras +simple-parsing==0.1.5 + # via tensorflow-datasets +six==1.16.0 + # via + # astunparse + # google-pasta + # promise + # tensorboard + # tensorflow +tensorboard==2.16.2 + # via tensorflow +tensorboard-data-server==0.7.2 + # via tensorboard +tensorflow==2.16.1 + # via + # -r larq_compute_engine/requirements.in + # tf-keras +tensorflow-datasets==4.9.6 + # via -r larq_compute_engine/requirements.in +tensorflow-io-gcs-filesystem==0.37.0 + # via tensorflow +tensorflow-metadata==1.15.0 + # via tensorflow-datasets +termcolor==2.4.0 + # via + # tensorflow + # tensorflow-datasets +terminaltables==3.1.10 + # via larq +tf-keras==2.16.0 + # via -r larq_compute_engine/requirements.in +toml==0.10.2 + # via tensorflow-datasets +tomli==2.0.1 + # via pytest +tqdm==4.66.4 + # via + # -r larq_compute_engine/requirements.in + # etils + # tensorflow-datasets +typing-extensions==4.12.2 + # via + # etils + # optree + # simple-parsing + # tensorflow +urllib3==2.2.1 + # via requests +werkzeug==3.0.3 + # via tensorboard +wheel==0.43.0 + # via astunparse +wrapt==1.16.0 + # via + # tensorflow + # tensorflow-datasets +zipp==3.19.2 + # via etils + +# The following packages are considered to be unsafe in a requirements file: +setuptools==70.0.0 + # via + # tensorboard + # tensorflow diff --git a/larq_compute_engine/tests/BUILD b/larq_compute_engine/tests/BUILD index 0356f630..0a77e4a7 100644 --- a/larq_compute_engine/tests/BUILD +++ b/larq_compute_engine/tests/BUILD @@ -1,3 +1,7 @@ +load("@pypi//:requirements.bzl", tf_requirement = "requirement") +load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement") 
+load("//larq_compute_engine/tests:qemu_test.bzl", "lce_qemu_test_suite") + package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -8,15 +12,23 @@ exports_files([ "test_aarch64_binary.sh", ]) -load("//larq_compute_engine/tests:qemu_test.bzl", "lce_qemu_test_suite") - py_test( name = "end2end_test", size = "large", - srcs = ["end2end_test.py"], + srcs = [ + "end2end_test.py", + "preprocess.py", + ], deps = [ "//larq_compute_engine/mlir:converter", "//larq_compute_engine/tflite/python:interpreter", + tf_requirement("numpy"), + lce_requirement("larq"), + lce_requirement("pytest"), + lce_requirement("tensorflow"), + lce_requirement("tensorflow_datasets"), + lce_requirement("tf-keras"), + lce_requirement("importlib_resources"), ], ) @@ -25,6 +37,10 @@ py_test( srcs = ["strip_lcedequantize_test.py"], deps = [ "//larq_compute_engine/mlir:converter", + lce_requirement("larq"), + lce_requirement("pytest"), + lce_requirement("tensorflow"), + lce_requirement("tf-keras"), ], ) diff --git a/larq_compute_engine/tests/end2end_test.py b/larq_compute_engine/tests/end2end_test.py index cddad71e..f2ca87fe 100644 --- a/larq_compute_engine/tests/end2end_test.py +++ b/larq_compute_engine/tests/end2end_test.py @@ -15,7 +15,7 @@ ) from larq_compute_engine.tflite.python.interpreter import Interpreter -from preprocess import preprocess_image_tensor, IMAGE_SIZE +from larq_compute_engine.tests.preprocess import preprocess_image_tensor, IMAGE_SIZE def convert_keras_model_as_saved_model(model, **kwargs): diff --git a/larq_compute_engine/tflite/java/build_lce_aar.sh b/larq_compute_engine/tflite/java/build_lce_aar.sh index 2a4b9b47..68e83be9 100755 --- a/larq_compute_engine/tflite/java/build_lce_aar.sh +++ b/larq_compute_engine/tflite/java/build_lce_aar.sh @@ -16,11 +16,8 @@ VERSION=$(git describe --tags) BUILDER="${BUILDER:-bazel}" BASEDIR=larq_compute_engine/tflite -CROSSTOOL="//external:android/crosstool" -HOST_CROSSTOOL="@bazel_tools//tools/cpp:toolchain" -BUILD_OPTS="-c opt --fat_apk_cpu=x86,x86_64,arm64-v8a" -CROSSTOOL_OPTS="--crosstool_top=$CROSSTOOL --host_crosstool_top=$HOST_CROSSTOOL" +BUILD_OPTS="-c opt --config=android_arm64 --fat_apk_cpu=x86,x86_64,arm64-v8a" test -d $BASEDIR || (echo "Aborting: not at top-level build directory"; exit 1) diff --git a/larq_compute_engine/tflite/python/BUILD b/larq_compute_engine/tflite/python/BUILD index 3a25e789..2f57661e 100644 --- a/larq_compute_engine/tflite/python/BUILD +++ b/larq_compute_engine/tflite/python/BUILD @@ -1,5 +1,7 @@ load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension") load("@org_tensorflow//tensorflow/lite:build_def.bzl", "tflite_linkopts") +load("@pypi//:requirements.bzl", tf_requirement = "requirement") +load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement") package( default_visibility = ["//visibility:public"], @@ -35,6 +37,10 @@ py_library( "__init__.py", "interpreter_base.py", ], + deps = [ + tf_requirement("numpy"), + lce_requirement("tqdm"), + ], ) py_library( diff --git a/larq_compute_engine/tflite/tests/BUILD b/larq_compute_engine/tflite/tests/BUILD index aea9b72c..0f013592 100644 --- a/larq_compute_engine/tflite/tests/BUILD +++ b/larq_compute_engine/tflite/tests/BUILD @@ -1,3 +1,6 @@ +load("@pypi//:requirements.bzl", tf_requirement = "requirement") +load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement") + package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -77,9 +80,13 @@ cc_test( py_test( name = "interpreter_test", + 
size = "small", srcs = ["interpreter_test.py"], deps = [ "//larq_compute_engine/tflite/python:interpreter", + tf_requirement("numpy"), + lce_requirement("pytest"), + lce_requirement("tensorflow"), ], ) diff --git a/third_party/install_android.sh b/third_party/install_android.sh index 0245e09c..3ba0a37e 100755 --- a/third_party/install_android.sh +++ b/third_party/install_android.sh @@ -4,12 +4,16 @@ set -e # **NOTE**: This requires Java 8 and won't work on never versions. See: # https://stackoverflow.com/questions/46402772/failed-to-install-android-sdk-java-lang-noclassdeffounderror-javax-xml-bind-a +# Taken from tensorflow/lite/tools/tflite-android.Dockerfile + # default LCE Android Env. variables -export ANDROID_SDK_URL="https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip" +export ANDROID_SDK_URL="https://dl.google.com/android/repository/commandlinetools-linux-6858069_latest.zip" export ANDROID_HOME="/tmp/lce_android" -export ANDROID_VERSION=29 -export ANDROID_BUILD_TOOLS_VERSION=30.0.2 -export ANDROID_NDK_VERSION=21.4.7075529 +export ANDROID_API_LEVEL=30 +export ANDROID_BUILD_TOOLS_VERSION=31.0.0 +export ANDROID_NDK_VERSION=25.2.9519653 +export ANDROID_NDK_API_LEVEL=30 + # download android SDK mkdir -p $ANDROID_HOME; cd $ANDROID_HOME; @@ -19,24 +23,27 @@ curl -o lce_android_sdk.zip $ANDROID_SDK_URL; echo -e "DONE.\n\n" echo -e "Unpacking Android SDK ... " -unzip lce_android_sdk.zip; +unzip lce_android_sdk.zip -d /tmp +mkdir -p ${ANDROID_HOME}/cmdline-tools +mv /tmp/cmdline-tools ${ANDROID_HOME}/cmdline-tools/latest echo -e "DONE.\n\n" rm lce_android_sdk.zip; # install android platform and build tools echo -e "Updating SDK manager ... " -yes | $ANDROID_HOME/tools/bin/sdkmanager --licenses -$ANDROID_HOME/tools/bin/sdkmanager --update +yes | $ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager --licenses +$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager --update echo -e "DONE.\n\n" echo -e "Installing Android SDK Platform and Build Tools ... " -$ANDROID_HOME/tools/bin/sdkmanager "build-tools;${ANDROID_BUILD_TOOLS_VERSION}" \ - "platforms;android-${ANDROID_VERSION}" \ +$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager \ + "build-tools;${ANDROID_BUILD_TOOLS_VERSION}" \ + "platforms;android-${ANDROID_API_LEVEL}" \ "platform-tools" echo -e "DONE.\n\n" echo -e "Installing Android NDK ... 
" -$ANDROID_HOME/tools/bin/sdkmanager \ +$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager \ "ndk;${ANDROID_NDK_VERSION}" echo -e "DONE.\n\n" diff --git a/third_party/tensorflow b/third_party/tensorflow index 1cb1a030..5bc9d266 160000 --- a/third_party/tensorflow +++ b/third_party/tensorflow @@ -1 +1 @@ -Subproject commit 1cb1a030a62b169d90d34c747ab9b09f332bf905 +Subproject commit 5bc9d26649cca274750ad3625bd93422617eed4b diff --git a/third_party/tensorflow_patches/disable_forced_mkl.patch b/third_party/tensorflow_patches/disable_forced_mkl.patch index 8791d659..ea60601f 100644 --- a/third_party/tensorflow_patches/disable_forced_mkl.patch +++ b/third_party/tensorflow_patches/disable_forced_mkl.patch @@ -1,29 +1,27 @@ -diff --git a/tensorflow/tsl/mkl/build_defs.bzl b/tensorflow/tsl/mkl/build_defs.bzl -index eaa0b2dbde7..9d709f8abf5 100644 ---- a/tensorflow/tsl/mkl/build_defs.bzl -+++ b/tensorflow/tsl/mkl/build_defs.bzl +diff --git a/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl b/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl +index 90030a39744..489ebaa5aa7 100644 +--- a/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl ++++ b/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl @@ -33,8 +33,9 @@ def if_mkl(if_true, if_false = []): """ return select({ - "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_aarch64": if_true, -- "@org_tensorflow//tensorflow/tsl:linux_x86_64": if_true, -- "@org_tensorflow//tensorflow/tsl:windows": if_true, -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_lnx_x64": if_true, -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_lnx_openmp": if_true, -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_windows_openmp": if_true, + "@local_tsl//tsl/mkl:build_with_mkl_aarch64": if_true, +- "@local_tsl//tsl:linux_x86_64": if_true, +- "@local_tsl//tsl:windows": if_true, ++ "@local_tsl//tsl/mkl:build_with_mkl_lnx_x64": if_true, ++ "@local_tsl//tsl/mkl:build_with_mkl_lnx_openmp": if_true, ++ "@local_tsl//tsl/mkl:build_with_mkl_windows_openmp": if_true, "//conditions:default": if_false, }) - -@@ -102,9 +103,9 @@ def mkl_deps(): + +@@ -102,8 +103,8 @@ def mkl_deps(): """ return select({ - "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_aarch64": ["@mkl_dnn_acl_compatible//:mkl_dnn_acl"], -- "@org_tensorflow//tensorflow/tsl:linux_x86_64_with_onednn_v2": ["@mkl_dnn_v1//:mkl_dnn"], -- "@org_tensorflow//tensorflow/tsl:linux_x86_64_with_onednn_v3": ["@onednn_v3//:mkl_dnn"], -- "@org_tensorflow//tensorflow/tsl:windows": ["@mkl_dnn_v1//:mkl_dnn"], -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_lnx_x64": ["@mkl_dnn_v1//:mkl_dnn"], -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_lnx_openmp": ["@mkl_dnn_v1//:mkl_dnn"], -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_windows_openmp": ["@mkl_dnn_v1//:mkl_dnn"], + "@local_tsl//tsl/mkl:build_with_mkl_aarch64": ["@mkl_dnn_acl_compatible//:mkl_dnn_acl"], +- "@local_tsl//tsl:linux_x86_64": ["@onednn//:mkl_dnn"], +- "@local_tsl//tsl:windows": ["@onednn//:mkl_dnn"], ++ "@local_tsl//tsl/mkl:build_with_mkl_lnx_x64": ["@onednn//:mkl_dnn"], ++ "@local_tsl//tsl/mkl:build_with_mkl_windows_openmp": ["@onednn//:mkl_dnn"], "//conditions:default": [], }) - +