diff --git a/.bazelversion b/.bazelversion
index 03f488b0..f22d756d 100644
--- a/.bazelversion
+++ b/.bazelversion
@@ -1 +1 @@
-5.3.0
+6.5.0
diff --git a/.github/tools/release_linux.sh b/.github/tools/release_linux.sh
index 2fd22064..b697de6f 100755
--- a/.github/tools/release_linux.sh
+++ b/.github/tools/release_linux.sh
@@ -3,13 +3,17 @@ set -e -x
 
 python configure.py
 
-# Build
-bazel build :build_pip_pkg \
+# Inside the docker container on github actions there is not
+# enough space for the bazel cache, but a larger disk is mounted at /github_disk
+# so we tell bazel to store everything there
+
+# `release_cpu_linux` will activate absolute paths to files that only exist in the tensorflow/build:2.16-pythonXX docker container
+bazel --output_user_root=/github_disk/bazel_root \
+    build :build_pip_pkg \
+    -c opt \
+    --config=release_cpu_linux \
     --copt=-fvisibility=hidden \
-    --copt=-mavx \
-    --distinct_host_configuration=false \
-    --verbose_failures \
-    --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain
+    --verbose_failures
 
 # Package Whl
 bazel-bin/build_pip_pkg artifacts
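The reworked release script above moves Bazel's entire output tree onto the larger disk that the manylinux job mounts at /github_disk (see the `-v /mnt:/github_disk` flag in release.yml further down). A quick illustrative check of the two filesystems, assuming the container layout described in the comments (not part of the repo; the paths only exist inside that container):

    # Illustrative only: compare free space on the container's small root
    # filesystem and on the /github_disk mount backed by the runner's /mnt disk.
    import shutil

    for path in ("/", "/github_disk"):
        total, used, free = shutil.disk_usage(path)
        print(f"{path}: {free / 2**30:.1f} GiB free of {total / 2**30:.1f} GiB")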
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 1241c867..be6ed26f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -36,7 +36,7 @@ jobs:
         if: steps.cache.outputs.cache-hit != 'true'
         run: ./third_party/install_android.sh
       - name: Configure Bazel
-        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_NDK_HOME="/tmp/lce_android/ndk/21.4.7075529" ./configure.py
+        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_API_LEVEL=30 ANDROID_NDK_HOME="/tmp/lce_android/ndk/25.2.9519653" ANDROID_NDK_API_LEVEL=30 ANDROID_BUILD_TOOLS_VERSION=31.0.0 ./configure.py
         shell: bash
       - run: mkdir benchmark-binaries
       - name: Build Benchmark utility for AArch64
@@ -108,7 +108,7 @@ jobs:
         if: steps.cache.outputs.cache-hit != 'true'
         run: ./third_party/install_android.sh
       - name: Configure Bazel
-        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_NDK_HOME="/tmp/lce_android/ndk/21.4.7075529" ./configure.py
+        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_API_LEVEL=30 ANDROID_NDK_HOME="/tmp/lce_android/ndk/25.2.9519653" ANDROID_NDK_API_LEVEL=30 ANDROID_BUILD_TOOLS_VERSION=31.0.0 ./configure.py
         shell: bash
       - name: Build LCE AAR
         run: BUILDER=bazelisk ./larq_compute_engine/tflite/java/build_lce_aar.sh
@@ -134,10 +134,10 @@ jobs:
   macos-release-wheel:
     name: Build release wheels for macOS
-    runs-on: macos-latest
+    runs-on: macos-13
     strategy:
       matrix:
-        python-version: [3.9, "3.10", 3.11]
+        python-version: ["3.10", 3.11]
       fail-fast: false
     steps:
       - uses: actions/checkout@v4
@@ -154,33 +154,35 @@ jobs:
           python -m pip install delocate wheel setuptools numpy six --no-cache-dir
 
           ./configure.py
 
-          export MACOSX_DEPLOYMENT_TARGET=10.14
+          # This matches `release_macos_x86` in .tensorflow.bazelrc
+          export MACOSX_DEPLOYMENT_TARGET=10.15
 
           if [[ -n $GOOGLE_APPLICATION_CREDENTIALS ]]; then
-            echo -e 'build --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-macos-python${{ matrix.python-version }}' >> .bazelrc.user
+            echo -e 'build --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-macos-python${{ matrix.python-version }}' >> .bazelrc.user
             echo -e 'build --google_default_credentials' >> .bazelrc.user
           fi
 
-          bazelisk build :build_pip_pkg --copt=-fvisibility=hidden --copt=-mavx --linkopt=-dead_strip --distinct_host_configuration=false
-          bazel-bin/build_pip_pkg artifacts --plat-name macosx_10_14_x86_64
+          bazelisk build :build_pip_pkg --config=release_macos_x86 --config=release_cpu_macos --copt=-fvisibility=hidden --linkopt=-dead_strip
+          bazel-bin/build_pip_pkg artifacts --plat-name macosx_10_15_x86_64
 
           for f in artifacts/*.whl; do
             delocate-wheel -w wheelhouse $f
           done
         env:
           LCE_RELEASE_VERSION: ${{ github.event.inputs.version }}
+          TF_PYTHON_VERSION: ${{ matrix.python-version }}
         shell: bash
       - uses: actions/upload-artifact@v4
         with:
-          name: ${{ runner.os }}-wheels
+          name: ${{ runner.os }}-wheels-${{ matrix.python-version }}
           path: wheelhouse
 
   macos-arm-release-wheel:
     name: Build release arm wheels for macOS
-    runs-on: macos-11
+    runs-on: macos-14
     strategy:
       matrix:
-        python-version: [3.9, "3.10", 3.11]
+        python-version: ["3.10", 3.11]
       fail-fast: false
     steps:
       - uses: actions/checkout@v4
@@ -197,25 +199,27 @@ jobs:
           python -m pip install delocate wheel setuptools numpy six --no-cache-dir
 
           ./configure.py
 
-          export MACOSX_DEPLOYMENT_TARGET=11.0
+          # This matches `release_macos_arm64` in .tensorflow.bazelrc
+          export MACOSX_DEPLOYMENT_TARGET=12.0
 
           if [[ -n $GOOGLE_APPLICATION_CREDENTIALS ]]; then
-            echo -e 'build --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-macos-arm-python${{ matrix.python-version }}' >> .bazelrc.user
+            echo -e 'build --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-macos-arm-python${{ matrix.python-version }}' >> .bazelrc.user
             echo -e 'build --google_default_credentials' >> .bazelrc.user
           fi
 
-          bazelisk build :build_pip_pkg --copt=-fvisibility=hidden --linkopt=-dead_strip --config=macos_arm64
-          bazel-bin/build_pip_pkg artifacts --plat-name macosx_11_0_arm64
+          bazelisk build :build_pip_pkg --config=release_macos_arm64 --copt=-fvisibility=hidden --linkopt=-dead_strip
+          bazel-bin/build_pip_pkg artifacts --plat-name macosx_12_0_arm64
 
           for f in artifacts/*.whl; do
             delocate-wheel -w wheelhouse $f
           done
         env:
           LCE_RELEASE_VERSION: ${{ github.event.inputs.version }}
+          TF_PYTHON_VERSION: ${{ matrix.python-version }}
         shell: bash
       - uses: actions/upload-artifact@v4
         with:
-          name: ${{ runner.os }}-arm-wheels
+          name: ${{ runner.os }}-arm-wheels-${{ matrix.python-version }}
           path: wheelhouse
 
   manylinux-release-wheel:
@@ -223,7 +227,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.9, "3.10", 3.11]
+        python-version: ["3.10", 3.11]
       fail-fast: false
     steps:
       - uses: actions/checkout@v4
@@ -237,15 +241,17 @@ jobs:
       - name: Build manylinux2014 wheels
         run: |
           if [[ -n $GOOGLE_APPLICATION_CREDENTIALS ]]; then
-            echo -e 'build --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-manylinux-python${{ matrix.python-version }}' >> .bazelrc.user
+            echo -e 'build --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-manylinux-python${{ matrix.python-version }}' >> .bazelrc.user
             echo -e 'build --google_default_credentials' >> .bazelrc.user
           fi
           docker run -e LCE_RELEASE_VERSION=${{ github.event.inputs.version }} \
             -e GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcloud-credentials.json \
+            -e TF_PYTHON_VERSION=${{ matrix.python-version }} \
            -v $GOOGLE_APPLICATION_CREDENTIALS:/tmp/gcloud-credentials.json:ro \
            -v ${PWD}:/compute-engine -w /compute-engine \
-            tensorflow/build:2.13-python${{ matrix.python-version }} \
+            -v /mnt:/github_disk \
+            tensorflow/build:2.16-python${{ matrix.python-version }} \
            .github/tools/release_linux.sh
 
           sudo apt-get -y -qq install patchelf --no-install-recommends
@@ -258,7 +264,7 @@ jobs:
           ls -al wheelhouse/
       - uses: actions/upload-artifact@v4
         with:
-          name: ${{ runner.os }}-wheels
+          name: ${{ runner.os }}-wheels-${{ matrix.python-version }}
           path: wheelhouse
 
   windows-release-wheel:
@@ -266,7 +272,7 @@ jobs:
     runs-on: windows-2019
     strategy:
       matrix:
-        python-version: [3.9, "3.10", 3.11]
+        python-version: ["3.10", 3.11]
       fail-fast: false
     steps:
       - name: Configure Pagefile
@@ -294,18 +300,24 @@ jobs:
           $Env:CC_OPT_FLAGS = "/O2"
 
           python --version
-          python -m pip install wheel setuptools numpy six --no-cache-dir
+          python -m pip install wheel setuptools numpy six pip-tools --no-cache-dir
+          # This is needed because the requirements on windows are different from those on other systems
+          pip-compile --strip-extras --no-emit-index-url --allow-unsafe larq_compute_engine/requirements.in
+
+          # Fix for path length limit: replace workspace name by 'lce'
+          (Get-Content WORKSPACE).Replace('workspace(name = "larq_compute_engine")', 'workspace(name = "lce")') | Set-Content WORKSPACE
 
           "" | python configure.py
 
-          bazelisk --output_base=C:\build_output build :build_pip_pkg --enable_runfiles --local_ram_resources=4096 --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-windows-python${{ matrix.python-version }} --google_default_credentials
+          bazelisk --output_base=C:\bzl build :build_pip_pkg --enable_runfiles --local_ram_resources=4096 --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-release-windows-python${{ matrix.python-version }} --google_default_credentials
           bazel-bin/build_pip_pkg wheelhouse
         env:
           LCE_RELEASE_VERSION: ${{ github.event.inputs.version }}
+          TF_PYTHON_VERSION: ${{ matrix.python-version }}
         shell: pwsh
       - uses: actions/upload-artifact@v4
         with:
-          name: ${{ runner.os }}-wheels
+          name: ${{ runner.os }}-wheels-${{ matrix.python-version }}
           path: wheelhouse
 
   upload-wheels:
@@ -322,31 +334,11 @@ jobs:
     steps:
       - uses: actions/download-artifact@v4
         with:
-          name: Linux-wheels
-          path: Linux-wheels
-        if: ${{ needs.manylinux-release-wheel.result == 'success' }}
-      - uses: actions/download-artifact@v4
-        with:
-          name: macOS-wheels
-          path: macOS-wheels
-        if: ${{ needs.macos-release-wheel.result == 'success' }}
-      - uses: actions/download-artifact@v4
-        with:
-          name: macOS-arm-wheels
-          path: macOS-arm-wheels
-        if: ${{ needs.macos-arm-release-wheel.result == 'success' }}
-      - uses: actions/download-artifact@v4
-        with:
-          name: Windows-wheels
-          path: Windows-wheels
-        if: ${{ needs.windows-release-wheel.result == 'success' }}
+          pattern: "*wheels*"
+          path: dist
+          merge-multiple: true
       - run: |
           set -x
-          mkdir -p dist
-          cp Linux-wheels/*.whl dist/ || true
-          cp macOS-wheels/*.whl dist/ || true
-          cp macOS-arm-wheels/*.whl dist/ || true
-          cp Windows-wheels/*.whl dist/ || true
           ls -la dist/
           sha256sum dist/*.whl
       - uses: pypa/gh-action-pypi-publish@master
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
index 459f1022..15fd6394 100644
--- a/.github/workflows/unittests.yml
+++ b/.github/workflows/unittests.yml
@@ -10,6 +10,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+env:
+  TF_PYTHON_VERSION: "3.11"
+
 jobs:
   TFLite:
     runs-on: ubuntu-latest
@@ -19,16 +22,8 @@ jobs:
         with:
           submodules: true
           fetch-depth: 0
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
-      - name: Configure Bazel
-        run: ./configure.py
-        shell: bash
-      - name: Install pip dependencies
-        run: pip install numpy --no-cache-dir
       - name: Run C++ Unit Tests
-        run: bazelisk test larq_compute_engine/tests:cc_tests --distinct_host_configuration=false --test_output=all
+        run: bazelisk test larq_compute_engine/tests:cc_tests --test_output=all
       - name: Build TF Lite Static Library with CMake
         run: |
           mkdir build
@@ -45,14 +40,6 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y --no-install-recommends qemu-user
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
-      - name: Configure Bazel
-        run: ./configure.py
-        shell: bash
-      - name: Install pip dependencies
-        run: pip install numpy six --no-cache-dir
       - name: "TF Lite Arm32: Cross-compile and run unit tests in qemu"
         run: bazelisk test larq_compute_engine/tests:arm32_tests --config=rpi3 --test_output=all --test_filter="-*BigTest*" --copt=-O1
       - name: "TF Lite Aarch64: Cross-compile and run unit tests in qemu"
@@ -69,32 +56,21 @@ jobs:
         continue-on-error: true
         with:
           credentials_json: ${{ secrets.gcs_bazel_cache }}
-      - uses: actions/setup-python@v5
-        with:
-          python-version: 3.9
       - name: Configure Bazel
         run: |
-          ./configure.py
-          echo -e 'build --distinct_host_configuration=false' >> .bazelrc.user
           if [[ -n $GOOGLE_APPLICATION_CREDENTIALS ]]; then
-            echo -e 'build --remote_http_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-ubuntu' >> .bazelrc.user
+            echo -e 'build --remote_cache=https://storage.googleapis.com/plumerai-bazel-cache/lce-ubuntu' >> .bazelrc.user
             echo -e 'build --google_default_credentials' >> .bazelrc.user
           fi
         shell: bash
-      - name: Set bazel cache
-        run: echo -e 'build --remote_upload_local_results=false' >> .bazelrc.user
-        if: github.ref != 'refs/heads/main'
-        shell: bash
-      - name: Install pip dependencies
-        run: pip install tensorflow-cpu~=2.13.0 larq~=0.13 pytest tensorflow_datasets~=4.9 flatbuffers==23.1.21 tqdm --no-cache-dir
       - name: Run Interpreter test
         run: bazelisk test larq_compute_engine/tflite/tests:interpreter_test --test_output=all
       - name: Run FileCheck tests
         run: bazelisk test larq_compute_engine/mlir/tests:all --test_output=all
       - name: Run End2End tests
-        run: bazelisk test larq_compute_engine/tests:end2end_test --test_output=all
+        run: bazelisk test larq_compute_engine/tests:end2end_test --test_output=all --test_env=TF_USE_LEGACY_KERAS=1
       - name: Run Strip dequantize op tests
-        run: bazelisk test larq_compute_engine/tests:strip_lcedequantize_test --test_output=all
+        run: bazelisk test larq_compute_engine/tests:strip_lcedequantize_test --test_output=all --test_env=TF_USE_LEGACY_KERAS=1
 
   ConverterPython:
     runs-on: ubuntu-latest
@@ -109,6 +85,10 @@
             python-version: 3.11
             flatbuffers-version: 23.1.21
             protobuf-version: 4.23.4
+          - tf-version: 2.16.1
+            python-version: 3.11
+            flatbuffers-version: 24.3.25
+            protobuf-version: 4.25.3
     if: "!contains(github.event.head_commit.message, 'ci-skip')"
     steps:
       - uses: actions/checkout@v4
@@ -117,6 +97,9 @@
           python-version: ${{ matrix.python-version }}
       - name: Install TensorFlow
         run: pip install tensorflow==${{matrix.tf-version}} --no-cache-dir
+      - name: Install legacy tf-keras
+        if: matrix.tf-version == '2.16.1'
+        run: pip install tf-keras==2.16.0
       - name: Install flatbuffers
         run: pip install flatbuffers==${{matrix.flatbuffers-version}} --no-cache-dir
       - name: Install protobuf
@@ -124,7 +107,7 @@
       - name: Install other dependencies
         run: pip install larq~=0.13.3 packaging tqdm --no-cache-dir
       - name: Run Converter test
-        run: PYTHONPATH=./ python larq_compute_engine/mlir/python/converter_test.py
+        run: TF_USE_LEGACY_KERAS=1 PYTHONPATH=./ python larq_compute_engine/mlir/python/converter_test.py
 
   Android_AAR:
     runs-on: ubuntu-latest
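Background on the TF_USE_LEGACY_KERAS settings above: TensorFlow 2.16 ships Keras 3 as `tf.keras`, while the converter and its tests still target Keras 2, so the jobs install `tf-keras` and opt back into it via the environment variable. A minimal sketch of the switch (assumes `tensorflow>=2.16` and `tf-keras` are installed):

    # The variable must be set before TensorFlow is imported, which is why the
    # CI passes it via --test_env / the step environment rather than in code.
    import os

    os.environ["TF_USE_LEGACY_KERAS"] = "1"

    import tensorflow as tf

    # With tf-keras installed, tf.keras now resolves to legacy Keras 2.
    print(tf.keras.__version__)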
@@ -142,15 +125,13 @@ jobs:
       with:
         path: /tmp/lce_android
         key: ${{ runner.os }}-${{ hashFiles('**/third_party/install_android.sh') }}
-      - name: Install pip dependencies
-        run: pip install numpy six --no-cache-dir
       - name: Set Java version
         run: echo "JAVA_HOME=${JAVA_HOME_8_X64}" >> $GITHUB_ENV
       - name: Download and install Android NDK/SDK
         if: steps.cache.outputs.cache-hit != 'true'
         run: ./third_party/install_android.sh
       - name: Configure Bazel
-        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_NDK_HOME="/tmp/lce_android/ndk/21.4.7075529" ./configure.py
+        run: LCE_SET_ANDROID_WORKSPACE=1 ANDROID_SDK_HOME="/tmp/lce_android" ANDROID_API_LEVEL=30 ANDROID_NDK_HOME="/tmp/lce_android/ndk/25.2.9519653" ANDROID_NDK_API_LEVEL=30 ANDROID_BUILD_TOOLS_VERSION=31.0.0 ./configure.py
         shell: bash
       - name: Build LCE AAR
         run: BUILDER=bazelisk ./larq_compute_engine/tflite/java/build_lce_aar.sh
diff --git a/.tensorflow.bazelrc b/.tensorflow.bazelrc
index 98cf5a53..6a0e08e8 100644
--- a/.tensorflow.bazelrc
+++ b/.tensorflow.bazelrc
@@ -6,12 +6,11 @@
 # Default build options. These are applied first and unconditionally.
 
-# The following line opts in to modular op registration support by default.
+# For projects which use TensorFlow as part of a Bazel build process, putting
+# nothing in a bazelrc will default to a monolithic build. The following line
+# opts in to modular op registration support by default.
 build --define framework_shared_object=true
-
-# For workaround https://github.com/bazelbuild/bazel/issues/8772 with Bazel >= 0.29.1
-build --java_toolchain=@tf_toolchains//toolchains/java:tf_java_toolchain
-build --host_java_toolchain=@tf_toolchains//toolchains/java:tf_java_toolchain
+build --define tsl_protobuf_header_only=true
 
 build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true
@@ -19,15 +18,38 @@ build --define=allow_oversize_protos=true
 
 build --spawn_strategy=standalone
 build -c opt
 
+# Make Bazel print out all options from rc files.
+build --announce_rc
+
 build --define=grpc_no_ares=true
 
+build --noincompatible_remove_legacy_whole_archive
+build --features=-force_no_whole_archive
+
 build --enable_platform_specific_config
 
 # Enable XLA support by default.
 build --define=with_xla_support=true
 
+build --config=short_logs
+
 build --config=v2
 
+# Disable AWS/HDFS support by default
+build --define=no_aws_support=true
+build --define=no_hdfs_support=true
+
+# TF now has `cc_shared_library` targets, so it needs the experimental flag
+# TODO(rostam): Remove when `cc_shared_library` is enabled by default
+build --experimental_cc_shared_library
+
+# cc_shared_library ensures no library is linked statically more than once.
+build --experimental_link_static_libraries_once=false
+
+# Prevent regressions on those two incompatible changes
+# TODO: remove those flags when they are flipped in the default Bazel version TF uses.
+build --incompatible_enforce_config_setting_visibility
+
 # Default options should come above this line.
 
 # Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
@@ -48,20 +70,124 @@ build:android_x86_64 --config=android
 build:android_x86_64 --cpu=x86_64
 build:android_x86_64 --fat_apk_cpu=x86_64
 
+# Build everything statically for Android since all static libs are later
+# bundled together into a single .so for deployment.
+build:android --dynamic_mode=off
+
 # Sets the default Apple platform to macOS.
 build:macos --apple_platform_type=macos
 
 # gRPC on MacOS requires this #define
 build:macos --copt=-DGRPC_BAZEL_BUILD
 
+# Avoid hitting command line argument limit
+build:macos --features=archive_param_file
+
 # Settings for MacOS on ARM CPUs.
 build:macos_arm64 --cpu=darwin_arm64
+build:macos_arm64 --macos_minimum_os=11.0
 
 # Config to use a mostly-static build and disable modular op registration
 # support (this will revert to loading TensorFlow with RTLD_GLOBAL in Python).
 # By default, TensorFlow will build with a dependence on
 # //tensorflow:libtensorflow_framework.so.
 build:monolithic --define framework_shared_object=false
+build:monolithic --define tsl_protobuf_header_only=false
+build:monolithic --experimental_link_static_libraries_once=false # b/229868128
+
+# Please note that MKL on MacOS is still not supported.
+# If you would like to use a local MKL instead of downloading, please set the
+# environment variable "TF_MKL_ROOT" every time before build.
+build:mkl --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl --define=tensorflow_mkldnn_contraction_kernel=0
+build:mkl --define=build_with_openmp=true
+build:mkl -c opt
+
+# config to build OneDNN backend with a user specified threadpool.
+build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0
+build:mkl_threadpool --define=build_with_mkl_opensource=true
+build:mkl_threadpool -c opt
+
+# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
+build:mkl_aarch64 --define=build_with_mkl_aarch64=true
+build:mkl_aarch64 --define=build_with_openmp=true
+build:mkl_aarch64 --define=build_with_acl=true
+build:mkl_aarch64 -c opt
+
+# Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
+# with Eigen threadpool support
+build:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
+build:mkl_aarch64_threadpool -c opt
+
+# CUDA: This config refers to building CUDA op kernels with nvcc.
+build:cuda --repo_env TF_NEED_CUDA=1
+build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
+build:cuda --@local_config_cuda//:enable_cuda
+
+# CUDA: This config refers to building CUDA op kernels with clang.
+build:cuda_clang --config=cuda
+# Enable TensorRT optimizations https://developer.nvidia.com/tensorrt
+build:cuda_clang --config=tensorrt
+build:cuda_clang --action_env=TF_CUDA_CLANG="1"
+build:cuda_clang --@local_config_cuda//:cuda_compiler=clang
+# Select supported compute capabilities (supported graphics cards).
+# This is the same as the official TensorFlow builds.
+# See https://developer.nvidia.com/cuda-gpus#compute
+# `compute_XY` enables PTX embedding in addition to SASS. PTX
+# is forward compatible beyond the current compute capability major
+# release while SASS is only forward compatible inside the current
+# major release. Example: sm_80 kernels can run on sm_89 GPUs but
+# not on sm_90 GPUs. compute_80 kernels though can also run on sm_90 GPUs.
+build:cuda_clang --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_50,sm_60,sm_70,sm_80,compute_90"
+
+# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
+build:cuda_clang_official --config=cuda_clang
+build:cuda_clang_official --action_env=TF_CUDA_VERSION="12"
+build:cuda_clang_official --action_env=TF_CUDNN_VERSION="8"
+build:cuda_clang_official --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-12.3"
+build:cuda_clang_official --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
+build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-17/bin/clang"
+build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+build:cuda_clang_official --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
+
+# Build with nvcc for CUDA and clang for host
+build:nvcc_clang --config=cuda
+# Unfortunately, cuda_configure.bzl demands this for using nvcc + clang
+build:nvcc_clang --action_env=TF_CUDA_CLANG="1"
+build:nvcc_clang --action_env=TF_NVCC_CLANG="1"
+build:nvcc_clang --@local_config_cuda//:cuda_compiler=nvcc
+
+
+# Debug config
+build:dbg -c dbg
+# Only include debug info for files under tensorflow/, excluding kernels, to
+# reduce the size of the debug info in the binary. This is because if the debug
+# sections in the ELF binary are too large, errors can occur. See
+# https://github.com/tensorflow/tensorflow/issues/48919.
+# Users can still include debug info for a specific kernel, e.g. with:
+# --config=dbg --per_file_copt=+tensorflow/core/kernels/identity_op.*@-g
+# Since this .bazelrc file is synced between the tensorflow/tensorflow repo and
+# the openxla/xla repo, also include debug info for files under xla/.
+build:dbg --per_file_copt=+.*,-tensorflow.*,-xla.*@-g0
+build:dbg --per_file_copt=+tensorflow/core/kernels.*@-g0
+# for now, disable arm_neon. see: https://github.com/tensorflow/tensorflow/issues/33360
+build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON
+# AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498
+build:dbg --copt -DDEBUG_BUILD
+
+# Config to build TF TPU
+build:tpu --define=with_tpu_support=true
+build:tpu --define=framework_shared_object=true
+build:tpu --copt=-DLIBTPU_ON_GCE
+build:tpu --define=enable_mlir_bridge=true
+
+build:tensorrt --repo_env TF_NEED_TENSORRT=1
+
+build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain
+build:rocm --define=using_rocm_hipcc=true
+build:rocm --define=tensorflow_mkldnn_contraction_kernel=0
+build:rocm --repo_env TF_NEED_ROCM=1
 
 # Options to disable default on features
 build:noaws --define=no_aws_support=true
@@ -75,6 +201,37 @@ build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS
 
 # Don't trigger --config= when cross-compiling.
 build:android --noenable_platform_specific_config
+build:ios --noenable_platform_specific_config
+
+# Suppress all C++ compiler warnings, otherwise build logs become 10s of MBs.
+build:android --copt=-w
+build:ios --copt=-w
+build:linux --host_copt=-w
+build:macos --copt=-w
+build:windows --copt=/W0
+build:windows --host_copt=/W0
+
+# Suppress most C++ compiler warnings to reduce log size but allow
+# for specific warnings to still be present.
+build:linux --copt="-Wno-all"
+build:linux --copt="-Wno-extra"
+build:linux --copt="-Wno-deprecated"
+build:linux --copt="-Wno-deprecated-declarations"
+build:linux --copt="-Wno-ignored-attributes"
+build:linux --copt="-Wno-array-bounds"
+
+# Add unused-result as an error on Linux.
+build:linux --copt="-Wunused-result"
+build:linux --copt="-Werror=unused-result"
+# Add switch as an error on Linux.
+build:linux --copt="-Wswitch"
+build:linux --copt="-Werror=switch"
+# Required for building with clang
+build:linux --copt="-Wno-error=unused-but-set-variable"
+
+# Linux ARM64 specific options
+build:linux_arm64 --copt="-mtune=generic" --copt="-march=armv8-a" --copt="-O3"
+
 
 # On Windows, `__cplusplus` is wrongly defined without this switch
 # See https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/
@@ -89,8 +246,32 @@ build:windows --host_copt=/D_USE_MATH_DEFINES
 
 # Windows has a relatively short command line limit, which TF has begun to hit.
 # See https://docs.bazel.build/versions/main/windows.html
 build:windows --features=compiler_param_file
-
-# By default, build LCE in C++ 17 mode.
+build:windows --features=archive_param_file
+
+# Speed Windows compile times. Available in VS 16.4 (we are on 16.11). See
+# https://groups.google.com/a/tensorflow.org/d/topic/build/SsW98Eo7l3o/discussion
+build:windows --copt=/d2ReducedOptimizeHugeFunctions
+build:windows --host_copt=/d2ReducedOptimizeHugeFunctions
+
+# Enable the runfiles symlink tree on Windows. This makes it possible to build
+# the pip package on Windows without an intermediate data-file archive, as the
+# build_pip_package script in its current form (as of Aug 2023) uses the
+# runfiles symlink tree to decide what to put into the Python wheel.
+startup --windows_enable_symlinks
+build:windows --enable_runfiles
+
+# Default paths for TF_SYSTEM_LIBS
+build:linux --define=PREFIX=/usr
+build:linux --define=LIBDIR=$(PREFIX)/lib
+build:linux --define=INCLUDEDIR=$(PREFIX)/include
+build:linux --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include
+build:macos --define=PREFIX=/usr
+build:macos --define=LIBDIR=$(PREFIX)/lib
+build:macos --define=INCLUDEDIR=$(PREFIX)/include
+build:macos --define=PROTOBUF_INCLUDE_PATH=$(PREFIX)/include
+# TF_SYSTEM_LIBS do not work on windows.
+
+# By default, build TF in C++ 17 mode.
 build:android --cxxopt=-std=c++17
 build:android --host_cxxopt=-std=c++17
 build:linux --cxxopt=-std=c++17
@@ -113,7 +294,7 @@ build:windows --copt=-DNOGDI
 build:windows --host_copt=-DNOGDI
 
 # MSVC (Windows): Standards-conformant preprocessor mode
-# See https://docs.microsoft.com/en-us/cpp/build/reference/zc-preprocessor
+# See https://docs.microsoft.com/en-us/cpp/preprocessor/preprocessor-experimental-overview
 build:windows --copt=/Zc:preprocessor
 build:windows --host_copt=/Zc:preprocessor
@@ -128,13 +309,45 @@ build:windows --host_linkopt=/OPT:ICF
 
 # Verbose failure logs when something goes wrong
 build:windows --verbose_failures
 
-# On windows, we never cross compile
-build:windows --distinct_host_configuration=false
+# Work around potential issues with large command lines on windows.
+# See: https://github.com/bazelbuild/bazel/issues/5163
+build:windows --features=compiler_param_file
+
+# Do not risk cache corruption. See:
+# https://github.com/bazelbuild/bazel/issues/3360
+build:linux --experimental_guard_against_concurrent_changes
+
+# Configure short or long logs
+build:short_logs --output_filter=DONT_MATCH_ANYTHING
+build:verbose_logs --output_filter=
+
+# Instruction set optimizations
+# TODO(gunan): Create a feature in toolchains for avx/avx2 to
+# avoid having to define linux/win separately.
+build:avx_linux --copt=-mavx
+build:avx_linux --host_copt=-mavx
+build:avx_win --copt=/arch:AVX
+
+# Use Clang-cl compiler on Windows
+build:win_clang --copt=/clang:-Weverything
+build:win_clang --extra_toolchains=@local_config_cc//:cc-toolchain-x64_windows-clang-cl
+build:win_clang --extra_execution_platforms=//tensorflow/tools/toolchains/win:x64_windows-clang-cl
+build:win_clang --host_platform=//tensorflow/tools/toolchains/win:x64_windows-clang-cl
+build:win_clang --compiler=clang-cl
+build:win_clang --linkopt=/FORCE:MULTIPLE
+build:win_clang --host_linkopt=/FORCE:MULTIPLE
+test:win_clang --linkopt=/FORCE:MULTIPLE
+test:win_clang --host_linkopt=/FORCE:MULTIPLE
 
 # Options to build TensorFlow 1.x or 2.x.
-build:v1 --define=tf_api_version=1 --action_env=TF2_BEHAVIOR=0
+# TODO(kanglan): Change v2's define to default behavior
 build:v2 --define=tf_api_version=2 --action_env=TF2_BEHAVIOR=1
 
+# Disable XLA on mobile.
+build:xla --define=with_xla_support=true # TODO: remove, it's on by default.
+build:android --define=with_xla_support=false
+build:ios --define=with_xla_support=false
+
 # Flag to enable remote config
 common --experimental_repo_remote_exec
@@ -147,48 +360,203 @@ build:elinux_armhf --config=elinux
 build:elinux_armhf --cpu=armhf
 build:elinux_armhf --copt -mfp16-format=ieee
 
-# Address sanitizer
-# CC=clang bazel build --config asan
-build:asan --strip=never
-build:asan --copt -fsanitize=address
-build:asan --copt -DADDRESS_SANITIZER
-build:asan --copt -g
-build:asan --copt -O3
-build:asan --copt -fno-omit-frame-pointer
-build:asan --linkopt -fsanitize=address
-
-# Memory sanitizer
-# CC=clang bazel build --config msan
-build:msan --strip=never
-build:msan --copt -fsanitize=memory
-build:msan --copt -DADDRESS_SANITIZER
-build:msan --copt -g
-build:msan --copt -O3
-build:msan --copt -fno-omit-frame-pointer
-build:msan --linkopt -fsanitize=memory
-
-# Undefined Behavior Sanitizer
-# CC=clang bazel build --config ubsan
-build:ubsan --strip=never
-build:ubsan --copt -fsanitize=undefined
-build:ubsan --copt -g
-build:ubsan --copt -O3
-build:ubsan --copt -fno-omit-frame-pointer
-build:ubsan --linkopt -fsanitize=undefined
-build:ubsan --linkopt -lubsan
-
-
-# Debug config
-build:dbg -c dbg
-# Only include debug info for files under tensorflow/, excluding kernels, to
-# reduce the size of the debug info in the binary. This is because if the debug
-# sections in the ELF binary are too large, errors can occur. See
-# https://github.com/tensorflow/tensorflow/issues/48919.
-# Users can still include debug info for a specific kernel, e.g. with:
-# --config=dbg --per_file_copt=+tensorflow/core/kernels/identity_op.*@-g
-build:dbg --per_file_copt=+.*,-tensorflow.*@-g0
-build:dbg --per_file_copt=+tensorflow/core/kernels.*@-g0
-# for now, disable arm_neon. see: https://github.com/tensorflow/tensorflow/issues/33360
-build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON
-# AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498
-build:dbg --copt -DDEBUG_BUILD
+# Config-specific options should come above this line.
+
+# Load rc file written by ./configure.
+try-import %workspace%/.tf_configure.bazelrc
+try-import %workspace%/xla_configure.bazelrc
+
+# Here are bazelrc configs for release builds
+# Build TensorFlow v2.
+test:release_base --test_size_filters=small,medium
+test:release_base --flaky_test_attempts=3
+
+# Target the AVX instruction set
+build:release_linux_base --config=avx_linux
+
+# Disable clang extension that rejects type definitions within offsetof.
+# This was added in clang-16 by https://reviews.llvm.org/D133574.
+# Can be removed once upb is updated, since a type definition is used within
+# offsetof in the current version of upb.
+# See https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183.
+build:release_linux_base --copt=-Wno-gnu-offsetof-extensions
+build:release_linux_base --copt=-Wno-error=array-parameter
+build:release_linux_base --copt=-Wno-error=unused-command-line-argument
+# Set lld as the linker.
+build:release_linux_base --linkopt="-fuse-ld=lld"
+build:release_linux_base --linkopt="-lm"
+
+# We have some invalid linker scripts in the build,
+# so we need to disable this check
+build:release_linux_base --linkopt=-Wl,--undefined-version
+
+# Container environment settings below this point.
+# Use Python 3.X as installed in container image
+build:release_linux_base --action_env PYTHON_BIN_PATH="/usr/bin/python3"
+build:release_linux_base --action_env PYTHON_LIB_PATH="/usr/lib/tf_python"
+build:release_linux_base --python_path="/usr/bin/python3"
+# Set Clang as compiler. Use the actual path to clang installed in container.
+build:release_cpu_linux_base --repo_env=CC="/usr/lib/llvm-17/bin/clang"
+build:release_cpu_linux_base --repo_env=BAZEL_COMPILER="/usr/lib/llvm-17/bin/clang"
+# Test-related settings below this point.
+test:release_linux_base --build_tests_only --keep_going --test_output=errors --verbose_failures=true
+test:release_linux_base --local_test_jobs=HOST_CPUS
+test:release_linux_base --test_env=LD_LIBRARY_PATH
+# Give only the list of failed tests at the end of the log
+test:release_linux_base --test_summary=short
+
+# Use the Clang toolchain to compile
+build:release_cpu_linux --config=release_linux_base
+build:release_cpu_linux --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain"
+
+build:release_gpu_linux --config=release_cpu_linux
+# Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
+# Note that linux cpu and cuda builds share the same toolchain now.
+build:release_gpu_linux --config=cuda_clang_official
+test:release_gpu_linux --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+# Local test jobs have to be 4 because parallel_gpu_execute is fragile, I think
+test:release_gpu_linux --test_timeout=300,450,1200,3600 --local_test_jobs=4 --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute
+
+build:release_arm64_linux --config=release_linux_base
+build:release_arm64_linux --config=linux_arm64
+build:release_arm64_linux --crosstool_top="@ml2014_clang_aarch64_config_aarch64//crosstool:toolchain"
+build:release_arm64_linux --config=mkl_aarch64_threadpool
+build:release_arm64_linux --copt=-flax-vector-conversions
+test:release_arm64_linux --flaky_test_attempts=3
+
+# The old gcc linux build options are preserved in the unsupported_*_linux
+# configs. If your project fails to build with Clang, you can use these
+# unsupported flags to replace the release flags in your build command.
+# However, please note that the old toolchain is no longer officially supported
+# by TensorFlow and the unsupported configs will be removed soon b/299962977. We
+# strongly recommend that you migrate to Clang as your compiler for TensorFlow
+# Linux builds. Instructions are available in the official documentation:
+# https://www.tensorflow.org/install/source#install_clang_recommended_linux_only
+# Another good option is to use our Docker containers to build and test TF:
+# https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/tf_sig_build_dockerfiles.
+build:unsupported_cpu_linux --config=avx_linux
+build:unsupported_cpu_linux --crosstool_top="@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain"
+test:unsupported_cpu_linux --test_env=LD_LIBRARY_PATH
+test:unsupported_cpu_linux --config=release_base
+
+build:unsupported_gpu_linux --config=cuda
+build:unsupported_gpu_linux --config=unsupported_cpu_linux
+build:unsupported_gpu_linux --action_env=TF_CUDA_VERSION="11"
+build:unsupported_gpu_linux --action_env=TF_CUDNN_VERSION="8"
+build:unsupported_gpu_linux --repo_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80"
+build:unsupported_gpu_linux --config=tensorrt
+build:unsupported_gpu_linux --action_env=CUDA_TOOLKIT_PATH="/usr/local/cuda-11.2"
+build:unsupported_gpu_linux --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda-11.1/lib64:/usr/local/tensorrt/lib"
+build:unsupported_gpu_linux --action_env=GCC_HOST_COMPILER_PATH="/dt9/usr/bin/gcc"
+build:unsupported_gpu_linux --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain
+
+build:release_cpu_macos --config=avx_linux
+test:release_cpu_macos --config=release_base
+
+# Base build configs for macOS
+build:release_macos_base --action_env DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer
+build:release_macos_base --define=no_nccl_support=true --output_filter=^$
+
+# Build configs for macOS x86
+build:release_macos_x86 --config=release_macos_base
+# Build with the AVX instruction set when on macOS x86
+build:release_macos_x86 --config=avx_linux
+build:release_macos_x86 --cpu=darwin
+# Target Catalina as the minimum compatible OS version
+build:release_macos_x86 --macos_minimum_os=10.15
+build:release_macos_x86 --action_env MACOSX_DEPLOYMENT_TARGET=10.15
+
+# Build configs for macOS Arm64
+build:release_macos_arm64 --config=release_macos_base
+build:release_macos_arm64 --cpu=darwin_arm64
+build:release_macos_arm64 --define=tensorflow_mkldnn_contraction_kernel=0
+# Target Monterey as the minimum compatible OS version
+build:release_macos_arm64 --macos_minimum_os=12.0
+build:release_macos_arm64 --action_env MACOSX_DEPLOYMENT_TARGET=12.0
+
+# Base test configs for macOS
+test:release_macos_base --verbose_failures=true --local_test_jobs=HOST_CPUS
+test:release_macos_base --test_timeout=300,450,1200,3600 --test_output=errors
+test:release_macos_base --build_tests_only --keep_going
+test:release_macos_base --flaky_test_attempts=3
+
+# Test configs for macOS x86
+test:release_macos_x86 --config=release_macos_base
+
+# Test configs for macOS Arm64
+test:release_macos_arm64 --config=release_macos_base
+
+# TODO(kanglan): Update windows configs after b/289091160 is fixed
+build:release_cpu_windows --config=avx_win
+build:release_cpu_windows --define=no_tensorflow_py_deps=true
+test:release_cpu_windows --config=release_base
+
+# Exclude TFRT integration for anything but Linux.
+build:android --config=no_tfrt
+build:macos --config=no_tfrt
+build:windows --config=no_tfrt
+build:rocm --config=no_tfrt
+build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/ir,tensorflow/compiler/mlir/tfrt/ir/mlrt,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ifrt,tensorflow/compiler/mlir/tfrt/tests/mlrt,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/compiler/mlir/tfrt/transforms/mlrt,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/runtime_fallback/test,tensorflow/core/runtime_fallback/test/gpu,tensorflow/core/runtime_fallback/test/saved_model,tensorflow/core/runtime_fallback/test/testdata,tensorflow/core/tfrt/stubs,tensorflow/core/tfrt/tfrt_session,tensorflow/core/tfrt/mlrt,tensorflow/core/tfrt/mlrt/attribute,tensorflow/core/tfrt/mlrt/kernel,tensorflow/core/tfrt/mlrt/bytecode,tensorflow/core/tfrt/mlrt/interpreter,tensorflow/compiler/mlir/tfrt/translate/mlrt,tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug,tensorflow/core/tfrt/saved_model/python,tensorflow/core/tfrt/graph_executor/python,tensorflow/core/tfrt/saved_model/utils
+
+# START CROSS-COMPILE CONFIGS
+# Set execution platform to Linux x86
+# Note: Lot of the "host_" flags such as "host_cpu" and "host_crosstool_top"
+# flags seem to be actually used to specify the execution platform details. It
+# seems it is this way because these flags are old and predate the distinction
+# between host and execution platform.
+build:cross_compile_base --host_cpu=k8
+build:cross_compile_base --host_crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite
+build:cross_compile_base --extra_execution_platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_x86_64
+
+build:rbe_cross_compile_base --config=rbe_base
+build:rbe_cross_compile_base --remote_instance_name=projects/tensorflow-testing/instances/default_instance
+
+# Test-related settings below this point
+# We cannot run cross-compiled tests on the remote Linux x86 VMs so we need to
+# force all tests to run locally on the Aarch64 host.
+test:rbe_cross_compile_base --strategy=TestRunner=local --build_tests_only
+test:rbe_cross_compile_base --verbose_failures=true --local_test_jobs=HOST_CPUS --test_output=errors
+
+# START LINUX AARCH64 CROSS-COMPILE CONFIGS
+build:cross_compile_linux_arm64 --config=cross_compile_base
+
+# Set the target CPU to Aarch64
+build:cross_compile_linux_arm64 --platforms=//tensorflow/tools/toolchains/cross_compile/config:linux_aarch64
+build:cross_compile_linux_arm64 --cpu=aarch64
+build:cross_compile_linux_arm64 --crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite
+
+# RBE cross-compile configs for Linux Aarch64
+build:rbe_cross_compile_linux_arm64 --config=cross_compile_linux_arm64
+build:rbe_cross_compile_linux_arm64 --config=rbe_cross_compile_base
+test:rbe_cross_compile_linux_arm64 --config=rbe_cross_compile_base
+# END LINUX AARCH64 CROSS-COMPILE CONFIGS
+
+# START MACOS CROSS-COMPILE CONFIGS
+build:cross_compile_macos_x86 --config=cross_compile_base
+build:cross_compile_macos_x86 --config=nonccl
+# Target Catalina (10.15) as the minimum supported OS
+build:cross_compile_macos_x86 --action_env MACOSX_DEPLOYMENT_TARGET=10.15
+
+# Set the target CPU to Darwin x86
+build:cross_compile_macos_x86 --platforms=//tensorflow/tools/toolchains/cross_compile/config:darwin_x86_64
+build:cross_compile_macos_x86 --cpu=darwin
+build:cross_compile_macos_x86 --crosstool_top=//tensorflow/tools/toolchains/cross_compile/cc:cross_compile_toolchain_suite
+# When RBE cross-compiling for macOS, we need to explicitly register the
+# toolchain. Otherwise, oddly, RBE complains that a "docker container must be
+# specified".
+build:cross_compile_macos_x86 --extra_toolchains=//tensorflow/tools/toolchains/cross_compile/config:macos-x86-cross-compile-cc-toolchain
+# Map --platforms=darwin_x86_64 to --cpu=darwin and vice-versa to make selects()
+# and transitions that use these flags work.
+build:cross_compile_macos_x86 --platform_mappings=tensorflow/tools/toolchains/cross_compile/config/platform_mappings
+
+# RBE cross-compile configs for Darwin x86
+build:rbe_cross_compile_macos_x86 --config=cross_compile_macos_x86
+build:rbe_cross_compile_macos_x86 --config=rbe_cross_compile_base
+test:rbe_cross_compile_macos_x86 --config=rbe_cross_compile_base
+# Increase the test timeout as tests often take longer on mac.
+test:rbe_cross_compile_macos_x86 --test_timeout=300,450,1200,3600
+# Limit jobs to 100 to avoid running into "out of memory" issues (b/316266643)
+build:rbe_cross_compile_macos_x86 --jobs=100
+test:rbe_cross_compile_macos_x86 --jobs=100
+# END MACOS CROSS-COMPILE CONFIGS
+# END CROSS-COMPILE CONFIGS
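The release workflows lean on the chained --config groups defined above: `release_cpu_linux` pulls in `release_linux_base`, which pulls in `avx_linux`, and so on. The hypothetical helper below (not part of the repo; flag lists abbreviated from this file) mimics how Bazel expands `--config` recursively, in order:

    # Hypothetical sketch of Bazel's recursive --config expansion, using an
    # abbreviated copy of the configs defined in this .tensorflow.bazelrc.
    CONFIGS = {
        "avx_linux": ["--copt=-mavx", "--host_copt=-mavx"],
        "release_linux_base": ["--config=avx_linux", "--linkopt=-fuse-ld=lld"],
        "release_cpu_linux": [
            "--config=release_linux_base",
            "--crosstool_top=@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain",
        ],
    }

    def expand(config):
        flags = []
        for flag in CONFIGS.get(config, []):
            if flag.startswith("--config="):
                flags += expand(flag[len("--config="):])
            else:
                flags.append(flag)
        return flags

    print(expand("release_cpu_linux"))
    # ['--copt=-mavx', '--host_copt=-mavx', '--linkopt=-fuse-ld=lld',
    #  '--crosstool_top=@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain']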
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b4a67876..36213e39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,6 +81,11 @@ if (COMPILE_BENCHMARK)
   get_directory_property(TFLITE_BENCHMARK_SRCS DIRECTORY ${TFLITE_SOURCE_DIR}/tools/benchmark DEFINITION TFLITE_BENCHMARK_SRCS)
   list(FILTER TFLITE_BENCHMARK_SRCS EXCLUDE REGEX benchmark_main.cc)
 
+  # The TSL dir is included in the tensorflow CMakeLists.txt but because we manually refer to those source files here we have to explicitly list this include directory again.
+  set(TSL_SOURCE_DIR "${TENSORFLOW_SOURCE_DIR}/third_party/xla/third_party/tsl")
+  include_directories(
+    ${TSL_SOURCE_DIR}
+  )
   add_executable(lce_benchmark_model ${TFLITE_BENCHMARK_SRCS}
     ${LCE_BENCHMARK_SRCS} ${LCE_BENCHMARK_HRDS}
diff --git a/WORKSPACE b/WORKSPACE
index c0b27150..42688952 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -16,13 +16,89 @@ http_archive(
     patches = [
         "//third_party/tensorflow_patches:disable_forced_mkl.patch",
     ],
-    sha256 = "e58c939079588623e6fa1d054aec2f90f95018266e0a970fd353a5244f5173dc",
-    strip_prefix = "tensorflow-2.13.0",
+    sha256 = "c729e56efc945c6df08efe5c9f5b8b89329c7c91b8f40ad2bb3e13900bd4876d",
+    strip_prefix = "tensorflow-2.16.1",
     urls = [
-        "https://github.com/tensorflow/tensorflow/archive/v2.13.0.tar.gz",
+        "https://github.com/tensorflow/tensorflow/archive/v2.16.1.tar.gz",
     ],
 )
 
+# We must initialize hermetic python first.
+http_archive(
+    name = "bazel_skylib",
+    sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506",
+    urls = [
+        "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+    ],
+)
+
+http_archive(
+    name = "rules_python",
+    sha256 = "9d04041ac92a0985e344235f5d946f71ac543f1b1565f2cdbc9a2aaee8adf55b",
+    strip_prefix = "rules_python-0.26.0",
+    url = "https://github.com/bazelbuild/rules_python/releases/download/0.26.0/rules_python-0.26.0.tar.gz",
+)
+
+load("@rules_python//python:repositories.bzl", "py_repositories", "python_register_toolchains")
+
+py_repositories()
+
+load(
+    "@org_tensorflow//tensorflow/tools/toolchains/python:python_repo.bzl",
+    "python_repository",
+)
+
+python_repository(name = "python_version_repo")
+
+load("@python_version_repo//:py_version.bzl", "TF_PYTHON_VERSION")
+
+python_register_toolchains(
+    name = "python",
+    ignore_root_user_error = True,
+    python_version = TF_PYTHON_VERSION,
+)
+
+load("@python//:defs.bzl", "interpreter")
+load("@rules_python//python:pip.bzl", "package_annotation", "pip_parse")
+
+NUMPY_ANNOTATIONS = {
+    "numpy": package_annotation(
+        additive_build_content = """\
+filegroup(
+    name = "includes",
+    srcs = glob(["site-packages/numpy/core/include/**/*.h"]),
+)
+cc_library(
+    name = "numpy_headers",
+    hdrs = [":includes"],
+    strip_include_prefix="site-packages/numpy/core/include/",
+)
+""",
+    ),
+}
+
+pip_parse(
+    name = "pypi",
+    annotations = NUMPY_ANNOTATIONS,
+    python_interpreter_target = interpreter,
+    requirements = "@org_tensorflow//:requirements_lock_" + TF_PYTHON_VERSION.replace(".", "_") + ".txt",
+)
+
+load("@pypi//:requirements.bzl", tf_install_deps = "install_deps")
+
+tf_install_deps()
+
+pip_parse(
+    name = "pypi_lce",
+    python_interpreter_target = interpreter,
+    requirements = "//larq_compute_engine:requirements.txt",
+)
+
+load("@pypi_lce//:requirements.bzl", lce_install_deps = "install_deps")
+
+lce_install_deps()
+
 load("@org_tensorflow//tensorflow:workspace3.bzl", "tf_workspace3")
 
 tf_workspace3()
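With the hermetic-Python setup above, the interpreter and pip packages are resolved inside Bazel, and the TF_PYTHON_VERSION variable (exported by the workflows earlier in this diff) selects both the toolchain and the matching requirements lock file in the TensorFlow workspace. The name derivation mirrors the `pip_parse()` expression:

    # Sketch: how the pip_parse() call above derives the TensorFlow lock-file
    # label from TF_PYTHON_VERSION.
    TF_PYTHON_VERSION = "3.11"  # e.g. as exported by the release workflows

    lock_file = "requirements_lock_" + TF_PYTHON_VERSION.replace(".", "_") + ".txt"
    print("@org_tensorflow//:" + lock_file)
    # @org_tensorflow//:requirements_lock_3_11.txt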
PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.runfiles/larq_compute_engine/" fi diff --git a/configure.py b/configure.py index fd53129e..1aae365b 100755 --- a/configure.py +++ b/configure.py @@ -16,6 +16,7 @@ # limitations under the License. # ============================================================================== +import json import os import platform import re @@ -24,7 +25,7 @@ _LCE_BAZELRC = ".lce_configure.bazelrc" -_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] +_SUPPORTED_ANDROID_NDK_VERSIONS = [19, 20, 21, 25] _DEFAULT_PROMPT_ASK_ATTEMPTS = 10 @@ -570,29 +571,27 @@ def get_ndk_api_level(environ_cp, android_ndk_home_path): "errors.\n" % (android_ndk_home_path, ndk_version, _SUPPORTED_ANDROID_NDK_VERSIONS) ) + write_action_env_to_bazelrc("ANDROID_NDK_VERSION", ndk_version) # Now grab the NDK API level to use. Note that this is different from the # SDK API level, as the NDK API level is effectively the *min* target SDK # version. - platforms = os.path.join(android_ndk_home_path, "platforms") - api_levels = sorted(os.listdir(platforms)) - api_levels = [x.replace("android-", "") for x in api_levels if "android-" in x] - - def valid_api_level(api_level): - return os.path.exists( - os.path.join(android_ndk_home_path, "platforms", "android-" + api_level) - ) + meta = open(os.path.join(android_ndk_home_path, "meta/platforms.json")) + platforms = json.load(meta) + meta.close() + aliases = platforms["aliases"] + api_levels = sorted(list(set([aliases[i] for i in aliases]))) android_ndk_api_level = prompt_loop_or_load_from_env( environ_cp, var_name="ANDROID_NDK_API_LEVEL", - var_default='26', # 26 is required to support AHardwareBuffer. + var_default="26", # 26 is required to support AHardwareBuffer. ask_for_var=( "Please specify the (min) Android NDK API level to use. 
" "[Available levels: %s]" ) % api_levels, - check_success=valid_api_level, + check_success=(lambda *_: True), error_msg="Android-%s is not present in the NDK path.", ) diff --git a/larq_compute_engine/mlir/BUILD b/larq_compute_engine/mlir/BUILD index d174313b..c74cc190 100644 --- a/larq_compute_engine/mlir/BUILD +++ b/larq_compute_engine/mlir/BUILD @@ -1,5 +1,6 @@ -load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension", "tf_cc_binary") load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") +load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension", "tf_cc_binary") +load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement") package( default_visibility = ["//visibility:public"], @@ -470,8 +471,8 @@ cc_library( "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_config", "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_passes", "@org_tensorflow//tensorflow/compiler/mlir/tensorflow", - "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", - "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", + "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", ], ) @@ -486,13 +487,16 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@local_tsl//tsl/platform:statusor", "@org_tensorflow//tensorflow/compiler/mlir:op_or_arg_name_mapper", "@org_tensorflow//tensorflow/compiler/mlir/lite:flatbuffer_export", + "@org_tensorflow//tensorflow/compiler/mlir/lite/debug", "@org_tensorflow//tensorflow/compiler/mlir/lite/metrics:error_collector", "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "@org_tensorflow//tensorflow/compiler/mlir/lite/stablehlo:op_stat_pass", + "@org_tensorflow//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_util", "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:error_util", - "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_freeze_variables", - "@org_tensorflow//tensorflow/tsl/platform:statusor", + "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables", ], ) @@ -535,7 +539,7 @@ pybind_extension( "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:export_graphdef", "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:import_utils", "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", - "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", + "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", "@org_tensorflow//tensorflow/core:ops", "@pybind11", ], @@ -551,12 +555,15 @@ genrule( py_library( name = "converter", srcs = [ + "python/__init__.py", "python/converter.py", "python/util.py", ":tflite_schema_py", ], deps = [ ":_tf_tfl_flatbuffer", + lce_requirement("tensorflow"), + lce_requirement("flatbuffers"), ], ) diff --git a/larq_compute_engine/mlir/ir/lce_ops.h b/larq_compute_engine/mlir/ir/lce_ops.h index f19dd81b..0293e181 100644 --- a/larq_compute_engine/mlir/ir/lce_ops.h +++ b/larq_compute_engine/mlir/ir/lce_ops.h @@ -1,6 +1,7 @@ #ifndef LARQ_COMPUTE_ENGINE_MLIR_IR_LCE_OPS_H_ #define LARQ_COMPUTE_ENGINE_MLIR_IR_LCE_OPS_H_ +#include "mlir/Bytecode/BytecodeOpInterface.h" #include "mlir/Dialect/Quant/QuantTypes.h" #include "mlir/Interfaces/SideEffectInterfaces.h" diff --git 
diff --git a/larq_compute_engine/mlir/BUILD b/larq_compute_engine/mlir/BUILD
index d174313b..c74cc190 100644
--- a/larq_compute_engine/mlir/BUILD
+++ b/larq_compute_engine/mlir/BUILD
@@ -1,5 +1,6 @@
-load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension", "tf_cc_binary")
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library")
+load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension", "tf_cc_binary")
+load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -470,8 +471,8 @@ cc_library(
         "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_config",
         "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_passes",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow",
-        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tensorflow_passes",
-        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes",
     ],
 )
 
@@ -486,13 +487,16 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Pass",
+        "@local_tsl//tsl/platform:statusor",
         "@org_tensorflow//tensorflow/compiler/mlir:op_or_arg_name_mapper",
         "@org_tensorflow//tensorflow/compiler/mlir/lite:flatbuffer_export",
+        "@org_tensorflow//tensorflow/compiler/mlir/lite/debug",
         "@org_tensorflow//tensorflow/compiler/mlir/lite/metrics:error_collector",
         "@org_tensorflow//tensorflow/compiler/mlir/lite/quantization:quantization_config",
+        "@org_tensorflow//tensorflow/compiler/mlir/lite/stablehlo:op_stat_pass",
+        "@org_tensorflow//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_util",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:error_util",
-        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_saved_model_freeze_variables",
-        "@org_tensorflow//tensorflow/tsl/platform:statusor",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables",
     ],
 )
 
@@ -535,7 +539,7 @@ pybind_extension(
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:export_graphdef",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:import_utils",
         "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags",
-        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes",
+        "@org_tensorflow//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes",
         "@org_tensorflow//tensorflow/core:ops",
         "@pybind11",
     ],
@@ -551,12 +555,15 @@ genrule(
 py_library(
     name = "converter",
     srcs = [
+        "python/__init__.py",
         "python/converter.py",
         "python/util.py",
         ":tflite_schema_py",
     ],
     deps = [
         ":_tf_tfl_flatbuffer",
+        lce_requirement("tensorflow"),
+        lce_requirement("flatbuffers"),
     ],
 )
diff --git a/larq_compute_engine/mlir/ir/lce_ops.h b/larq_compute_engine/mlir/ir/lce_ops.h
index f19dd81b..0293e181 100644
--- a/larq_compute_engine/mlir/ir/lce_ops.h
+++ b/larq_compute_engine/mlir/ir/lce_ops.h
@@ -1,6 +1,7 @@
 #ifndef LARQ_COMPUTE_ENGINE_MLIR_IR_LCE_OPS_H_
 #define LARQ_COMPUTE_ENGINE_MLIR_IR_LCE_OPS_H_
 
+#include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/Dialect/Quant/QuantTypes.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
diff --git a/larq_compute_engine/mlir/python/common.cc b/larq_compute_engine/mlir/python/common.cc
index 83eb2f3c..a380a747 100644
--- a/larq_compute_engine/mlir/python/common.cc
+++ b/larq_compute_engine/mlir/python/common.cc
@@ -76,7 +76,7 @@ pybind11::bytes ConvertMLIRModuleToTFLiteFlatBuffer(
     const LCETarget target, const pybind11::object& default_ranges,
     const std::unordered_set<std::string>& saved_model_tags,
     llvm::StringRef saved_model_dir,
-    llvm::Optional<tensorflow::Session*> session, const int num_inputs,
+    std::optional<tensorflow::Session*> session, const int num_inputs,
     const bool should_quantize, const bool mark_as_post_training_quant) {
   mlir::quant::QuantizationSpecs quant_specs;
   if (should_quantize) {
@@ -86,9 +86,9 @@ pybind11::bytes ConvertMLIRModuleToTFLiteFlatBuffer(
     // we do that by default.
     quant_specs.inference_type = tensorflow::DT_QINT8;
     for (int i = 0; i < num_inputs; ++i) {
-      // Input inference type is DT_FLOAT, so set the default input ranges
-      // to llvm::None.
-      quant_specs.input_ranges.push_back({llvm::None, llvm::None});
+      // Input inference type is DT_FLOAT, so set the default input range to
+      // None.
+      quant_specs.input_ranges.push_back({std::nullopt, std::nullopt});
     }
     if (!default_ranges.is_none()) {
       // When there are no Quantize nodes in the graph then in the
diff --git a/larq_compute_engine/mlir/python/common.h b/larq_compute_engine/mlir/python/common.h
index 31b34e7f..e1f3c433 100644
--- a/larq_compute_engine/mlir/python/common.h
+++ b/larq_compute_engine/mlir/python/common.h
@@ -1,3 +1,5 @@
+#include <optional>
+
 #include "larq_compute_engine/mlir/transforms/passes.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
@@ -17,7 +19,7 @@ pybind11::bytes ConvertMLIRModuleToTFLiteFlatBuffer(
     const LCETarget target, const pybind11::object& default_ranges,
     const std::unordered_set<std::string>& saved_model_tags,
     llvm::StringRef saved_model_dir,
-    llvm::Optional<tensorflow::Session*> session, const int num_inputs,
+    std::optional<tensorflow::Session*> session, const int num_inputs,
     const bool should_quantize, const bool mark_as_post_training_quant);
 
 }  // namespace tensorflow
diff --git a/larq_compute_engine/mlir/python/graphdef_tfl_flatbuffer.cc b/larq_compute_engine/mlir/python/graphdef_tfl_flatbuffer.cc
index 220da098..6ada05a5 100644
--- a/larq_compute_engine/mlir/python/graphdef_tfl_flatbuffer.cc
+++ b/larq_compute_engine/mlir/python/graphdef_tfl_flatbuffer.cc
@@ -27,14 +27,13 @@ pybind11::bytes ConvertGraphDefToTFLiteFlatBuffer(
 
   auto target = GetLCETarget(target_str);
 
-  // `ParseInputArrayInfo` requires a type that isn't pybind compatible, so
-  // translate here.
-  std::vector<llvm::Optional<std::vector<int>>> translated_input_shapes;
+  // Convert empty shapes to `None`. We could also do that on the python side.
+  std::vector<std::optional<std::vector<int>>> translated_input_shapes;
   for (auto x : input_shapes) {
     if (x.size() > 0) {
       translated_input_shapes.push_back(x);
     } else {
-      translated_input_shapes.push_back(llvm::None);
+      translated_input_shapes.push_back(std::nullopt);
     }
   }
 
@@ -65,7 +64,7 @@ pybind11::bytes ConvertGraphDefToTFLiteFlatBuffer(
   return ConvertMLIRModuleToTFLiteFlatBuffer(
       &module.value(), context, target, default_ranges,
       /*saved_model_tags=*/{},
-      /*saved_model_dir=*/"", /*session=*/llvm::None, input_arrays.size(),
+      /*saved_model_dir=*/"", /*session=*/std::nullopt, input_arrays.size(),
       should_quantize, /*mark_as_post_training_quant=*/false);
 }
diff --git a/larq_compute_engine/mlir/tests/bitpack-weights.mlir b/larq_compute_engine/mlir/tests/bitpack-weights.mlir
index bb419438..96905736 100644
--- a/larq_compute_engine/mlir/tests/bitpack-weights.mlir
+++ b/larq_compute_engine/mlir/tests/bitpack-weights.mlir
@@ -7,6 +7,6 @@ func.func @bitpack_bconv2d_filters(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor
   return %0 : tensor<256x30x30x16xf32>
 
   // CHECK: %cst = arith.constant dense<0> : tensor<16x3x3x1xi32>
-  // CHECK: %0 = "lq.Bconv2d"(%arg0, %cst, %arg1, %arg2, %arg3) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<16x3x3x1xi32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32>
+  // CHECK: %0 = "lq.Bconv2d"(%arg0, %cst, %arg1, %arg2, %arg3) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<16x3x3x1xi32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32>
   // CHECK-NEXT: return %0
 }
diff --git a/larq_compute_engine/mlir/tests/const-fold.mlir b/larq_compute_engine/mlir/tests/const-fold.mlir
index db8ded4e..984faa12 100644
--- a/larq_compute_engine/mlir/tests/const-fold.mlir
+++ b/larq_compute_engine/mlir/tests/const-fold.mlir
@@ -8,8 +8,8 @@ func.func @quantize() -> (tensor<1x1x2x1xi32>, tensor<1x1x2x1xi32>) {
   %1 = "lq.Quantize"(%neg) {} : (tensor<1x1x2x32xf32>) -> tensor<1x1x2x1xi32>
   return %0, %1 : tensor<1x1x2x1xi32>, tensor<1x1x2x1xi32>
 
-  // CHECK: %[[neg:.*]] = arith.constant dense<-1> : tensor<1x1x2x1xi32>
   // CHECK: %[[pos:.*]] = arith.constant dense<0> : tensor<1x1x2x1xi32>
+  // CHECK: %[[neg:.*]] = arith.constant dense<-1> : tensor<1x1x2x1xi32>
   // CHECK: return %[[pos]], %[[neg]] : tensor<1x1x2x1xi32>, tensor<1x1x2x1xi32>
 }
 
@@ -21,7 +21,7 @@ func.func @dequantize() -> (tensor<1x1x2x32xf32>, tensor<1x1x2x32xf32>) {
   %1 = "lq.Dequantize"(%neg) {} : (tensor<1x1x2x1xi32>) -> tensor<1x1x2x32xf32>
   return %0, %1 : tensor<1x1x2x32xf32>, tensor<1x1x2x32xf32>
 
-  // CHECK: %[[neg:.*]] = arith.constant dense<-1.000000e+00> : tensor<1x1x2x32xf32>
   // CHECK: %[[pos:.*]] = arith.constant dense<1.000000e+00> : tensor<1x1x2x32xf32>
+  // CHECK: %[[neg:.*]] = arith.constant dense<-1.000000e+00> : tensor<1x1x2x32xf32>
   // CHECK: return %[[pos]], %[[neg]] : tensor<1x1x2x32xf32>, tensor<1x1x2x32xf32>
 }
diff --git a/larq_compute_engine/mlir/tests/lce_ops_options_test.cc b/larq_compute_engine/mlir/tests/lce_ops_options_test.cc
index 381435b1..c4cd0853 100644
--- a/larq_compute_engine/mlir/tests/lce_ops_options_test.cc
+++ b/larq_compute_engine/mlir/tests/lce_ops_options_test.cc
IntegerAttr getIntegerAttr(Builder builder, int value) { TEST(LCEOpsSerializationTest, QuantizeTest) { MLIRContext context; context.getOrLoadDialect<lq::LarqDialect>(); - auto* op = Operation::create( - UnknownLoc::get(&context), OperationName("lq.Quantize", &context), - llvm::None, llvm::None, llvm::None, llvm::None, 0); + OperationState state(UnknownLoc::get(&context), + OperationName("lq.Quantize", &context)); + mlir::Operation* op = Operation::create(state); ASSERT_EQ(cast<lq::QuantizeOp>(op).buildCustomOptions().size(), 0); } @@ -26,9 +26,9 @@ TEST(LCEOpsSerializationTest, QuantizeTest) { TEST(LCEOpsSerializationTest, DequantizeTest) { MLIRContext context; context.getOrLoadDialect<lq::LarqDialect>(); - auto* op = Operation::create( - UnknownLoc::get(&context), OperationName("lq.Dequantize", &context), - llvm::None, llvm::None, llvm::None, llvm::None, 0); + OperationState state(UnknownLoc::get(&context), + OperationName("lq.Dequantize", &context)); + mlir::Operation* op = Operation::create(state); ASSERT_EQ(cast<lq::DequantizeOp>(op).buildCustomOptions().size(), 0); } @@ -37,9 +37,9 @@ TEST(LCEOpsSerializationTest, BConv2dTest) { MLIRContext context; context.getOrLoadDialect<lq::LarqDialect>(); Builder builder(&context); - auto op = Operation::create(UnknownLoc::get(&context), - OperationName("lq.Bconv2d", &context), llvm::None, - llvm::None, llvm::None, llvm::None, 0); + OperationState state(UnknownLoc::get(&context), + OperationName("lq.Bconv2d", &context)); + mlir::Operation* op = Operation::create(state); op->setAttr("channels_in", getIntegerAttr(builder, 64)); op->setAttr("dilation_height_factor", getIntegerAttr(builder, 3)); @@ -69,9 +69,9 @@ TEST(LCEOpsSerializationTest, BMaxPool2dTest) { MLIRContext context; context.getOrLoadDialect<lq::LarqDialect>(); Builder builder(&context); - auto op = Operation::create( - UnknownLoc::get(&context), OperationName("lq.BMaxPool2d", &context), - llvm::None, llvm::None, llvm::None, llvm::None, 0); + OperationState state(UnknownLoc::get(&context), + OperationName("lq.BMaxPool2d", &context)); + mlir::Operation* op = Operation::create(state); op->setAttr("padding", builder.getStringAttr("SAME")); op->setAttr("stride_width", getIntegerAttr(builder, 2)); diff --git a/larq_compute_engine/mlir/tests/legalize-lce.mlir b/larq_compute_engine/mlir/tests/legalize-lce.mlir index 21767a41..cb6a62ee 100644 --- a/larq_compute_engine/mlir/tests/legalize-lce.mlir +++ b/larq_compute_engine/mlir/tests/legalize-lce.mlir @@ -9,7 +9,7 @@ func.func @legalize_bconv2d(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor<16x3x3 // CHECK: %0 = "tfl.custom"(%arg0, %arg1, %arg2, %arg3, %arg4) {custom_code = "LceBconv2d", custom_option = #tfl} : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32> // CHECK-NEXT: return %0 - // TRANSLATE: %0 = "lq.Bconv2d"(%arg0, %arg1, %arg2, %arg3, %arg4) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32> + // TRANSLATE: %0 = "lq.Bconv2d"(%arg0, %arg1, %arg2, %arg3, %arg4) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x30x30x16xf32> //
TRANSLATE-NEXT: return %0 : tensor<256x30x30x16xf32> } @@ -21,7 +21,7 @@ func.func @legalize_bmax_pool2d(%arg0: tensor<256x32x32x3xi32>) -> tensor<256x16 // CHECK: %0 = "tfl.custom"(%arg0) {custom_code = "LceBMaxPool2d", custom_option = #tfl} : (tensor<256x32x32x3xi32>) -> tensor<256x16x16x3xi32> // CHECK-NEXT: return %0 - // TRANSLATE: %0 = "lq.BMaxPool2d"(%arg0) {filter_height = 2 : i32, filter_width = 2 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 2 : i32} : (tensor<256x32x32x3xi32>) -> tensor<256x16x16x3xi32> + // TRANSLATE: %0 = "lq.BMaxPool2d"(%arg0) <{filter_height = 2 : i32, filter_width = 2 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 2 : i32}> : (tensor<256x32x32x3xi32>) -> tensor<256x16x16x3xi32> // TRANSLATE-NEXT: return %0 : tensor<256x16x16x3xi32> } diff --git a/larq_compute_engine/mlir/tests/optimize.mlir b/larq_compute_engine/mlir/tests/optimize.mlir index c1f0efda..6b2b06a7 100644 --- a/larq_compute_engine/mlir/tests/optimize.mlir +++ b/larq_compute_engine/mlir/tests/optimize.mlir @@ -109,7 +109,7 @@ func.func @fuse_relu_into_bconv2d(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor< %1 = "tfl.relu"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: return %0 } @@ -121,7 +121,7 @@ func.func @fuse_relu6_into_bconv2d(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor %1 = "tfl.relu6"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU6", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU6", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: return %0 } @@ -133,7 +133,7 @@ func.func @fuse_relu1_into_bconv2d(%arg0: tensor<256x32x32x1xi32>, %arg1: tensor %1 = "tfl.relu_n1_to_1"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU_N1_TO_1", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU_N1_TO_1", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: return %0 } @@ -145,7 +145,7 @@ func.func 
@fuse_relu_into_bconv2d_padding_same_one(%arg0: tensor<256x32x32x1xi32 %1 = "tfl.relu"(%0) : (tensor<256x32x32x16xf32>) -> tensor<256x32x32x16xf32> return %1 : tensor<256x32x32x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "RELU", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: return %0 } @@ -157,7 +157,7 @@ func.func @do_not_fuse_relu_into_bconv2d_padding_same_zero(%arg0: tensor<256x32x %1 = "tfl.relu"(%0) : (tensor<256x32x32x16xf32>) -> tensor<256x32x32x16xf32> return %1 : tensor<256x32x32x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: %1 = "tfl.relu"(%0) // CHECK-NEXT: return %1 } @@ -170,7 +170,7 @@ func.func @do_not_fuse_relu_into_bconv2d_no_post_activation_bias(%arg0: tensor<2 %1 = "tfl.relu"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: %1 = "tfl.relu"(%0) // CHECK-NEXT: return %1 } @@ -183,7 +183,7 @@ func.func @do_not_fuse_relu_into_bconv2d_no_post_activation_multiplier(%arg0: te %1 = "tfl.relu"(%0) : (tensor<256x30x30x16xf32>) -> tensor<256x30x30x16xf32> return %1 : tensor<256x30x30x16xf32> - // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} + // CHECK: %0 = "lq.Bconv2d"(%arg0, %arg1, %cst, %cst_0, %arg2) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> // CHECK-NEXT: %1 = "tfl.relu"(%0) // CHECK-NEXT: return %1 } @@ -195,7 +195,7 @@ func.func @target_specific_reorder_maxpool_2d_quantize(%arg0: tensor<256x32x32x6 return %1 : tensor<256x16x8x3xi32> // CHECK-ARM-NEXT: %0 = "lq.Quantize"(%arg0) : (tensor<256x32x32x65xf32>) -> tensor<256x32x32x3xi32> - // 
CHECK-ARM-NEXT: %1 = "lq.BMaxPool2d"(%0) {filter_height = 3 : i32, filter_width = 2 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 4 : i32} : (tensor<256x32x32x3xi32>) -> tensor<256x16x8x3xi32> + // CHECK-ARM-NEXT: %1 = "lq.BMaxPool2d"(%0) <{filter_height = 3 : i32, filter_width = 2 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 4 : i32}> : (tensor<256x32x32x3xi32>) -> tensor<256x16x8x3xi32> // CHECK-ARM-NEXT: return %1 // CHECK-XCORE-NEXT: %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 2 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 2 : i32, stride_w = 4 : i32} : (tensor<256x32x32x65xf32>) -> tensor<256x16x8x65xf32> @@ -236,7 +236,7 @@ func.func @bitpack_activation_thresholds_with_negative_post_multipliers(%arg0: t // Verify correct thresholds. These have been manually computed. // CHECK-NEXT: %cst_0 = arith.constant dense<[0, 3, 2, 2, -2147483648, 2, 1, 2]> : tensor<8xi32> - // CHECK-NEXT: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) {channels_in = 1 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<8x2x2x1xf32>, none, none, tensor<8xi32>) -> tensor<256x32x32x1xi32> + // CHECK-NEXT: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) <{channels_in = 1 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<8x2x2x1xf32>, none, none, tensor<8xi32>) -> tensor<256x32x32x1xi32> // CHECK-NEXT: return %1 } @@ -250,7 +250,7 @@ func.func @bitpack_activations_valid_padding(%arg0: tensor<256x32x32x1xi32>) -> %2 = "lq.Quantize"(%1) : (tensor<256x30x30x65xf32>) -> tensor<256x30x30x3xi32> return %2 : tensor<256x30x30x3xi32> - // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, none, none, tensor<65xi32>) -> tensor<256x30x30x3xi32> + // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, none, none, tensor<65xi32>) -> tensor<256x30x30x3xi32> // CHECK-NEXT: return %1 } @@ -264,7 +264,7 @@ func.func @bitpack_activations_same_one_padding(%arg0: tensor<256x32x32x1xi32>) %2 = "lq.Quantize"(%1) : (tensor<256x32x32x65xf32>) -> tensor<256x32x32x3xi32> return %2 : tensor<256x32x32x3xi32> - // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, none, none, tensor<65xi32>) -> tensor<256x32x32x3xi32> + // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %0, %0, %cst_0) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = 
"NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, none, none, tensor<65xi32>) -> tensor<256x32x32x3xi32> // CHECK-NEXT: return %1 } @@ -278,7 +278,7 @@ func.func @do_not_bitpack_activations_same_zero_padding(%arg0: tensor<256x32x32x %2 = "lq.Quantize"(%1) : (tensor<256x32x32x65xf32>) -> tensor<256x32x32x3xi32> return %2 : tensor<256x32x32x3xi32> - // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %cst_0, %cst_1, %0) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, tensor<65xf32>, tensor<65xf32>, none) -> tensor<256x32x32x65xf32> + // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %cst_0, %cst_1, %0) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, tensor<65xf32>, tensor<65xf32>, none) -> tensor<256x32x32x65xf32> // CHECK-NEXT: %2 = "lq.Quantize"(%1) : (tensor<256x32x32x65xf32>) -> tensor<256x32x32x3xi32> // CHECK-NEXT: return %2 } @@ -293,7 +293,7 @@ func.func @do_not_bitpack_activations_multiple_uses(%arg0: tensor<256x32x32x1xi3 %2 = "lq.Quantize"(%1) : (tensor<256x30x30x65xf32>) -> tensor<256x30x30x3xi32> return %1, %2: tensor<256x30x30x65xf32>, tensor<256x30x30x3xi32> - // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %cst_0, %cst_1, %0) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, tensor<65xf32>, tensor<65xf32>, none) -> tensor<256x30x30x65xf32> + // CHECK: %1 = "lq.Bconv2d"(%arg0, %cst, %cst_0, %cst_1, %0) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<256x32x32x1xi32>, tensor<65x3x3x3xf32>, tensor<65xf32>, tensor<65xf32>, none) -> tensor<256x30x30x65xf32> // CHECK-NEXT: %2 = "lq.Quantize"(%1) : (tensor<256x30x30x65xf32>) -> tensor<256x30x30x3xi32> // CHECK-NEXT: return %1, %2 } diff --git a/larq_compute_engine/mlir/tests/prepare-tf.mlir b/larq_compute_engine/mlir/tests/prepare-tf.mlir index 22fa7194..31a54106 100644 --- a/larq_compute_engine/mlir/tests/prepare-tf.mlir +++ b/larq_compute_engine/mlir/tests/prepare-tf.mlir @@ -84,12 +84,13 @@ func.func @fuse_bconv2d_valid_padding(%arg0: tensor<1x112x112x1xi32>) -> tensor< %1 = "tf.Conv2D"(%0, %cst) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x112x112x2xf32>, tensor<1x3x2x2xf32>) -> tensor<1x112x110x2xf32> return %1 : tensor<1x112x110x2xf32> - // CHECK: %cst = arith.constant - // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<2xf32> - // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> // CHECK: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[transpose:.*]] = "tf.Transpose" - // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], 
%[[output_threshold:.*]]) {channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x112x112x1xi32>, tensor<2x1x3x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x110x2xf32> + // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> + // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<2xf32> + // CHECK: %[[weights:.*]] = arith.constant + // CHECK: %[[transpose_idx:.*]] = arith.constant {{.*}} tensor<4xi32> + // CHECK: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) <{channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x112x112x1xi32>, tensor<2x1x3x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x110x2xf32> // CHECK-NEXT: return %[[conv]] } @@ -100,12 +101,13 @@ func.func @target_specific_fuse_bconv2d_same_zero_padding(%arg0: tensor<1x112x11 %1 = "tf.Conv2D"(%0, %cst) {padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x112x112x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x112x112x2xf32> return %1 : tensor<1x112x112x2xf32> - // CHECK-ARM: %cst = arith.constant - // CHECK-ARM: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<2xf32> - // CHECK-ARM: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> // CHECK-ARM: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK-ARM: %[[transpose:.*]] = "tf.Transpose" - // CHECK-ARM-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) {channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x112x112x1xi32>, tensor<2x1x2x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x112x2xf32> + // CHECK-ARM: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> + // CHECK-ARM: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<2xf32> + // CHECK-ARM: %[[weights:.*]] = arith.constant + // CHECK-ARM: %[[transpose_idx:.*]] = arith.constant {{.*}} tensor<4xi32> + // CHECK-ARM: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-ARM-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) <{channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "SAME", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x112x112x1xi32>, tensor<2x1x2x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x112x2xf32> // CHECK-ARM-NEXT: return %[[conv]] // CHECK-XCORE: %0 = "lq.Dequantize" @@ -120,12 +122,13 @@ func.func @fuse_bconv2d_grouped_convolution(%arg0: tensor<1x112x112x4xi32>) -> t %1 = "tf.Conv2D"(%0, %cst) {padding = "VALID", strides = [1, 1, 1, 1]} 
: (tensor<1x112x112x128xf32>, tensor<3x3x64x16xf32>) -> tensor<1x110x110x16xf32> return %1 : tensor<1x110x110x16xf32> - // CHECK: %cst = arith.constant - // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<16xf32> - // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> // CHECK: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[transpose:.*]] = "tf.Transpose" - // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) {channels_in = 128 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x112x112x4xi32>, tensor<16x3x3x64xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<1x110x110x16xf32> + // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> + // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<16xf32> + // CHECK: %[[weights:.*]] = arith.constant + // CHECK: %[[transpose_idx:.*]] = arith.constant {{.*}} tensor<4xi32> + // CHECK: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) <{channels_in = 128 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x112x112x4xi32>, tensor<16x3x3x64xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<1x110x110x16xf32> // CHECK-NEXT: return %[[conv]] } @@ -148,12 +151,13 @@ func.func @fuse_scaled_bconv2d(%arg0: tensor<1x112x112x1xi32>) -> tensor<1x112x1 %1 = "tf.Conv2D"(%0, %cst) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x112x112x2xf32>, tensor<1x3x2x2xf32>) -> tensor<1x112x110x2xf32> return %1 : tensor<1x112x110x2xf32> - // CHECK: %cst = arith.constant - // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<[3.000000e-01, 1.000000e-01]> : tensor<2xf32> - // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> // CHECK: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[transpose:.*]] = "tf.Transpose" - // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) {channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x112x112x1xi32>, tensor<2x1x3x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x110x2xf32> + // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<2xf32> + // CHECK: %[[transpose_idx:.*]] = arith.constant {{.*}} tensor<4xi32> + // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<[3.000000e-01, 1.000000e-01]> : tensor<2xf32> + // CHECK: %[[weights:.*]] = "tf.Div" + // CHECK: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], 
%[[output_threshold:.*]]) <{channels_in = 2 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x112x112x1xi32>, tensor<2x1x3x2xf32>, tensor<2xf32>, tensor<2xf32>, none) -> tensor<1x112x110x2xf32> // CHECK-NEXT: return %[[conv]] } @@ -169,11 +173,13 @@ func.func @fuse_dilated_bconv(%arg0: tensor<1x128x128x1xi32>) -> tensor<1x128x12 %3 = "tf.BatchToSpaceND"(%2, %cst, %cst_0) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> return %3 : tensor<1x128x128x8xf32> - // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<8xf32> - // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> // CHECK: %[[output_threshold:.*]] = "tfl.no_value"() {value} : () -> none - // CHECK: %[[transpose:.*]] = "tf.Transpose" - // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) {channels_in = 3 : i32, dilation_height_factor = 2 : i32, dilation_width_factor = 2 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32} : (tensor<1x128x128x1xi32>, tensor<8x5x5x3xf32>, tensor<8xf32>, tensor<8xf32>, none) -> tensor<1x128x128x8xf32> + // CHECK: %[[post_activation_bias:.*]] = arith.constant dense<0.000000e+00> : tensor<8xf32> + // CHECK: %[[post_activation_multiplier:.*]] = arith.constant dense<1.000000e+00> : tensor<8xf32> + // CHECK: %[[weights:.*]] = arith.constant + // CHECK: %[[transpose_idx:.*]] = arith.constant + // CHECK: %[[transpose:.*]] = "tf.Transpose"(%[[weights]], %[[transpose_idx]]) + // CHECK-NEXT: %[[conv:.*]] = "lq.Bconv2d"(%arg0, %[[transpose]], %[[post_activation_multiplier]], %[[post_activation_bias]], %[[output_threshold:.*]]) <{channels_in = 3 : i32, dilation_height_factor = 2 : i32, dilation_width_factor = 2 : i32, fused_activation_function = "NONE", pad_values = 0 : i32, padding = "VALID", stride_height = 1 : i32, stride_width = 1 : i32}> : (tensor<1x128x128x1xi32>, tensor<8x5x5x3xf32>, tensor<8xf32>, tensor<8xf32>, none) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return %[[conv]] : tensor<1x128x128x8xf32> } @@ -213,11 +219,11 @@ func.func @fuse_bconv2d_same_one_padding(%arg0: tensor<256x32x32x1xi32>) -> tens %2 = "tf.Conv2D"(%1, %cst) {padding = "VALID", strides = [1, 2, 2, 1]} : (tensor<256x34x34x3xf32>, tensor<3x3x3x16xf32>) -> tensor<256x16x16x16xf32> return %2 : tensor<256x16x16x16xf32> - // CHECK: %[[CST1:.*]] = arith.constant dense<1.000000e+00> : tensor<16xf32> - // CHECK: %[[CST2:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> // CHECK: %[[CST3:.*]] = "tfl.no_value"() {value} : () -> none + // CHECK: %[[CST2:.*]] = arith.constant dense<0.000000e+00> : tensor<16xf32> + // CHECK: %[[CST1:.*]] = arith.constant dense<1.000000e+00> : tensor<16xf32> // CHECK: %[[TRP:.*]] = "tf.Transpose" - // CHECK: %[[CONV:.*]] = "lq.Bconv2d"(%arg0, %[[TRP]], %[[CST1]], %[[CST2]], %[[CST3:.*]]) {channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 2 : i32} : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x16x16x16xf32> + // CHECK: %[[CONV:.*]] = "lq.Bconv2d"(%arg0, 
%[[TRP]], %[[CST1]], %[[CST2]], %[[CST3:.*]]) <{channels_in = 3 : i32, dilation_height_factor = 1 : i32, dilation_width_factor = 1 : i32, fused_activation_function = "NONE", pad_values = 1 : i32, padding = "SAME", stride_height = 2 : i32, stride_width = 2 : i32}> : (tensor<256x32x32x1xi32>, tensor<16x3x3x3xf32>, tensor<16xf32>, tensor<16xf32>, none) -> tensor<256x16x16x16xf32> } // CHECK-LABEL: @do_not_fuse_bconv2d_padding_same_twice diff --git a/larq_compute_engine/mlir/tf_tfl_passes.cc b/larq_compute_engine/mlir/tf_tfl_passes.cc index 5cc1dbf5..7139955a 100644 --- a/larq_compute_engine/mlir/tf_tfl_passes.cc +++ b/larq_compute_engine/mlir/tf_tfl_passes.cc @@ -108,7 +108,7 @@ void AddPreVariableFreezingTFToLCETFLConversionPasses( // This decomposes resource ops like ResourceGather into read-variable op // followed by gather. This is used when the saved model import path is used - // during which resources dont get frozen in the python layer. + // during which resources don't get frozen in the python layer. pass_manager->addNestedPass<mlir::func::FuncOp>( mlir::TFDevice::CreateDecomposeResourceOpsPass()); @@ -257,7 +257,9 @@ void AddPostVariableFreezingTFToLCETFLConversionPasses( // Run quantization after all the floating point model conversion is // completed. - if (quant_specs.RunPropagationAndRewriteQuantizationPasses()) { + if (quant_specs.RunPropagationAndRewriteQuantizationPasses() || + quant_specs.qdq_conversion_mode != + mlir::quant::QDQConversionMode::kQDQNone) { AddQuantizationPasses(quant_specs, *pass_manager); // Remove unnecessary QDQs while handling QAT models. pass_manager->addNestedPass<mlir::func::FuncOp>( diff --git a/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.cc b/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.cc index 90141a73..be080cef 100644 --- a/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.cc +++ b/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.cc @@ -3,15 +3,19 @@ #include "larq_compute_engine/mlir/tf_tfl_passes.h" #include "larq_compute_engine/mlir/transforms/passes.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/Pass/PassManager.h" +#include "tensorflow/compiler/mlir/lite/debug/debug.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/op.h" -#include "tensorflow/tsl/platform/statusor.h" +#include "tsl/platform/statusor.h" namespace tensorflow { namespace { @@ -55,79 +59,68 @@ class TruncateOpOrArgLocNameMapper : public OpOrArgLocNameMapper { }; } // namespace -Status ConvertTFExecutorToTFLOrFlatbuffer( +absl::Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, const LCETarget target, mlir::quant::QuantizationSpecs quant_specs, const std::unordered_set<std::string>& saved_model_tags, llvm::StringRef saved_model_dir, - llvm::Optional<tensorflow::Session*> session, std::string* result) { + std::optional<tensorflow::Session*> session, std::string* result) { // Explicitly disable dumping Op details on failures. module.getContext()->printOpOnDiagnostic(false); - // Register a warning handler only log to std out.
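Editorial aside — the hunk continues below: the custom warning-logging diagnostic handler is deleted, and the converter instead registers the func-dialect extensions up front (newer MLIR moves, e.g., the inliner hooks for the func dialect into an extension that must be registered explicitly). In isolation, the registration pattern looks like this sketch; the helper name is hypothetical:

#include "mlir/Dialect/Func/Extensions/AllExtensions.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"

void RegisterFuncExtensions(mlir::MLIRContext& context) {
  mlir::DialectRegistry registry;
  mlir::func::registerAllExtensions(registry);
  // appendDialectRegistry may be called on a context that is already in
  // use; already-loaded dialects pick up the newly registered extensions.
  context.appendDialectRegistry(registry);
}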
- mlir::ScopedDiagnosticHandler s( - module.getContext(), [](mlir::Diagnostic& diag) { - if (diag.getSeverity() == mlir::DiagnosticSeverity::Warning) { - for (auto& note : diag.getNotes()) { - std::cout << note.str() << "\n"; - LOG(WARNING) << note.str() << "\n"; - } - } - return mlir::failure(); - }); + mlir::DialectRegistry registry; + mlir::func::registerAllExtensions(registry); + module.getContext()->appendDialectRegistry(registry); mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), /*propagate=*/true); - if (failed(IsValidGraph(module))) { - return statusHandler.ConsumeStatus(); - } - mlir::PassManager pass_manager(module.getContext()); + mlir::registerPassManagerCLOptions(); if (mlir::failed(mlir::applyPassManagerCLOptions(pass_manager))) { - // We don't return here as in the normal TF converter, since apparently this - // actually fails in our case, but the failure isn't terminal. - // return tensorflow::FromAbslStatus( - // absl::UnknownError("failed to apply MLIR pass manager CL options")); + return absl::InternalError("Failed to apply MLIR pass manager CL options."); } + // DebugOptions::ir_dump_dir can be set for debugging + converter::DebugOptions debug_options; + InitPassManager(pass_manager, debug_options); + pass_manager.addInstrumentation( std::make_unique<mlir::TFL::ErrorCollectorInstrumentation>( pass_manager.getContext())); + if (mlir::failed(IsValidGraph(module))) { + return statusHandler.ConsumeStatus(); + } tensorflow::AddPreVariableFreezingTFToLCETFLConversionPasses(&pass_manager); - if (failed(pass_manager.run(module))) { + if (mlir::failed(pass_manager.run(module))) { return statusHandler.ConsumeStatus(); } // Freeze variables if a session is provided. - if (session.has_value()) { - mlir::TFL::ErrorCollectorInstrumentation collector(module.getContext()); - if (mlir::failed( - mlir::tf_saved_model::FreezeVariables(module, session.value()))) { - auto status = statusHandler.ConsumeStatus(); - mlir::TFL::ErrorCollector* collector = - mlir::TFL::ErrorCollector::GetErrorCollector(); - if (!collector->CollectedErrors().empty()) { - return errors::InvalidArgument("Variable constant folding has failed."); - } - return status; - } + if (session.has_value() && mlir::failed(mlir::tf_saved_model::FreezeVariables( module, session.value_or(nullptr)))) { + return statusHandler.Combine( + absl::InvalidArgumentError("Variable constant folding failed.")); } + pass_manager.clear(); + tensorflow::AddPostVariableFreezingTFToLCETFLConversionPasses( saved_model_dir, quant_specs, &pass_manager, target); - if (failed(pass_manager.run(module))) { - auto status = statusHandler.ConsumeStatus(); - mlir::TFL::ErrorCollector* collector = - mlir::TFL::ErrorCollector::GetErrorCollector(); - for (const auto& error_data : collector->CollectedErrors()) { - if (error_data.subcomponent() == "FreezeGlobalTensorsPass") { - return errors::InvalidArgument("Variable constant folding is failed."); - } - } - return status; + if (mlir::failed(pass_manager.run(module))) { + return statusHandler.Combine( + absl::InvalidArgumentError("Variable constant folding failed.")); } if (export_to_mlir) { + pass_manager.clear(); + // Print out a detailed report of ops that are not converted to TFL ops.
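Editorial aside — alongside the op-statistics report added just below for the export_to_mlir path, the conversion function now returns absl::Status, with statusHandler.Combine(...) folding collected MLIR diagnostics into the failure status. A hedged sketch of only the absl side of that contract; CheckPassResult is a hypothetical name:

#include "absl/status/status.h"

absl::Status CheckPassResult(bool pass_succeeded) {
  if (!pass_succeeded) {
    // In the patch, statusHandler.Combine(absl::InvalidArgumentError(...))
    // additionally merges any diagnostics emitted by the failing passes.
    return absl::InvalidArgumentError("Variable constant folding failed.");
  }
  return absl::OkStatus();
}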
+ pass_manager.addPass(mlir::odml::createPrintOpStatsPass( + mlir::odml::GetAcceptedTFLiteDialects())); + if (mlir::failed(pass_manager.run(module))) { + return statusHandler.ConsumeStatus(); + } + llvm::raw_string_ostream os(*result); module.print(os); return statusHandler.ConsumeStatus(); @@ -142,14 +135,18 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( options.toco_flags = toco_flags; options.saved_model_tags = saved_model_tags; options.op_or_arg_name_mapper = &op_or_arg_name_mapper; - if (!tflite::MlirToFlatBufferTranslateFunction(module, options, result)) { - return statusHandler.ConsumeStatus(); + const bool serialize_stablehlo_ops = false; + if (!tflite::MlirToFlatBufferTranslateFunction(module, options, result, + serialize_stablehlo_ops)) { + return statusHandler.Combine( + absl::InternalError("Could not translate MLIR to FlatBuffer.")); } - if (mlir::failed(module.verify())) { - return tensorflow::errors::Unknown("Final module is invalid"); + if (mlir::failed(module.verifyInvariants())) { + return statusHandler.Combine( + absl::InternalError("Final module is invalid.")); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.h b/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.h index cf89d353..e40eec8b 100644 --- a/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.h +++ b/larq_compute_engine/mlir/tf_to_tfl_flatbuffer.h @@ -1,24 +1,26 @@ #ifndef LARQ_COMPUTE_ENGINE_MLIR_TF_TO_TFL_FLATBUFFER_H_ #define LARQ_COMPUTE_ENGINE_MLIR_TF_TO_TFL_FLATBUFFER_H_ +#include <optional> #include <unordered_set> #include "larq_compute_engine/mlir/transforms/passes.h" #include "mlir/IR/BuiltinOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/core/public/session.h" -#include "tensorflow/tsl/platform/statusor.h" +#include "tsl/platform/statusor.h" + namespace tensorflow { // This is a fork of ConvertTFExecutorToTFLOrFlatbuffer to enable custom // OpOrArgLocNameMapper // https://github.com/tensorflow/tensorflow/blob/v2.8.0/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h#L60-L78 -Status ConvertTFExecutorToTFLOrFlatbuffer( +absl::Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, const LCETarget target, mlir::quant::QuantizationSpecs quant_specs, const std::unordered_set<std::string>& saved_model_tags, llvm::StringRef saved_model_dir, - llvm::Optional<tensorflow::Session*> session, std::string* result); + std::optional<tensorflow::Session*> session, std::string* result); } // namespace tensorflow #endif // LARQ_COMPUTE_ENGINE_MLIR_TF_TO_TFL_FLATBUFFER_H_ diff --git a/larq_compute_engine/mlir/transforms/bitpack_activations_patterns.td b/larq_compute_engine/mlir/transforms/bitpack_activations_patterns.td index c8dda3c2..f6beab39 100644 --- a/larq_compute_engine/mlir/transforms/bitpack_activations_patterns.td +++ b/larq_compute_engine/mlir/transforms/bitpack_activations_patterns.td @@ -55,6 +55,6 @@ class WriteBitpackedActivationsPat; - [(HasOneUse $output)], (addBenefit 100)>; + [(HasOneUse $output)]>; def : WriteBitpackedActivationsPat; def : WriteBitpackedActivationsPat; diff --git a/larq_compute_engine/mlir/transforms/fuse_padding.td b/larq_compute_engine/mlir/transforms/fuse_padding.td index 0aab22ae..57d7be0a 100644 --- a/larq_compute_engine/mlir/transforms/fuse_padding.td +++ b/larq_compute_engine/mlir/transforms/fuse_padding.td @@ -43,8 +43,7 @@ def : Pat<(TFL_Conv2DOp:$conv_output [(HasOneUse $pad_output), (NoBatchAndChannelPadding $paddings), (SamePaddingHeight $paddings, $input, $conv_output, $stride_h), - (SamePaddingWidth $paddings, $input, $conv_output, 
$stride_w)], - (addBenefit 100)>; + (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)]>; // PadV2 > Conv2D @@ -74,8 +73,7 @@ def : Pat<(TFL_Conv2DOp:$conv_output (ConstFloatValueIs<"0.0"> $pad_values), (NoBatchAndChannelPadding $paddings), (SamePaddingHeight $paddings, $input, $conv_output, $stride_h), - (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)], - (addBenefit 100)>; + (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)]>; // Pad > DepthwiseConv2D def : Pat<(TFL_DepthwiseConv2DOp:$conv_output @@ -104,8 +102,7 @@ def : Pat<(TFL_DepthwiseConv2DOp:$conv_output [(HasOneUse $pad_output), (NoBatchAndChannelPadding $paddings), (SamePaddingHeight $paddings, $input, $conv_output, $stride_h), - (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)], - (addBenefit 100)>; + (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)]>; // PadV2 > DepthwiseConv2D def : Pat<(TFL_DepthwiseConv2DOp:$conv_output @@ -136,5 +133,4 @@ def : Pat<(TFL_DepthwiseConv2DOp:$conv_output (ConstFloatValueIs<"0.0"> $pad_values), (NoBatchAndChannelPadding $paddings), (SamePaddingHeight $paddings, $input, $conv_output, $stride_h), - (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)], - (addBenefit 100)>; + (SamePaddingWidth $paddings, $input, $conv_output, $stride_w)]>; diff --git a/larq_compute_engine/mlir/transforms/optimize.cc b/larq_compute_engine/mlir/transforms/optimize.cc index 8b43a790..9646a3d0 100644 --- a/larq_compute_engine/mlir/transforms/optimize.cc +++ b/larq_compute_engine/mlir/transforms/optimize.cc @@ -4,7 +4,6 @@ #include "larq_compute_engine/mlir/ir/lce_ops.h" #include "larq_compute_engine/mlir/transforms/common.h" #include "larq_compute_engine/mlir/transforms/passes.h" -#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" #include "mlir/Dialect/Func/IR/FuncOps.h" diff --git a/larq_compute_engine/mlir/transforms/optimize_patterns_common.td b/larq_compute_engine/mlir/transforms/optimize_patterns_common.td index 27c8de45..9bc11fa0 100644 --- a/larq_compute_engine/mlir/transforms/optimize_patterns_common.td +++ b/larq_compute_engine/mlir/transforms/optimize_patterns_common.td @@ -13,13 +13,13 @@ def HasOneUse : Constraint>; class ConstantValue : AttrConstraint>; +// This pattern has priority (addBenefit) over the more generic pattern below def : Pat<(LQ_QuantizeOp (TFL_GreaterEqualOp:$ge_op $input, (Arith_ConstantOp ConstantValue<"0.0f">))), (LQ_QuantizeOp $input), - [(HasOneUse $ge_op)], - (addBenefit 150)>; + [(HasOneUse $ge_op)], [], (addBenefit 100)>; def : Pat<(LQ_QuantizeOp (TFL_GreaterEqualOp:$ge_op @@ -27,15 +27,13 @@ def : Pat<(LQ_QuantizeOp $threshold)), (LQ_QuantizeOp (TFL_SubOp $input, $threshold, TFL_AF_None)), - [(HasOneUse $ge_op)], - (addBenefit 100)>; + [(HasOneUse $ge_op)]>; def : Pat<(LQ_QuantizeOp (TFL_LessEqualOp:$ge_op $lhs, $rhs)), (LQ_QuantizeOp (TFL_GreaterEqualOp $rhs, $lhs)), - [(HasOneUse $ge_op)], - (addBenefit 100)>; + [(HasOneUse $ge_op)]>; // TODO: Check shapes before fusing multiclass FuseAddOrSubWithBConv2D { @@ -70,7 +68,7 @@ multiclass FuseAddOrSubWithBConv2D { $padding, $stride_height, $stride_width), - [(HasOneUse $output)], (addBenefit 100)>; + [(HasOneUse $output)]>; } foreach binaryOp = [TFL_AddOp, TFL_SubOp] in defm : FuseAddOrSubWithBConv2D; @@ -109,7 +107,7 @@ multiclass FuseMulOrDivWithBConv2D { $padding, $stride_height, $stride_width), - [(HasOneUse $conv_output)], (addBenefit 100)>; + [(HasOneUse $conv_output)]>; } foreach binaryOp = [TFL_DivOp, 
TFL_MulOp] in defm : FuseMulOrDivWithBConv2D; @@ -146,7 +144,7 @@ multiclass FuseActFnIntoConvOpPat { $padding, $stride_height, $stride_width), - [(HasOneUse $conv_output)], (addBenefit 100)>; + [(HasOneUse $conv_output)]>; def : Pat<(ActFnOp (LQ_Bconv2dOp:$conv_output $input, @@ -176,7 +174,7 @@ multiclass FuseActFnIntoConvOpPat { $padding, $stride_height, $stride_width), - [(HasOneUse $conv_output)], (addBenefit 100)>; + [(HasOneUse $conv_output)]>; } foreach actFnPair = [[TFL_ReluOp, TFL_AF_Relu], [TFL_Relu1Op, TFL_AF_Relu1], diff --git a/larq_compute_engine/mlir/transforms/prepare_patterns_common.td b/larq_compute_engine/mlir/transforms/prepare_patterns_common.td index 430c0e49..3eed4cf6 100644 --- a/larq_compute_engine/mlir/transforms/prepare_patterns_common.td +++ b/larq_compute_engine/mlir/transforms/prepare_patterns_common.td @@ -37,7 +37,7 @@ multiclass QuantDequantPatterns { $select_op, $select_op, /*use 32bit*/ConstBoolAttrFalse)))), - [], (addBenefit 100)>; + []>; def : Pat<(SelectOp:$select_op $cond, (Arith_ConstantOp ConstantValue<"-1.0f">), @@ -51,7 +51,7 @@ multiclass QuantDequantPatterns { $select_op, $select_op, /*use 32bit*/ConstBoolAttrFalse)))), - [], (addBenefit 100)>; + []>; } foreach SelectOp = [TF_SelectOp, TF_SelectV2Op] in defm : QuantDequantPatterns; @@ -59,9 +59,9 @@ foreach SelectOp = [TF_SelectOp, TF_SelectV2Op] in // A fallback for the old version of `ste_sign` that uses a specific `tf.sign` // based implementation of `larq.math.sign`. def : Pat<(TF_SignOp (TF_AddV2Op (TF_SignOp $arg), $c)), - (LQ_DequantizeOp (LQ_QuantizeOp $arg)), [], (addBenefit 100)>; + (LQ_DequantizeOp (LQ_QuantizeOp $arg)), []>; def : Pat<(TF_SignOp (TF_AddV2Op $c, (TF_SignOp $arg))), - (LQ_DequantizeOp (LQ_QuantizeOp $arg)), [], (addBenefit 100)>; + (LQ_DequantizeOp (LQ_QuantizeOp $arg)), []>; // Copied from legalize_patterns.td class I32VectorElementsAttr : ElementsAttrBase< @@ -123,8 +123,7 @@ class PrepareBConvPadValue0Pat : ExtractI32At<1>:$strides, ExtractI32At<2>:$strides), [(BinaryFilter $filter), - (ValidFilterShape $dequantized_input, $filter_op)], - (addBenefit 90)>; + (ValidFilterShape $dequantized_input, $filter_op)]>; def : PrepareBConvPadValue0Pat; def ConstFloatValueIsOne : Constraint< @@ -166,5 +165,4 @@ def : Pat<(TF_Conv2DOp:$output [(BinaryFilter $filter), (ConstFloatValueIsOne $pad_values), (SamePadding $paddings, $input, $output, $strides), - (ValidFilterShape $dequantized_input, $filter_op)], - (addBenefit 90)>; + (ValidFilterShape $dequantized_input, $filter_op)]>; diff --git a/larq_compute_engine/requirements.in b/larq_compute_engine/requirements.in new file mode 100644 index 00000000..5ab513f8 --- /dev/null +++ b/larq_compute_engine/requirements.in @@ -0,0 +1,7 @@ +tensorflow==2.16.1 +tf-keras==2.16.0 +tensorflow-datasets +larq +tqdm +pytest +googleapis-common-protos<2,>=1.52.0 # dependency of tensorflow-datasets, somehow not picked up by pip-compile diff --git a/larq_compute_engine/requirements.txt b/larq_compute_engine/requirements.txt new file mode 100644 index 00000000..2da7226b --- /dev/null +++ b/larq_compute_engine/requirements.txt @@ -0,0 +1,191 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile --allow-unsafe --no-emit-index-url --strip-extras larq_compute_engine/requirements.in +# +absl-py==2.1.0 + # via + # array-record + # etils + # keras + # tensorboard + # tensorflow + # tensorflow-datasets + # tensorflow-metadata +array-record==0.5.1 + # via tensorflow-datasets +astunparse==1.6.3 + 
# via tensorflow +certifi==2024.6.2 + # via requests +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via tensorflow-datasets +dm-tree==0.1.8 + # via tensorflow-datasets +docstring-parser==0.16 + # via simple-parsing +etils==1.7.0 + # via + # array-record + # tensorflow-datasets +exceptiongroup==1.2.1 + # via pytest +flatbuffers==24.3.25 + # via tensorflow +fsspec==2024.6.0 + # via etils +gast==0.5.4 + # via tensorflow +google-pasta==0.2.0 + # via tensorflow +googleapis-common-protos==1.63.1 + # via -r larq_compute_engine/requirements.in +grpcio==1.64.1 + # via + # tensorboard + # tensorflow +h5py==3.11.0 + # via + # keras + # tensorflow +idna==3.7 + # via requests +immutabledict==4.2.0 + # via tensorflow-datasets +importlib-resources==6.4.0 + # via etils +iniconfig==2.0.0 + # via pytest +keras==3.3.3 + # via tensorflow +larq==0.13.3 + # via -r larq_compute_engine/requirements.in +libclang==18.1.1 + # via tensorflow +markdown==3.6 + # via tensorboard +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via werkzeug +mdurl==0.1.2 + # via markdown-it-py +ml-dtypes==0.3.2 + # via + # keras + # tensorflow +namex==0.0.8 + # via keras +numpy==1.26.4 + # via + # etils + # h5py + # keras + # larq + # ml-dtypes + # opt-einsum + # pyarrow + # tensorboard + # tensorflow + # tensorflow-datasets +opt-einsum==3.3.0 + # via tensorflow +optree==0.11.0 + # via keras +packaging==24.1 + # via + # larq + # pytest + # tensorflow +pluggy==1.5.0 + # via pytest +promise==2.3 + # via tensorflow-datasets +protobuf==3.20.3 + # via + # googleapis-common-protos + # tensorboard + # tensorflow + # tensorflow-datasets + # tensorflow-metadata +psutil==5.9.8 + # via tensorflow-datasets +pyarrow==16.1.0 + # via tensorflow-datasets +pygments==2.18.0 + # via rich +pytest==8.2.2 + # via -r larq_compute_engine/requirements.in +requests==2.32.3 + # via + # tensorflow + # tensorflow-datasets +rich==13.7.1 + # via keras +simple-parsing==0.1.5 + # via tensorflow-datasets +six==1.16.0 + # via + # astunparse + # google-pasta + # promise + # tensorboard + # tensorflow +tensorboard==2.16.2 + # via tensorflow +tensorboard-data-server==0.7.2 + # via tensorboard +tensorflow==2.16.1 + # via + # -r larq_compute_engine/requirements.in + # tf-keras +tensorflow-datasets==4.9.6 + # via -r larq_compute_engine/requirements.in +tensorflow-io-gcs-filesystem==0.37.0 + # via tensorflow +tensorflow-metadata==1.15.0 + # via tensorflow-datasets +termcolor==2.4.0 + # via + # tensorflow + # tensorflow-datasets +terminaltables==3.1.10 + # via larq +tf-keras==2.16.0 + # via -r larq_compute_engine/requirements.in +toml==0.10.2 + # via tensorflow-datasets +tomli==2.0.1 + # via pytest +tqdm==4.66.4 + # via + # -r larq_compute_engine/requirements.in + # etils + # tensorflow-datasets +typing-extensions==4.12.2 + # via + # etils + # optree + # simple-parsing + # tensorflow +urllib3==2.2.1 + # via requests +werkzeug==3.0.3 + # via tensorboard +wheel==0.43.0 + # via astunparse +wrapt==1.16.0 + # via + # tensorflow + # tensorflow-datasets +zipp==3.19.2 + # via etils + +# The following packages are considered to be unsafe in a requirements file: +setuptools==70.0.0 + # via + # tensorboard + # tensorflow diff --git a/larq_compute_engine/tests/BUILD b/larq_compute_engine/tests/BUILD index 0356f630..0a77e4a7 100644 --- a/larq_compute_engine/tests/BUILD +++ b/larq_compute_engine/tests/BUILD @@ -1,3 +1,7 @@ +load("@pypi//:requirements.bzl", tf_requirement = "requirement") +load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement") 
+load("//larq_compute_engine/tests:qemu_test.bzl", "lce_qemu_test_suite") + package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -8,15 +12,23 @@ exports_files([ "test_aarch64_binary.sh", ]) -load("//larq_compute_engine/tests:qemu_test.bzl", "lce_qemu_test_suite") - py_test( name = "end2end_test", size = "large", - srcs = ["end2end_test.py"], + srcs = [ + "end2end_test.py", + "preprocess.py", + ], deps = [ "//larq_compute_engine/mlir:converter", "//larq_compute_engine/tflite/python:interpreter", + tf_requirement("numpy"), + lce_requirement("larq"), + lce_requirement("pytest"), + lce_requirement("tensorflow"), + lce_requirement("tensorflow_datasets"), + lce_requirement("tf-keras"), + lce_requirement("importlib_resources"), ], ) @@ -25,6 +37,10 @@ py_test( srcs = ["strip_lcedequantize_test.py"], deps = [ "//larq_compute_engine/mlir:converter", + lce_requirement("larq"), + lce_requirement("pytest"), + lce_requirement("tensorflow"), + lce_requirement("tf-keras"), ], ) diff --git a/larq_compute_engine/tests/end2end_test.py b/larq_compute_engine/tests/end2end_test.py index cddad71e..f2ca87fe 100644 --- a/larq_compute_engine/tests/end2end_test.py +++ b/larq_compute_engine/tests/end2end_test.py @@ -15,7 +15,7 @@ ) from larq_compute_engine.tflite.python.interpreter import Interpreter -from preprocess import preprocess_image_tensor, IMAGE_SIZE +from larq_compute_engine.tests.preprocess import preprocess_image_tensor, IMAGE_SIZE def convert_keras_model_as_saved_model(model, **kwargs): diff --git a/larq_compute_engine/tflite/java/build_lce_aar.sh b/larq_compute_engine/tflite/java/build_lce_aar.sh index 2a4b9b47..68e83be9 100755 --- a/larq_compute_engine/tflite/java/build_lce_aar.sh +++ b/larq_compute_engine/tflite/java/build_lce_aar.sh @@ -16,11 +16,8 @@ VERSION=$(git describe --tags) BUILDER="${BUILDER:-bazel}" BASEDIR=larq_compute_engine/tflite -CROSSTOOL="//external:android/crosstool" -HOST_CROSSTOOL="@bazel_tools//tools/cpp:toolchain" -BUILD_OPTS="-c opt --fat_apk_cpu=x86,x86_64,arm64-v8a" -CROSSTOOL_OPTS="--crosstool_top=$CROSSTOOL --host_crosstool_top=$HOST_CROSSTOOL" +BUILD_OPTS="-c opt --config=android_arm64 --fat_apk_cpu=x86,x86_64,arm64-v8a" test -d $BASEDIR || (echo "Aborting: not at top-level build directory"; exit 1) diff --git a/larq_compute_engine/tflite/python/BUILD b/larq_compute_engine/tflite/python/BUILD index 3a25e789..2f57661e 100644 --- a/larq_compute_engine/tflite/python/BUILD +++ b/larq_compute_engine/tflite/python/BUILD @@ -1,5 +1,7 @@ load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension") load("@org_tensorflow//tensorflow/lite:build_def.bzl", "tflite_linkopts") +load("@pypi//:requirements.bzl", tf_requirement = "requirement") +load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement") package( default_visibility = ["//visibility:public"], @@ -35,6 +37,10 @@ py_library( "__init__.py", "interpreter_base.py", ], + deps = [ + tf_requirement("numpy"), + lce_requirement("tqdm"), + ], ) py_library( diff --git a/larq_compute_engine/tflite/tests/BUILD b/larq_compute_engine/tflite/tests/BUILD index aea9b72c..0f013592 100644 --- a/larq_compute_engine/tflite/tests/BUILD +++ b/larq_compute_engine/tflite/tests/BUILD @@ -1,3 +1,6 @@ +load("@pypi//:requirements.bzl", tf_requirement = "requirement") +load("@pypi_lce//:requirements.bzl", lce_requirement = "requirement") + package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -77,9 +80,13 @@ cc_test( py_test( name = "interpreter_test", + 
size = "small", srcs = ["interpreter_test.py"], deps = [ "//larq_compute_engine/tflite/python:interpreter", + tf_requirement("numpy"), + lce_requirement("pytest"), + lce_requirement("tensorflow"), ], ) diff --git a/third_party/install_android.sh b/third_party/install_android.sh index 0245e09c..3ba0a37e 100755 --- a/third_party/install_android.sh +++ b/third_party/install_android.sh @@ -4,12 +4,16 @@ set -e # **NOTE**: This requires Java 8 and won't work on never versions. See: # https://stackoverflow.com/questions/46402772/failed-to-install-android-sdk-java-lang-noclassdeffounderror-javax-xml-bind-a +# Taken from tensorflow/lite/tools/tflite-android.Dockerfile + # default LCE Android Env. variables -export ANDROID_SDK_URL="https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip" +export ANDROID_SDK_URL="https://dl.google.com/android/repository/commandlinetools-linux-6858069_latest.zip" export ANDROID_HOME="/tmp/lce_android" -export ANDROID_VERSION=29 -export ANDROID_BUILD_TOOLS_VERSION=30.0.2 -export ANDROID_NDK_VERSION=21.4.7075529 +export ANDROID_API_LEVEL=30 +export ANDROID_BUILD_TOOLS_VERSION=31.0.0 +export ANDROID_NDK_VERSION=25.2.9519653 +export ANDROID_NDK_API_LEVEL=30 + # download android SDK mkdir -p $ANDROID_HOME; cd $ANDROID_HOME; @@ -19,24 +23,27 @@ curl -o lce_android_sdk.zip $ANDROID_SDK_URL; echo -e "DONE.\n\n" echo -e "Unpacking Android SDK ... " -unzip lce_android_sdk.zip; +unzip lce_android_sdk.zip -d /tmp +mkdir -p ${ANDROID_HOME}/cmdline-tools +mv /tmp/cmdline-tools ${ANDROID_HOME}/cmdline-tools/latest echo -e "DONE.\n\n" rm lce_android_sdk.zip; # install android platform and build tools echo -e "Updating SDK manager ... " -yes | $ANDROID_HOME/tools/bin/sdkmanager --licenses -$ANDROID_HOME/tools/bin/sdkmanager --update +yes | $ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager --licenses +$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager --update echo -e "DONE.\n\n" echo -e "Installing Android SDK Platform and Build Tools ... " -$ANDROID_HOME/tools/bin/sdkmanager "build-tools;${ANDROID_BUILD_TOOLS_VERSION}" \ - "platforms;android-${ANDROID_VERSION}" \ +$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager \ + "build-tools;${ANDROID_BUILD_TOOLS_VERSION}" \ + "platforms;android-${ANDROID_API_LEVEL}" \ "platform-tools" echo -e "DONE.\n\n" echo -e "Installing Android NDK ... 
" -$ANDROID_HOME/tools/bin/sdkmanager \ +$ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager \ "ndk;${ANDROID_NDK_VERSION}" echo -e "DONE.\n\n" diff --git a/third_party/tensorflow b/third_party/tensorflow index 1cb1a030..5bc9d266 160000 --- a/third_party/tensorflow +++ b/third_party/tensorflow @@ -1 +1 @@ -Subproject commit 1cb1a030a62b169d90d34c747ab9b09f332bf905 +Subproject commit 5bc9d26649cca274750ad3625bd93422617eed4b diff --git a/third_party/tensorflow_patches/disable_forced_mkl.patch b/third_party/tensorflow_patches/disable_forced_mkl.patch index 8791d659..ea60601f 100644 --- a/third_party/tensorflow_patches/disable_forced_mkl.patch +++ b/third_party/tensorflow_patches/disable_forced_mkl.patch @@ -1,29 +1,27 @@ -diff --git a/tensorflow/tsl/mkl/build_defs.bzl b/tensorflow/tsl/mkl/build_defs.bzl -index eaa0b2dbde7..9d709f8abf5 100644 ---- a/tensorflow/tsl/mkl/build_defs.bzl -+++ b/tensorflow/tsl/mkl/build_defs.bzl +diff --git a/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl b/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl +index 90030a39744..489ebaa5aa7 100644 +--- a/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl ++++ b/third_party/xla/third_party/tsl/tsl/mkl/build_defs.bzl @@ -33,8 +33,9 @@ def if_mkl(if_true, if_false = []): """ return select({ - "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_aarch64": if_true, -- "@org_tensorflow//tensorflow/tsl:linux_x86_64": if_true, -- "@org_tensorflow//tensorflow/tsl:windows": if_true, -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_lnx_x64": if_true, -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_lnx_openmp": if_true, -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_windows_openmp": if_true, + "@local_tsl//tsl/mkl:build_with_mkl_aarch64": if_true, +- "@local_tsl//tsl:linux_x86_64": if_true, +- "@local_tsl//tsl:windows": if_true, ++ "@local_tsl//tsl/mkl:build_with_mkl_lnx_x64": if_true, ++ "@local_tsl//tsl/mkl:build_with_mkl_lnx_openmp": if_true, ++ "@local_tsl//tsl/mkl:build_with_mkl_windows_openmp": if_true, "//conditions:default": if_false, }) - -@@ -102,9 +103,9 @@ def mkl_deps(): + +@@ -102,8 +103,8 @@ def mkl_deps(): """ return select({ - "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_aarch64": ["@mkl_dnn_acl_compatible//:mkl_dnn_acl"], -- "@org_tensorflow//tensorflow/tsl:linux_x86_64_with_onednn_v2": ["@mkl_dnn_v1//:mkl_dnn"], -- "@org_tensorflow//tensorflow/tsl:linux_x86_64_with_onednn_v3": ["@onednn_v3//:mkl_dnn"], -- "@org_tensorflow//tensorflow/tsl:windows": ["@mkl_dnn_v1//:mkl_dnn"], -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_lnx_x64": ["@mkl_dnn_v1//:mkl_dnn"], -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_lnx_openmp": ["@mkl_dnn_v1//:mkl_dnn"], -+ "@org_tensorflow//tensorflow/tsl/mkl:build_with_mkl_windows_openmp": ["@mkl_dnn_v1//:mkl_dnn"], + "@local_tsl//tsl/mkl:build_with_mkl_aarch64": ["@mkl_dnn_acl_compatible//:mkl_dnn_acl"], +- "@local_tsl//tsl:linux_x86_64": ["@onednn//:mkl_dnn"], +- "@local_tsl//tsl:windows": ["@onednn//:mkl_dnn"], ++ "@local_tsl//tsl/mkl:build_with_mkl_lnx_x64": ["@onednn//:mkl_dnn"], ++ "@local_tsl//tsl/mkl:build_with_mkl_windows_openmp": ["@onednn//:mkl_dnn"], "//conditions:default": [], }) - +