From 5caa9e83798cb510c9b49acee8a64efdb746207c Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Tue, 19 Nov 2019 11:39:01 -0800 Subject: [PATCH] Fix for https://github.com/tensorflow/tensorflow/issues/34117 Bazel's change to legacy_whole_archive behavior is not the cause for TF's linking issues with protobuf. Protobuf's implementation and runtime are correctly being linked into TF here: https://github.com/tensorflow/tensorflow/blob/da5765ebad2e1d3c25d11ee45aceef0b60da499f/tensorflow/core/platform/default/build_config.bzl#L239 and https://github.com/tensorflow/tensorflow/blob/da5765ebad2e1d3c25d11ee45aceef0b60da499f/third_party/protobuf/protobuf.patch#L18, and I've confirmed that protobuf symbols are still present in libtensorflow_framework.so via nm. After examining the linker flags that bazel passes to gcc, https://gist.github.com/bmzhao/f51bbdef50e9db9b24acd5b5acc95080, I discovered that the order of the linker flags was what was causing the undefined reference. See https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking/ and https://stackoverflow.com/a/12272890. Basically linkers discard the objects they've been asked to link if those objects do not export any symbols that the linker currently has kept track as "undefined". To prove this was the issue, I was able to successfully link after moving the linking shared object flag (-l:libtensorflow_framework.so.2) to the bottom of the flag order, and manually invoking g++. This change uses cc_import to to link against a .so in the "deps" of tf_cc_binary, rather than as the "srcs" of tf_cc_binary. This technique was inspired by the comment here: https://github.com/bazelbuild/bazel/blob/387c610d09b99536f7f5b8ecb883d14ee6063fdd/examples/windows/dll/windows_dll_library.bzl#L47-L48 Successfully built on vanilla Ubuntu 18.04 VM: bmzhao@bmzhao-tf-build-failure-reproing:~/tf-fix/tf$ bazel build -c opt --config=cuda --config=v2 --host_force_python=PY3 //tensorflow/tools/pip_package:build_pip_package Target //tensorflow/tools/pip_package:build_pip_package up-to-date: bazel-bin/tensorflow/tools/pip_package/build_pip_package INFO: Elapsed time: 2067.380s, Critical Path: 828.19s INFO: 12942 processes: 51 remote cache hit, 12891 local. INFO: Build completed successfully, 14877 total actions The root cause might instead be https://github.com/bazelbuild/bazel/issues/7687, which is pending further investigation. PiperOrigin-RevId: 281341817 Change-Id: Ia240eb050d9514ed5ac95b7b5fb7e0e98b7d1e83 --- .bazelrc | 15 ++------------- tensorflow/BUILD | 12 ++++++++++++ tensorflow/tensorflow.bzl | 5 +++++ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/.bazelrc b/.bazelrc index ff85372049d..b8772c76216 100644 --- a/.bazelrc +++ b/.bazelrc @@ -209,19 +209,8 @@ build --announce_rc # Other build flags. build --define=grpc_no_ares=true -# See https://github.com/bazelbuild/bazel/issues/7362 for information on what -# --incompatible_remove_legacy_whole_archive flag does. -# This flag is set to true in Bazel 1.0 and newer versions. We tried to migrate -# Tensorflow to the default, however test coverage wasn't enough to catch the -# errors. -# There is ongoing work on Bazel team's side to provide support for transitive -# shared libraries. As part of migrating to transitive shared libraries, we -# hope to provide a better mechanism for control over symbol exporting, and -# then tackle this issue again. -# -# TODO: Remove this line once TF doesn't depend on Bazel wrapping all library -# archives in -whole_archive -no_whole_archive. -build --noincompatible_remove_legacy_whole_archive +# Prevent regression of https://github.com/bazelbuild/bazel/issues/7362 +build --incompatible_remove_legacy_whole_archive # Modular TF build options build:dynamic_kernels --define=dynamic_loaded_kernels=true diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 31e18e4e8f9..a346a546038 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -587,6 +587,18 @@ tf_cc_shared_object( ] + tf_additional_binary_deps(), ) +# This is intended to be the same as tf_binary_additional_srcs: +# https://github.com/tensorflow/tensorflow/blob/cd67f4f3723f9165aabedd0171aaadc6290636e5/tensorflow/tensorflow.bzl#L396-L425 +# And is usable in the "deps" attribute instead of the "srcs" attribute +# as a workaround for https://github.com/tensorflow/tensorflow/issues/34117 +cc_import( + name = "libtensorflow_framework_import_lib", + shared_library = select({ + "//tensorflow:macos": ":libtensorflow_framework.dylib", + "//conditions:default": ":libtensorflow_framework.so", + }), +) + # ------------------------------------------- # New rules should be added above this target. # ------------------------------------------- diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index c62a56215a5..57ebb1e2164 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -626,6 +626,11 @@ def tf_cc_binary( [ clean_dep("//third_party/mkl:intel_binary_blob"), ], + ) + if_static( + extra_deps = [], + otherwise = [ + clean_dep("//tensorflow:libtensorflow_framework_import_lib"), + ], ), data = depset(data + added_data_deps), linkopts = linkopts + _rpath_linkopts(name_os),