XLA HorovodAllreduce for tf.function(jit_compile=True) #3053

Merged
28 commits merged on Aug 25, 2021
Changes from all commits (28 commits)
1620cc5
Add implementation of XLA HorovodAllreduce.
trentlo Jul 12, 2021
463d17f
Fix a build break due to interface change in TF2.6.
trentlo Jul 13, 2021
aab248d
Implement customized XLA Op registrar as we want to make it an opt-in.
trentlo Jul 14, 2021
2f41921
Ran clang-format.
trentlo Jul 14, 2021
606a6fb
Polish codes.
trentlo Jul 15, 2021
e348e62
Improve cmake for XLA.
trentlo Jul 15, 2021
1dfa8df
Polish comments.
trentlo Jul 15, 2021
043a092
Minor polishing.
trentlo Jul 15, 2021
e5f9680
Don't set alias for the `start` custom-call.
trentlo Jul 17, 2021
7fbcc19
Add a unittest for XLA.
trentlo Jul 17, 2021
5f97707
Add process_id in XLA Ops.
trentlo Jul 20, 2021
22f4e0e
Add test_xla.py
trentlo Jul 20, 2021
129374b
Embed HOROVOD_ENABLE_XLA_OPS.
trentlo Jul 20, 2021
cb320cb
Ran clang-format.
trentlo Jul 20, 2021
eb52a4b
autopep8 for python formatting.
trentlo Jul 20, 2021
6b7fddc
Add documentation for Horovod XLA Ops.
trentlo Jul 20, 2021
1a3fee7
Format docs/xla.rst.
trentlo Jul 20, 2021
20dac65
Automatically set HOROVOD_ENABLE_ASYNC_COMPLETION for xla ops.
trentlo Jul 20, 2021
f9224f6
Add a link to XLA in summary.rst.
trentlo Jul 22, 2021
fdd2658
Make title line long enough in xla.rst.
trentlo Jul 22, 2021
93f5a1c
Add xla into toctree.
trentlo Jul 22, 2021
1b47a4b
Compile XLA Horovod ops only for TF2.5+
trentlo Jul 23, 2021
a4af503
Setting the default Cycle Time to 0 because the XLA runtime is sensitive
trentlo Jul 23, 2021
e409ded
Skip XLA tests if TF is older than TF2.5.
trentlo Jul 23, 2021
2db7ffe
Don't use tf.function() as decorator.
trentlo Jul 23, 2021
911408f
Remove a redundant test.
trentlo Jul 23, 2021
026d784
xla::CustomCallSchedule requires TF2.6.
trentlo Aug 11, 2021
1ae617d
Do not link _pywrap_tensorflow_internal.so if XLA is not enabled.
trentlo Aug 12, 2021
1 change: 1 addition & 0 deletions README.rst
@@ -171,6 +171,7 @@ Supported frameworks
See these pages for Horovod examples and best practices:

- `Horovod with TensorFlow <docs/tensorflow.rst>`_
- `Horovod with XLA in TensorFlow <docs/xla.rst>`_
- `Horovod with Keras <docs/keras.rst>`_
- `Horovod with PyTorch <docs/pytorch.rst>`_
- `Horovod with MXNet <docs/mxnet.rst>`_
8 changes: 7 additions & 1 deletion cmake/Modules/FindTensorflow.cmake
@@ -19,7 +19,13 @@ if (LEN EQUAL "4")
list(GET Tensorflow_OUTPUT 0 Tensorflow_VERSION)
list(GET Tensorflow_OUTPUT 1 Tensorflow_INCLUDE_DIRS)
list(GET Tensorflow_OUTPUT 2 Tensorflow_LIBRARIES)
string(REPLACE " " ";" Tensorflow_LIBRARIES "${Tensorflow_LIBRARIES}")
string(REPLACE " " ";" Tensorflow_LIBRARIES_LIST "${Tensorflow_LIBRARIES}")
list(GET Tensorflow_LIBRARIES_LIST 0 Tensorflow_LIB_PATH)
if (Tensorflow_VERSION VERSION_GREATER_EQUAL "2.6")
# XLA implementations are in _pywrap_tensorflow_internal.so
set(Tensorflow_LIBRARIES "${Tensorflow_LIBRARIES} ${Tensorflow_LIB_PATH}/python/ -l:_pywrap_tensorflow_internal.so")
Collaborator
@trentlo @romerojosh
This _pywrap_tensorflow_internal.so is not available on OSX (see #3132).

endif()
message("Tensorflow_LIBRARIES := ${Tensorflow_LIBRARIES}")
list(GET Tensorflow_OUTPUT 3 Tensorflow_COMPILE_FLAGS)
if("${Tensorflow_COMPILE_FLAGS}" MATCHES "-D_GLIBCXX_USE_CXX11_ABI=1")
set(Tensorflow_CXX11 TRUE)
2 changes: 2 additions & 0 deletions docs/index.rst
@@ -101,6 +101,8 @@ Guides

tensorflow

xla

keras

pytorch
1 change: 1 addition & 0 deletions docs/summary.rst
@@ -163,6 +163,7 @@ Supported frameworks
See these pages for Horovod examples and best practices:

- `Horovod with TensorFlow <tensorflow.rst>`_
- `Horovod with XLA in TensorFlow <xla.rst>`_
- `Horovod with Keras <keras.rst>`_
- `Horovod with PyTorch <pytorch.rst>`_
- `Horovod with MXNet <mxnet.rst>`_
37 changes: 37 additions & 0 deletions docs/xla.rst
@@ -0,0 +1,37 @@
Horovod with XLA in TensorFlow
===============================

Basic usage
-----------

XLA Horovod ops can be enabled by setting ``HOROVOD_ENABLE_XLA_OPS=1``, which controls whether the ops are registered with TensorFlow/XLA.

There are two main ways to enable XLA, and they interact with Horovod differently:

For **Explicit compilation with tf.function(jit_compile=True)**:

.. code-block:: python

    import os
    os.environ["HOROVOD_ENABLE_XLA_OPS"] = "1"  # set before hvd.init()

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()

    @tf.function(jit_compile=True)
    def compiled_hvd_allreduce(dtype, dim):
        tensor = tf.random.uniform([17] * dim, -100, 100, dtype=dtype)
        summed = hvd.allreduce(tensor, average=False)
        return summed

In this mode, all ops in ``compiled_hvd_allreduce`` are lowered into XLA, since explicit compilation requires the whole function to compile. If the XLA Horovod ops are not enabled, XLA reports compilation errors.
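For example, calling ``compiled_hvd_allreduce(tf.float32, 3)`` then runs the whole function as a single XLA executable containing the Horovod allreduce as a custom call. One way to confirm the lowering is TensorFlow's IR-introspection hook (a sketch; ``experimental_get_compiler_ir`` is a stock TF 2.x API, not part of this PR):

.. code-block:: python

    # Inspect the HLO that XLA generates for these argument values; the dump
    # should include a custom call for the Horovod allreduce.
    hlo_text = compiled_hvd_allreduce.experimental_get_compiler_ir(
        tf.float32, 3)(stage="hlo")
    print(hlo_text)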


For **Auto-clustering**:

Auto-clustering is a convenient way to use XLA: simply set ``TF_XLA_FLAGS=--tf_xla_auto_jit=2`` and the XLA JIT automatically selects ops in the TensorFlow graph to lower into XLA. In this mode, enabling the XLA Horovod ops is optional, because auto-clustering works even if the Horovod ops are left to run on TensorFlow (devices) while only parts of the graph are lowered onto XLA (devices).
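A minimal sketch of this mode, assuming the flag is set before TensorFlow initializes (exporting it in the shell works equally well):

.. code-block:: python

    import os
    # Must be visible before TensorFlow initializes its XLA JIT.
    os.environ["TF_XLA_FLAGS"] = "--tf_xla_auto_jit=2"

    import tensorflow as tf
    import horovod.tensorflow as hvd

    hvd.init()

    @tf.function  # no jit_compile: XLA auto-clusters eligible ops on its own
    def train_step(tensor):
        return hvd.allreduce(tensor, average=False)

    print(train_step(tf.ones([17, 17])))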

List of supported XLA Horovod ops
---------------------------------

The supported op list is:

``HorovodAllreduce``

5 changes: 4 additions & 1 deletion horovod/common/common.h
@@ -137,6 +137,7 @@ namespace common {
#define HOROVOD_DISABLE_NVTX_RANGES "HOROVOD_DISABLE_NVTX_RANGES"
#define HOROVOD_ENABLE_ASYNC_COMPLETION "HOROVOD_ENABLE_ASYNC_COMPLETION"
#define HOROVOD_DYNAMIC_PROCESS_SETS "HOROVOD_DYNAMIC_PROCESS_SETS"
#define HOROVOD_ENABLE_XLA_OPS "HOROVOD_ENABLE_XLA_OPS"

// String constant for gloo interface.
#define GLOO_DEFAULT_IFACE ""
@@ -153,7 +154,7 @@ namespace common {
#define JOIN_TENSOR_NAME "join.noname"

// List of supported frameworks.
enum Framework { TENSORFLOW, PYTORCH, MXNET };
enum Framework { TENSORFLOW, PYTORCH, MXNET, XLA };

Reviewer
It seems that XLA is implemented only for TensorFlow, so why is XLA declared here as a framework? The same question applies to the C++ namespace design. Thanks in advance for your explanation.

Contributor Author
Thanks for the question. My rationale is as follows:

  1. XLA is independent of TensorFlow, although they live in the same repo for now. There is talk of separating XLA from TensorFlow, although I don't know when that will happen. In addition, XLA (as a DL compiler) is used by frontends other than TensorFlow, such as JAX and PyTorch (not in its main repo, I guess), etc.
  2. I can certainly understand that it is inaccurate to call XLA a framework. However, since XLA is independent of TensorFlow, it does not use TensorFlow constructs and cannot use TFOpContext. It therefore needs a new XLAOpContext with a new "framework" name. I don't see alternatives.
  3. If the name is really confusing, I would suggest renaming "framework" to "backend".

Reviewer
Thanks for your reply; you are right that XLA is a layer independent of TF or even Torch. In my opinion, XLA is a layer TF/Torch can build on, which is why I raised the question. I'm not sure whether I'm right, given the following considerations:

  1. XLA cannot be used or run without a framework (TF/Torch/MXNet).
  2. XLA runs as TF+XLA (the version you are working on), Torch+XLA, or more.

If so, maybe we should keep the framework declaration as it is and consider implementing XLA without changing it, rather than adding a backend concept?

Overall, your work is wonderful; it's OK to keep this version if we cannot figure out a better one.

Contributor Author

You are right that XLA itself is not complete as a framework.

How about renaming XLA to TF_XLA?
That is, enum Framework { TENSORFLOW, PYTORCH, MXNET, TF_XLA };

This is conceptually clean and simple. The only theoretical drawback I can think of is that you might later need PYTORCH_XLA, etc., from combinations of framework and XLA, but I doubt this will become a problem in practice (mainly because few such combinations actually exist). Even if it does become a problem in the future, we will know better how to deal with it then, given more data points.

I will make the change accordingly if this makes sense to you. Please let me know.

Reviewer
Great, we have clarity on the issues but have not reached a solution yet; maybe we should discuss it with the community.

/cc @maxhgerlach @EnricoMi


enum StatusType { OK, UNKNOWN_ERROR, PRECONDITION_ERROR, ABORTED, INVALID_ARGUMENT, IN_PROGRESS };

@@ -228,6 +229,8 @@ const Status DUPLICATE_NAME_ERROR = Status::InvalidArgument(

class TensorShape {
public:
TensorShape() : shape_() {}
TensorShape(std::vector<int64_t> vec) : shape_(vec) {}
void AddDim(int64_t dim);
void AppendShape(TensorShape& other);

13 changes: 13 additions & 0 deletions horovod/common/operations.cc
@@ -494,6 +494,14 @@ void BackgroundThreadLoop(HorovodGlobalState& state) {

// Override the cycle time.
state.parameter_manager.SetCycleTimeMs(1);
bool enable_xla_ops = false;
common::SetBoolFromEnv(HOROVOD_ENABLE_XLA_OPS, enable_xla_ops, true);
if (enable_xla_ops) {
// Setting the default Cycle Time to 0 because the XLA runtime is sensitive
// to latencies.
state.parameter_manager.SetCycleTimeMs(0);
}

auto horovod_cycle_time = std::getenv(HOROVOD_CYCLE_TIME);
if (horovod_cycle_time != nullptr) {
state.parameter_manager.SetCycleTimeMs(
@@ -563,6 +571,11 @@ void BackgroundThreadLoop(HorovodGlobalState& state) {

// Check if async completion should be enabled
SetBoolFromEnv(HOROVOD_ENABLE_ASYNC_COMPLETION, state.enable_async_completion, true);
if (enable_xla_ops) {
// Enable async completion when XLA ops are enabled. Since the XLA runtime is
// single-threaded, async completion is essential to reduce host overhead.
state.enable_async_completion = true;
}

// Enable auto-tuning.
auto horovod_autotune = std::getenv(HOROVOD_AUTOTUNE);
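Taken together, these two hunks change two defaults whenever the XLA ops are enabled. A minimal sketch of the resulting user-facing configuration, assuming the environment is set before ``hvd.init()`` (the variable names come from this diff):

.. code-block:: python

    import os

    # Registers the XLA Horovod ops; per the first hunk this also defaults
    # the tensor-fusion cycle time to 0 ms, since the XLA runtime is
    # sensitive to latencies.
    os.environ["HOROVOD_ENABLE_XLA_OPS"] = "1"

    # An explicit HOROVOD_CYCLE_TIME still wins, because it is read after
    # the XLA default is applied (value in milliseconds).
    os.environ["HOROVOD_CYCLE_TIME"] = "0.5"

    # Per the second hunk, async completion is forced on when XLA ops are
    # enabled (the XLA runtime is single-threaded), regardless of
    # HOROVOD_ENABLE_ASYNC_COMPLETION.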
1 change: 1 addition & 0 deletions horovod/tensorflow/CMakeLists.txt
@@ -59,6 +59,7 @@ set(Tensorflow_CXX11 ${Tensorflow_CXX11} PARENT_SCOPE)

# TF SOURCES
list(APPEND TF_SOURCES "${PROJECT_SOURCE_DIR}/horovod/tensorflow/mpi_ops.cc")
list(APPEND TF_SOURCES "${PROJECT_SOURCE_DIR}/horovod/tensorflow/xla_mpi_ops.cc")

# Create library
set_output_dir()
43 changes: 43 additions & 0 deletions horovod/tensorflow/custom_call_config.fbs
@@ -0,0 +1,43 @@
// Copyright 2021 The TensorFlow Authors. All Rights Reserved.
// Modifications copyright (C) 2017 Uber Technologies, Inc.
// Modifications copyright Microsoft
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

include "horovod/common/wire/message.fbs";

namespace horovod.xla.wire;

table TensorShape {
dims:[long];
}

table CustomCallConfig {
tensor_name:string;
tensor_type:common.wire.DataType;
input_shapes:[TensorShape];
output_shapes:[TensorShape];

// Prescale and postscale factors
prescale_factor:float;
postscale_factor:float;

// Root rank is necessary for broadcast operation.
root_rank:int;

// Reduce op.
reduce_op:int;

process_set_id:int;
}
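Presumably, this table is what gets serialized into the opaque backend-config string of the XLA custom calls, so that the Horovod runtime can recover the tensor name, type, shapes, and reduction parameters at execution time; the exact encoding is defined by the flatbuffers schema above.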