horovod · maxhgerlach · May 26, 2022 · Mar 18, 2022 · Mar 19, 2022 · Mar 21, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -350,6 +350,10 @@ if(HAVE_CUDA OR HAVE_SUB_PROJECT_CUDA)
     add_subdirectory(horovod/common/ops/cuda)
 endif()
 
+if(HAVE_ROCM)
+    add_subdirectory(horovod/common/ops/rocm)
+endif()
+
 # if we need compatible c++ abi
 # Duplicate gloo folder and add it as a new sub-project
 if(HAVE_GLOO AND ((DEFINED Tensorflow_CXX11 AND NOT Tensorflow_CXX11) OR (DEFINED Pytorch_CXX11 AND NOT Pytorch_CXX11) OR (DEFINED Mxnet_CXX11 AND NOT Mxnet_CXX11)))

diff --git a/horovod/common/common.h b/horovod/common/common.h
@@ -45,7 +45,6 @@ using gpuStream_t = cudaStream_t;
       throw std::logic_error(std::string("GPU Error:") + cudaGetErrorString(cuda_result));  \
     }                                                                                       \
   } while (0)
-#endif
 #elif HAVE_ROCM
 #include <hip/hip_runtime_api.h>
 using gpuError_t = hipError_t;
@@ -64,6 +63,7 @@ using gpuStream_t = hipStream_t;
     }                                                                                     \
   } while (0)
 #endif
+#endif
 
 
 namespace horovod {

diff --git a/horovod/common/ops/gpu_operations.h b/horovod/common/ops/gpu_operations.h
@@ -176,6 +176,22 @@ class GPUAllreduce : public AllreduceOp {
                                   double scale_factor,
                                   std::vector<TensorTableEntry>& entries);
 #endif
+#if HAVE_ROCM
+  void MemcpyInFusionBuffer(const std::vector<TensorTableEntry>& entries,
+                            const void*& fused_input_data, void*& buffer_data,
+                            size_t& buffer_len) override;
+
+  void MemcpyOutFusionBuffer(const void* buffer_data,
+                             std::vector<TensorTableEntry>& entries) override;
+
+  void ScaleMemcpyInFusionBuffer(const std::vector<TensorTableEntry>& entries,
+                                 const void*& fused_input_data,
+                                 void*& buffer_data, size_t& buffer_len,
+                                 double scale_factor);
+  void ScaleMemcpyOutFusionBuffer(void* buffer_data, size_t buffer_len,
+                                  double scale_factor,
+                                  std::vector<TensorTableEntry>& entries);
+#endif
 
   void MemcpyEntryInFusionBuffer(const std::vector<TensorTableEntry>& entries,
                                  const TensorTableEntry& e,

diff --git a/horovod/tensorflow/mpi_ops.cc b/horovod/tensorflow/mpi_ops.cc
@@ -27,6 +27,10 @@
 #define EIGEN_USE_GPU
 #endif  // HAVE_CUDA || HAVE_ROCM
 
+#if HAVE_ROCM
+#define EIGEN_USE_HIP
+#endif
+
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/shape_inference.h"