Initialize correctly IBM Spectrum MPI provided with IBM PowerAI DDL (#1093)

Signed-off-by: Nicolas V Castet <nvcastet@us.ibm.com>
horovod · May 21, 2019 · 26439a1 · 26439a1
1 parent e7ec0a5
commit 26439a1
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 16 deletions.
diff --git a/horovod/common/operations.cc b/horovod/common/operations.cc
@@ -883,7 +883,13 @@ void BackgroundThreadLoop(HorovodGlobalState& state, MPIContext& ctx) {
                       "likely cause a segmentation fault.";
     }
   } else {
+#if HAVE_DDL
+    // DDL comes with IBM Spectrum MPI
+    // and needs to initialize MPI with the proper license.
+    DDLAllreduce::DDLInit(&ddl_context, &cuda_context);
+#else
     MPI_Init_thread(NULL, NULL, required, &provided);
+#endif
     state.should_finalize = true;
   }
 

diff --git a/horovod/common/ops/ddl_operations.cc b/horovod/common/ops/ddl_operations.cc
@@ -42,20 +42,7 @@ Status DDLAllreduce::Execute(std::vector<TensorTableEntry>& entries, const Respo
   InitCUDAQueue(entries, response);
 
   auto& timeline = global_state_->timeline;
-  if (!ddl_context_->ddl_initialized) {
-    // Initialize DDL
-    auto ddl_options = std::getenv("DDL_OPTIONS");
-    if (ddl_options == nullptr) {
-      throw std::logic_error("DDL_OPTIONS env variable needs to be set to use DDL.");
-    }
-
-    auto ddl_result = ddl_init(ddl_options);
-    if (ddl_result != DDL_SUCCESS) {
-      throw std::logic_error("ddl_init failed.");
-    }
-    ddl_context_->ddl_initialized = true;
-    ddl_context_->ddl_local_device_id = first_entry.device;
-  } else if (ddl_context_->ddl_local_device_id != first_entry.device) {
+  if (ddl_context_->ddl_local_device_id != first_entry.device) {
     throw std::logic_error("DDL does not support more than one GPU device per process.");
   }
 
@@ -113,5 +100,17 @@ Status DDLAllreduce::Execute(std::vector<TensorTableEntry>& entries, const Respo
   return FinalizeCUDAQueue(entries);
 }
 
+// Starts up DDL and records the current CUDA device in the DDL context.
+//
+// Reads the DDL_OPTIONS environment variable and passes its value to
+// ddl_init(). Afterwards the device reported by cudaGetDevice() is written to
+// ddl_context->ddl_local_device_id, with the call's status routed through
+// cuda_context->ErrorCheck().
+//
+// Throws std::logic_error when DDL_OPTIONS is not set, or when ddl_init()
+// returns anything other than DDL_SUCCESS.
+void DDLAllreduce::DDLInit(DDLContext* ddl_context, CUDAContext* cuda_context) {
+  char* const options_from_env = std::getenv("DDL_OPTIONS");
+  if (!options_from_env) {
+    throw std::logic_error("DDL_OPTIONS env variable needs to be set to use DDL.");
+  }
+
+  if (ddl_init(options_from_env) != DDL_SUCCESS) {
+    throw std::logic_error("ddl_init failed.");
+  }
+
+  // Store the device that cudaGetDevice() reports as current.
+  auto* device_id_slot = &ddl_context->ddl_local_device_id;
+  cuda_context->ErrorCheck("cudaGetDevice", cudaGetDevice(device_id_slot));
+}
+
 } // namespace common
 } // namespace horovod
diff --git a/horovod/common/ops/ddl_operations.h b/horovod/common/ops/ddl_operations.h
@@ -25,8 +25,6 @@ namespace horovod {
 namespace common {
 
 struct DDLContext {
-  // Will be set to true after initialization when ddl is used
-  bool ddl_initialized = false;
   int32_t ddl_local_device_id = 0;  // set from cudaGetDevice() in DDLAllreduce::DDLInit; Execute throws if an entry's device differs
 };
 
@@ -38,6 +36,8 @@ class DDLAllreduce : public CUDAAllreduce {
 
   Status Execute(std::vector<TensorTableEntry>& entries, const Response& response) override;
 
+  static void DDLInit(DDLContext* ddl_context, CUDAContext* cuda_context);
+
 protected:
   DDLContext* ddl_context_;
 };