Skip to content

Commit

Permalink
Initialize correctly IBM Spectrum MPI provided with IBM PowerAI DDL (#…
Browse files Browse the repository at this point in the history
…1093)

Signed-off-by: Nicolas V Castet <nvcastet@us.ibm.com>
  • Loading branch information
nvcastet authored and alsrgv committed May 21, 2019
1 parent e7ec0a5 commit 26439a1
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 16 deletions.
6 changes: 6 additions & 0 deletions horovod/common/operations.cc
Original file line number Diff line number Diff line change
Expand Up @@ -883,7 +883,13 @@ void BackgroundThreadLoop(HorovodGlobalState& state, MPIContext& ctx) {
"likely cause a segmentation fault.";
}
} else {
#if HAVE_DDL
// DDL comes with IBM Spectrum MPI
// and needs to initialize MPI with the proper license.
DDLAllreduce::DDLInit(&ddl_context, &cuda_context);
#else
MPI_Init_thread(NULL, NULL, required, &provided);
#endif
state.should_finalize = true;
}

Expand Down
27 changes: 13 additions & 14 deletions horovod/common/ops/ddl_operations.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,7 @@ Status DDLAllreduce::Execute(std::vector<TensorTableEntry>& entries, const Respo
InitCUDAQueue(entries, response);

auto& timeline = global_state_->timeline;
if (!ddl_context_->ddl_initialized) {
// Initialize DDL
auto ddl_options = std::getenv("DDL_OPTIONS");
if (ddl_options == nullptr) {
throw std::logic_error("DDL_OPTIONS env variable needs to be set to use DDL.");
}

auto ddl_result = ddl_init(ddl_options);
if (ddl_result != DDL_SUCCESS) {
throw std::logic_error("ddl_init failed.");
}
ddl_context_->ddl_initialized = true;
ddl_context_->ddl_local_device_id = first_entry.device;
} else if (ddl_context_->ddl_local_device_id != first_entry.device) {
if (ddl_context_->ddl_local_device_id != first_entry.device) {
throw std::logic_error("DDL does not support more than one GPU device per process.");
}

Expand Down Expand Up @@ -113,5 +100,17 @@ Status DDLAllreduce::Execute(std::vector<TensorTableEntry>& entries, const Respo
return FinalizeCUDAQueue(entries);
}

// Initializes the DDL library and records the CUDA device that is current
// for this process.
//
// The DDL configuration string is read from the DDL_OPTIONS environment
// variable and handed to ddl_init(). Afterwards the current CUDA device is
// queried with cudaGetDevice() and written to
// ddl_context->ddl_local_device_id; the CUDA status of that query is passed
// to cuda_context->ErrorCheck().
//
// Throws std::logic_error if DDL_OPTIONS is not set or if ddl_init() does
// not return DDL_SUCCESS.
void DDLAllreduce::DDLInit(DDLContext* ddl_context, CUDAContext* cuda_context) {
  auto options = std::getenv("DDL_OPTIONS");
  if (!options) {
    throw std::logic_error("DDL_OPTIONS env variable needs to be set to use DDL.");
  }

  if (ddl_init(options) != DDL_SUCCESS) {
    throw std::logic_error("ddl_init failed.");
  }

  // Remember which device is current at initialization time.
  auto device_status = cudaGetDevice(&ddl_context->ddl_local_device_id);
  cuda_context->ErrorCheck("cudaGetDevice", device_status);
}

} // namespace common
} // namespace horovod
4 changes: 2 additions & 2 deletions horovod/common/ops/ddl_operations.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ namespace horovod {
namespace common {

// Holds the DDL-related state that is passed to DDLAllreduce by pointer.
struct DDLContext {
// Will be set to true after initialization when ddl is used
// NOTE(review): DDLAllreduce::DDLInit does not touch this flag; confirm
// whether it is still required.
bool ddl_initialized = false;
// CUDA device id written by DDLAllreduce::DDLInit via cudaGetDevice.
// DDLAllreduce::Execute throws if the first entry's device differs from it.
int32_t ddl_local_device_id = 0;
};

Expand All @@ -38,6 +36,8 @@ class DDLAllreduce : public CUDAAllreduce {

Status Execute(std::vector<TensorTableEntry>& entries, const Response& response) override;

static void DDLInit(DDLContext* ddl_context, CUDAContext* cuda_context);

protected:
DDLContext* ddl_context_;
};
Expand Down

0 comments on commit 26439a1

Please sign in to comment.