
Add NCCL Broadcast operation (#1579)

Signed-off-by: Nicolas V Castet <nvcastet@us.ibm.com>
nvcastet authored and tgaddair committed Jan 8, 2020
1 parent fc06c68 commit 80167f6dea0ba6b853d790a3d3a342368811f0da
@@ -69,7 +69,7 @@ RUN mkdir /tmp/openmpi && \

# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
-    HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 \
+    HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 \
pip install --no-cache-dir horovod && \
ldconfig

@@ -14,7 +14,7 @@ ARG PYTORCH_PACKAGE=torch==1.2.0
ARG TORCHVISION_PACKAGE=torchvision==0.4.0
ARG MXNET_PACKAGE=mxnet-cu100==1.5.0
ARG PYSPARK_PACKAGE=pyspark==2.4.0
-ARG HOROVOD_BUILD_FLAGS="HOROVOD_GPU_ALLREDUCE=NCCL"
+ARG HOROVOD_BUILD_FLAGS="HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL"
ARG HOROVOD_MIXED_INSTALL=0

# Set default shell to /bin/bash
@@ -45,20 +45,20 @@ by installing an `nv_peer_memory <https://github.com/Mellanox/nv_peer_memory>`__

.. code-block:: bash
-   $ HOROVOD_NCCL_HOME=/usr/local/nccl-<version> HOROVOD_GPU_ALLREDUCE=NCCL pip install --no-cache-dir horovod
+   $ HOROVOD_NCCL_HOME=/usr/local/nccl-<version> HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL pip install --no-cache-dir horovod
If you installed NCCL 2 using the Ubuntu package, you can run:

.. code-block:: bash
-   $ HOROVOD_GPU_ALLREDUCE=NCCL pip install --no-cache-dir horovod
+   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL pip install --no-cache-dir horovod
If you installed NCCL 2 using the `CentOS / RHEL package <https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#rhel_centos>`__, you can run:

.. code-block:: bash
-   $ HOROVOD_NCCL_INCLUDE=/usr/include HOROVOD_NCCL_LIB=/usr/lib64 HOROVOD_GPU_ALLREDUCE=NCCL pip install --no-cache-dir horovod
+   $ HOROVOD_NCCL_INCLUDE=/usr/include HOROVOD_NCCL_LIB=/usr/lib64 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL pip install --no-cache-dir horovod
**Note**: Some models with a high computation to communication ratio benefit from doing allreduce on CPU, even if a
@@ -56,7 +56,7 @@ To use CUDA stub drivers:
$ ldconfig /usr/local/cuda/lib64/stubs
# install Horovod, add other HOROVOD_* environment variables as necessary
-   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
+   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
# revert to standard libraries
$ ldconfig
@@ -90,7 +90,7 @@ To use custom MPI directory:
.. code-block:: bash
$ export PATH=$PATH:/path/to/mpi/bin
-   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
+   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
2. Are MPI libraries added to ``$LD_LIBRARY_PATH`` or ``ld.so.conf``?
@@ -202,14 +202,14 @@ For example:
.. code-block:: bash
-   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
+   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
Or:
.. code-block:: bash
-   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_INCLUDE=/path/to/nccl/include HOROVOD_NCCL_LIB=/path/to/nccl/lib pip install --no-cache-dir horovod
+   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_NCCL_INCLUDE=/path/to/nccl/include HOROVOD_NCCL_LIB=/path/to/nccl/lib pip install --no-cache-dir horovod
Pip install: no such option: --no-cache-dir
@@ -237,7 +237,7 @@ For example:
.. code-block:: bash
-   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
+   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
ncclAllReduce failed: invalid data type
@@ -260,7 +260,7 @@ the package and reinstall Horovod:
$ conda remove nccl
$ pip uninstall -y horovod
-   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
+   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_NCCL_HOME=/path/to/nccl pip install --no-cache-dir horovod
transport/p2p.cu:431 WARN failed to open CUDA IPC handle : 30 unknown error
@@ -322,15 +322,15 @@ To build Horovod with a specific CUDA version, use the ``HOROVOD_CUDA_HOME`` env
.. code-block:: bash
$ pip uninstall -y horovod
-   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_HOME=/path/to/nccl HOROVOD_CUDA_HOME=/path/to/cuda pip install --no-cache-dir horovod
+   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_NCCL_HOME=/path/to/nccl HOROVOD_CUDA_HOME=/path/to/cuda pip install --no-cache-dir horovod
Alternatively, you can use the ``HOROVOD_CUDA_INCLUDE`` and ``HOROVOD_CUDA_LIB`` environment variables to specify the CUDA library to use:
.. code-block:: bash
$ pip uninstall -y horovod
-   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_NCCL_HOME=/path/to/nccl HOROVOD_CUDA_INCLUDE=/path/to/cuda/include HOROVOD_CUDA_LIB=/path/to/cuda/lib64 pip install --no-cache-dir horovod
+   $ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL HOROVOD_NCCL_HOME=/path/to/nccl HOROVOD_CUDA_INCLUDE=/path/to/cuda/include HOROVOD_CUDA_LIB=/path/to/cuda/lib64 pip install --no-cache-dir horovod
FORCE-TERMINATE AT Data unpack would read past end of buffer
@@ -93,29 +93,10 @@ void GlooController::Initialize() {

int GlooController::GetTypeSize(DataType dtype) {
-  switch (dtype) {
-  case HOROVOD_UINT8:
-    return sizeof(u_int8_t);
-  case HOROVOD_INT8:
-    return sizeof(int8_t);
-  case HOROVOD_UINT16:
-    return sizeof(u_int16_t);
-  case HOROVOD_INT16:
-    return sizeof(int16_t);
-  case HOROVOD_INT32:
-    return sizeof(int);
-  case HOROVOD_INT64:
-    return sizeof(int64_t);
-  case HOROVOD_FLOAT16:
-    return sizeof(gloo::float16);
-  case HOROVOD_FLOAT32:
-    return sizeof(float);
-  case HOROVOD_FLOAT64:
-    return sizeof(double);
-  case HOROVOD_BOOL:
-    return sizeof(bool);
-  default:
-    throw std::logic_error("Type " + DataType_Name(dtype) +
-                           " is not supported in Gloo mode.");
-  }
+  return DataType_Size(dtype);
}

@@ -62,6 +62,34 @@ const std::string& DataType_Name(DataType value) {
}
}

+std::size_t DataType_Size(DataType value) {
+  switch (value) {
+  case HOROVOD_UINT8:
+    return sizeof(u_int8_t);
+  case HOROVOD_INT8:
+    return sizeof(int8_t);
+  case HOROVOD_UINT16:
+    return sizeof(u_int16_t);
+  case HOROVOD_INT16:
+    return sizeof(int16_t);
+  case HOROVOD_INT32:
+    return sizeof(int32_t);
+  case HOROVOD_INT64:
+    return sizeof(int64_t);
+  case HOROVOD_FLOAT16:
+    return 2;
+  case HOROVOD_FLOAT32:
+    return sizeof(float);
+  case HOROVOD_FLOAT64:
+    return sizeof(double);
+  case HOROVOD_BOOL:
+    return sizeof(bool);
+  default:
+    throw std::logic_error("Type " + DataType_Name(value) +
+                           " is not supported.");
+  }
+}

const std::string& Request::RequestType_Name(RequestType value) {
switch (value) {
case RequestType::ALLREDUCE:
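
Note on the two hunks above: the per-controller type-size switch in the Gloo controller collapses into the new shared DataType_Size helper, which also tightens HOROVOD_INT32 from sizeof(int) to sizeof(int32_t) and drops the gloo::float16 dependency by hard-coding 2 bytes. A standalone sketch, not part of this commit, with a trimmed enum, showing how callers can now size buffers uniformly:

#include <cstddef>
#include <cstdint>
#include <iostream>

enum DataType { HOROVOD_UINT8, HOROVOD_INT32, HOROVOD_FLOAT16, HOROVOD_FLOAT32 };

// Mirror of the new shared helper: one switch, reused by every controller.
std::size_t DataType_Size(DataType value) {
  switch (value) {
  case HOROVOD_UINT8:   return sizeof(std::uint8_t);
  case HOROVOD_INT32:   return sizeof(std::int32_t);
  case HOROVOD_FLOAT16: return 2;  // no standard C++ half type; size is fixed
  case HOROVOD_FLOAT32: return sizeof(float);
  }
  return 0;  // unreachable for the enumerators above
}

int main() {
  // Buffer sizing now reads the same everywhere: elements * element size.
  const std::size_t num_elements = 1024;
  std::cout << num_elements * DataType_Size(HOROVOD_FLOAT16) << " bytes\n";  // 2048 bytes
}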
@@ -35,11 +35,12 @@ enum DataType {
HOROVOD_FLOAT32 = 7,
HOROVOD_FLOAT64 = 8,
HOROVOD_BOOL = 9,
-  HOROVOD_BYTE = 10,
};

const std::string& DataType_Name(DataType value);

+std::size_t DataType_Size(DataType value);

// A Request is a message sent from a rank greater than zero to the
// coordinator (rank zero), informing the coordinator of an operation that
// the rank wants to do and the tensor that it wants to apply the operation to.
@@ -53,8 +53,6 @@ MPI_Datatype MPIContext::GetMPIDataType(const DataType dtype) {
return MPI_DOUBLE;
case HOROVOD_BOOL:
return MPI_C_BOOL;
-  case HOROVOD_BYTE:
-    return MPI_BYTE;
default:
throw std::logic_error("Type " + DataType_Name(dtype) +
" is not supported in MPI mode.");
@@ -180,6 +180,11 @@ OperationManager* CreateOperationManager(HorovodGlobalState& state) {
new NCCLAllreduce(&nccl_context, &cuda_context, &state)));
#endif

+#if HAVE_NCCL && HOROVOD_GPU_BROADCAST == 'N'
+  broadcast_ops.push_back(
+      std::shared_ptr<BroadcastOp>(new NCCLBroadcast(&nccl_context, &cuda_context, &state)));
+#endif

#if HAVE_GLOO
if (gloo_context.IsEnabled()) {
allreduce_ops.push_back(
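
A note on the guard added above: HOROVOD_GPU_BROADCAST appears to follow the same convention as HOROVOD_GPU_ALLREDUCE, where the build is understood to define the macro as a single character constant derived from the env var ('N' selecting NCCL), so the op is compiled in conditionally. A minimal sketch of the pattern, with the define hard-coded as a stand-in for the real -D build flag:

#include <iostream>

#define HAVE_NCCL 1
#define HOROVOD_GPU_BROADCAST 'N'  // stand-in for the build-time define

int main() {
  // Character constants are legal in preprocessor conditionals, so the op
  // registration can key directly off the configured backend letter.
#if HAVE_NCCL && HOROVOD_GPU_BROADCAST == 'N'
  std::cout << "NCCL broadcast op compiled in\n";
#else
  std::cout << "falling back to CPU/MPI broadcast\n";
#endif
}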
@@ -22,8 +22,8 @@ AdasumCudaAllreduceOp::AdasumCudaAllreduceOp(MPIContext* mpi_context,
NCCLContext* nccl_context,
CUDAContext* cuda_context,
HorovodGlobalState* global_state)
-    : NCCLAllreduce(nccl_context, cuda_context, global_state),
-      AdasumMPI(mpi_context, global_state) {
+    : AdasumMPI(mpi_context, global_state),
+      NCCLAllreduce(nccl_context, cuda_context, global_state, Communicator::LOCAL) {
// Pre-allocate host buffer size equal to the fusion buffer length
current_host_buffer_length =
global_state->parameter_manager.TensorFusionThresholdBytes();
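
Two things change in this constructor: AdasumMPI now comes first in the mem-initializer list, matching the base order in the class declaration (class AdasumCudaAllreduceOp : public AdasumMPI, public NCCLAllreduce, per the header hunk further down), and NCCLAllreduce gains a Communicator::LOCAL argument, which is what lets the PopulateNCCLCommStrategy override be deleted later in this diff. The reorder matters only for warnings: C++ constructs bases in declaration order regardless of how the initializer list is written, as this minimal sketch with illustrative names shows:

#include <iostream>

struct AdasumLike { AdasumLike() { std::cout << "AdasumMPI "; } };
struct NcclLike   { NcclLike()   { std::cout << "NCCLAllreduce "; } };

// Declaration order wins: even with the list written backwards, AdasumLike
// is constructed first; compilers flag the mismatch with -Wreorder.
struct OpLike : AdasumLike, NcclLike {
  OpLike() : NcclLike(), AdasumLike() {}
};

int main() { OpLike op; std::cout << "\n"; }  // prints "AdasumMPI NCCLAllreduce"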
@@ -66,7 +66,7 @@ AdasumCudaAllreduceOp::NcclHierarchical(std::vector<TensorTableEntry>& entries,
nccl_device_map.push_back(response.devices()[rank]);
}
cuda_op_context_.InitCUDA(entries);
-  InitNCCLComm(entries, nccl_device_map);
+  nccl_op_context_.InitNCCLComm(entries, nccl_device_map);
cuda_op_context_.InitCUDAQueue(entries, response);
const void* fused_input_data;
void* buffer_data;
@@ -157,7 +157,7 @@ AdasumCudaAllreduceOp::NcclHierarchical(std::vector<TensorTableEntry>& entries,
auto nccl_result = ncclReduceScatter(
fused_input_data, buffer_data_at_rank_offset,
(size_t)num_elements_per_rank, GetNCCLDataType(first_entry.tensor),
-      ncclSum, *nccl_comm_, *cuda_op_context_.stream);
+      ncclSum, *nccl_op_context_.nccl_comm_, *cuda_op_context_.stream);

nccl_context_->ErrorCheck("ncclReduceScatter", nccl_result);
if (global_state_->timeline.Initialized()) {
@@ -172,7 +172,7 @@ AdasumCudaAllreduceOp::NcclHierarchical(std::vector<TensorTableEntry>& entries,
auto nccl_result = ncclReduce(
fused_input_data_remainder, buffer_data_remainder,
(size_t)num_elements_remaining, GetNCCLDataType(first_entry.tensor),
-      ncclSum, root_rank, *nccl_comm_, *cuda_op_context_.stream);
+      ncclSum, root_rank, *nccl_op_context_.nccl_comm_, *cuda_op_context_.stream);

nccl_context_->ErrorCheck("ncclReduce", nccl_result);
if (global_state_->timeline.Initialized()) {
@@ -272,7 +272,7 @@ AdasumCudaAllreduceOp::NcclHierarchical(std::vector<TensorTableEntry>& entries,
"ncclAllGather", ncclAllGather(buffer_data_at_rank_offset, buffer_data,
(size_t)num_elements_per_rank,
GetNCCLDataType(first_entry.tensor),
-                       *nccl_comm_, *cuda_op_context_.stream));
+                       *nccl_op_context_.nccl_comm_, *cuda_op_context_.stream));
if (global_state_->timeline.Initialized()) {
cuda_context_->RecordEvent(cuda_op_context_.event_queue, NCCL_ALLGATHER,
*cuda_op_context_.stream);
@@ -282,7 +282,7 @@ AdasumCudaAllreduceOp::NcclHierarchical(std::vector<TensorTableEntry>& entries,
nccl_context_->ErrorCheck(
"ncclBcast",
ncclBcast(buffer_data_remainder, (size_t)num_elements_remaining,
-                  GetNCCLDataType(first_entry.tensor), root_rank, *nccl_comm_,
+                  GetNCCLDataType(first_entry.tensor), root_rank, *nccl_op_context_.nccl_comm_,
*cuda_op_context_.stream));
if (global_state_->timeline.Initialized()) {
cuda_context_->RecordEvent(cuda_op_context_.event_queue, NCCL_BCAST,
@@ -304,13 +304,6 @@ AdasumCudaAllreduceOp::NcclHierarchical(std::vector<TensorTableEntry>& entries,
return cuda_op_context_.FinalizeCUDAQueue(entries, false);
}

-void AdasumCudaAllreduceOp::PopulateNCCLCommStrategy(
-    int& nccl_rank, int& nccl_size, Communicator& nccl_id_bcast_comm) {
-  nccl_rank = global_state_->controller->GetLocalRank();
-  nccl_size = global_state_->controller->GetLocalSize();
-  nccl_id_bcast_comm = Communicator::LOCAL;
-}

bool AdasumCudaAllreduceOp::Enabled(
const ParameterManager& param_manager,
const std::vector<TensorTableEntry>& entries,
@@ -39,9 +39,6 @@ class AdasumCudaAllreduceOp : public AdasumMPI, public NCCLAllreduce {
const Response& response) override;

protected:
-  void PopulateNCCLCommStrategy(int& nccl_rank, int& nccl_size,
-                                Communicator& nccl_id_bcast_comm) override;

Status NcclHierarchical(std::vector<TensorTableEntry>& entries,
const Response& response);

@@ -34,8 +34,6 @@ namespace common {

ccl_datatype_t GetCCLDataType(const std::shared_ptr<Tensor>& tensor) {
switch (tensor->dtype()) {
-  case HOROVOD_BYTE:
-    return ccl_dtype_char;
case HOROVOD_FLOAT32:
return ccl_dtype_float;
case HOROVOD_FLOAT64:
@@ -219,5 +219,15 @@ void CUDAAllgather::MemcpyEntryOutFusionBuffer(const std::vector<TensorTableEntr
cuda_context_->ErrorCheck("cudaMemcpyAsync", cuda_result);
}

+CUDABroadcast::CUDABroadcast(CUDAContext* context,
+                             HorovodGlobalState* global_state)
+    : BroadcastOp(global_state), cuda_context_(context), cuda_op_context_(context, global_state) {}
+
+bool CUDABroadcast::Enabled(const ParameterManager& param_manager,
+                            const std::vector<TensorTableEntry>& entries,
+                            const Response& response) const {
+  return entries[0].device != CPU_DEVICE_ID;
+}

} // namespace common
} // namespace horovod
@@ -137,6 +137,20 @@ class CUDAAllgather : public AllgatherOp {
CUDAOpContext cuda_op_context_;
};

+class CUDABroadcast : public BroadcastOp {
+public:
+  CUDABroadcast(CUDAContext* context,
+                HorovodGlobalState* global_state);
+
+  bool Enabled(const ParameterManager& param_manager,
+               const std::vector<TensorTableEntry>& entries,
+               const Response& response) const override;
+
+protected:
+  struct CUDAContext* cuda_context_;
+  CUDAOpContext cuda_op_context_;
+};

} // namespace common
} // namespace horovod
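
For context on how these classes get used: the operation manager holds ordered lists of candidate ops (see the broadcast_ops.push_back hunk earlier) and, per my reading of the diff, dispatches each request to the first op whose Enabled() accepts it, so CUDABroadcast::Enabled returning entries[0].device != CPU_DEVICE_ID routes GPU tensors to the CUDA/NCCL path while CPU tensors fall through to a fallback. A simplified sketch of that selection pattern; the types and names are illustrative, not Horovod's:

#include <iostream>
#include <memory>
#include <vector>

struct Op {
  virtual ~Op() = default;
  virtual bool Enabled(int device) const = 0;
  virtual void Execute() const = 0;
};

struct GpuBroadcastOp : Op {
  bool Enabled(int device) const override { return device != -1; }  // -1 stands in for CPU_DEVICE_ID
  void Execute() const override { std::cout << "GPU broadcast\n"; }
};

struct CpuBroadcastOp : Op {
  bool Enabled(int /*device*/) const override { return true; }  // always-valid fallback
  void Execute() const override { std::cout << "CPU broadcast\n"; }
};

int main() {
  std::vector<std::unique_ptr<Op>> broadcast_ops;
  broadcast_ops.push_back(std::make_unique<GpuBroadcastOp>());  // preferred, registered first
  broadcast_ops.push_back(std::make_unique<CpuBroadcastOp>());  // fallback
  const int device = 0;  // pretend the tensor lives on GPU 0
  for (const auto& op : broadcast_ops) {
    if (op->Enabled(device)) { op->Execute(); break; }  // first enabled op wins
  }
}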
