Skip to content
Permalink
Browse files

Updated Gloo with fixes for context size = 1; fixed issue with overlapping namespaces in local and cross contexts (#1367)
  • Loading branch information...
tgaddair committed Aug 30, 2019
1 parent 59a420e commit c5b53eca9686ba296ba3a8b3a7e25dda25065eea
@@ -114,7 +114,7 @@ ResponseList Controller::ComputeResponseList(std::atomic_bool& shut_down) {
// a shutdown. This function removes any invalid cache entries, if they
// exist.
CoordinateCacheAndState(cache_coordinator);
LOG(DEBUG) << "Cache coordinated.";
LOG(TRACE) << "Cache coordinated.";
// Remove uncommon cached tensors from queue and replace to state
// queue for next cycle. Skip adding common cached tensors to
// queue as they are handled separately.
@@ -57,7 +57,8 @@ std::shared_ptr<gloo::Context> Rendezvous(const std::string& prefix,
} else {
store.reset(new MemoryStore());
}
LOG(DEBUG) << prefix << " rendezvous started for rank=" << rank << ", size=" << size;
LOG(DEBUG) << prefix << " rendezvous started for rank=" << rank << ", size=" << size
<< ", dev={" << dev->str() << "}";

auto context = std::make_shared<gloo::rendezvous::Context>(rank, size);
context->connectFullMesh(*store, dev);
@@ -130,12 +131,12 @@ void GlooContext::Initialize(const std::string& gloo_iface) {
rank, size, dev);
LOG(DEBUG) << "Global Gloo context initialized.";

local_ctx = Rendezvous(HOROVOD_GLOO_LOCAL_PREFIX,
local_ctx = Rendezvous(HOROVOD_GLOO_LOCAL_PREFIX + std::to_string(cross_rank),
rendezvous_addr_env, rendezvous_port,
local_rank, local_size, dev);
LOG(DEBUG) << "Local Gloo context initialized.";

cross_ctx = Rendezvous(HOROVOD_GLOO_CROSS_PREFIX,
cross_ctx = Rendezvous(HOROVOD_GLOO_CROSS_PREFIX + std::to_string(local_rank),
rendezvous_addr_env, rendezvous_port,
cross_rank, cross_size, dev);
LOG(DEBUG) << "Cross-node Gloo context initialized.";
@@ -39,17 +39,6 @@ void GlooController::Initialize() {
LOG(DEBUG) << "Started Horovod with " << size_ << " processes";
}

// TODO(travis): Gloo doesn't handle running with size=1 for all ops at this time.
// Remove this after https://github.com/facebookincubator/gloo/issues/209
if (size_ == 1) {
local_rank_ = 0;
cross_rank_ = 0;
local_size_ = 1;
cross_size_ = 1;
is_homogeneous_ = true;
return;
}

// Determine local rank by if local context is presented.
if (gloo_context_.local_ctx != nullptr) {
local_rank_ = gloo_context_.local_ctx->rank;
@@ -132,10 +121,6 @@ int GlooController::GetTypeSize(DataType dtype) {

void GlooController::CrossRankBitwiseAnd(std::vector<long long>& bitvector,
int count) {
if (size_ == 1) {
return;
}

gloo::AllreduceOptions opts(gloo_context_.ctx);
opts.setOutput(bitvector.data(), count);
void (*func)(void*, const void*, const void*, size_t) = &BitAnd<long long>;
@@ -145,10 +130,6 @@ void GlooController::CrossRankBitwiseAnd(std::vector<long long>& bitvector,

void GlooController::CrossRankBitwiseOr(std::vector<long long>& bitvector,
int count) {
if (size_ == 1) {
return;
}

gloo::AllreduceOptions opts(gloo_context_.ctx);
opts.setOutput(bitvector.data(), count);
void (*func)(void*, const void*, const void*, size_t) = &BitOr<long long>;
@@ -158,10 +139,6 @@ void GlooController::CrossRankBitwiseOr(std::vector<long long>& bitvector,

void GlooController::RecvReadyTensors(std::vector<std::string>& ready_to_reduce,
std::vector<RequestList>& ready_list) {
if (size_ == 1) {
return;
}

// Rank zero has put all its own tensors in the tensor count table.
// Now, it should count all the tensors that are coming from other
// ranks at this tick.
@@ -221,10 +198,6 @@ void GlooController::RecvReadyTensors(std::vector<std::string>& ready_to_reduce,
}

void GlooController::SendFinalTensors(ResponseList& response_list) {
if (size_ == 1) {
return;
}

// Notify all nodes which tensors we'd like to reduce at this step.
std::string encoded_response;
ResponseList::SerializeToString(response_list, encoded_response);
@@ -249,10 +222,6 @@ void GlooController::SendFinalTensors(ResponseList& response_list) {
}

void GlooController::SendReadyTensors(RequestList& message_list) {
if (size_ == 1) {
return;
}

std::string encoded_message;
RequestList::SerializeToString(message_list, encoded_message);

@@ -296,10 +265,6 @@ void GlooController::SendReadyTensors(RequestList& message_list) {
}

void GlooController::RecvFinalTensors(ResponseList& response_list) {
if (size_ == 1) {
return;
}

int msg_length;
// root broadcast final message length to others
{
@@ -324,21 +289,13 @@ void GlooController::RecvFinalTensors(ResponseList& response_list) {

// Broadcasts `size` bytes of `buffer` from `root_rank` to every rank in the
// given communicator via Gloo's broadcast collective. `buffer` is treated as
// raw bytes; `size` is a byte count, not an element count.
void GlooController::Bcast(void* buffer, size_t size, int root_rank,
Communicator communicator) {
// Single-process job: there is no peer to exchange with, and the buffer on
// the (only) root is already final, so the collective is a no-op.
// NOTE(review): this early return was removed by this commit once Gloo
// gained size=1 support — confirm which state the checked-out file is in.
if (size_ == 1) {
return;
}

// GetGlooContext resolves the per-communicator context (global/local/cross).
gloo::BroadcastOptions opts(gloo_context_.GetGlooContext(communicator));
opts.setOutput((uint8_t*)buffer, size);
opts.setRoot(root_rank);
gloo::broadcast(opts);
}

// Blocks until every rank in the given communicator has entered the barrier.
void GlooController::Barrier(Communicator communicator) {
// With a single process there is nobody to synchronize with.
// NOTE(review): this early return was removed by this commit once Gloo
// gained size=1 support — confirm which state the checked-out file is in.
if (size_ == 1) {
return;
}

// GetGlooContext resolves the per-communicator context (global/local/cross).
gloo::BarrierOptions opts(gloo_context_.GetGlooContext(communicator));
gloo::barrier(opts);
}
@@ -29,9 +29,7 @@ class GlooController : public Controller {
Timeline& timeline, ParameterManager& parameter_manager,
GlooContext& gloo_context)
: Controller(response_cache, tensor_queue, timeline, parameter_manager),
gloo_context_(gloo_context) {
LOG(DEBUG) << "GLOO Controller Initialized.";
};
gloo_context_(gloo_context) {};

void Initialize() override;

@@ -63,10 +63,6 @@ GlooAlgorithms<T>::GlooAlgorithms(GlooContext* gloo_context)

template <typename T>
void GlooAlgorithms<T>::Allreduce(void* buffer_data, int num_elements) {
if (gloo_context_->ctx->size == 1) {
return;
}

gloo::AllreduceOptions opts(gloo_context_->ctx);
opts.setOutput<T>(static_cast<T*>(buffer_data), (size_t) num_elements);

@@ -79,10 +75,6 @@ void GlooAlgorithms<T>::Allreduce(void* buffer_data, int num_elements) {
template <typename T>
void GlooAlgorithms<T>::Allgather(void* buffer_data, void* buffer_out,
int* recvcounts, int* displcmnts) {
if (gloo_context_->ctx->size == 1) {
return;
}

// create count index
std::vector<size_t> counts(recvcounts, recvcounts + gloo_context_->ctx->size);

@@ -98,10 +90,6 @@ void GlooAlgorithms<T>::Allgather(void* buffer_data, void* buffer_out,
template <typename T>
void GlooAlgorithms<T>::Broadcast(void* buffer_data, int num_elements,
int root_rank) {
if (gloo_context_->ctx->size == 1) {
return;
}

gloo::BroadcastOptions opts(gloo_context_->ctx);
opts.setRoot(root_rank);
opts.setOutput<T>(static_cast<T*>(buffer_data), (size_t) num_elements);
@@ -149,7 +149,11 @@ def kill_executor_children_if_sigterm_received():

def kill_middleman_if_master_thread_terminate():
    """Wait for the module-level terminate ``event``, then SIGTERM the middleman.

    Relies on module-level ``event`` (threading.Event) and ``middleman_pid``
    being set by the caller (see gloo_run.py:_launch_jobs()).
    """
    event.wait()
    try:
        os.kill(middleman_pid, signal.SIGTERM)
    except OSError:
        # The process has already been killed elsewhere; best-effort cleanup.
        # OSError (incl. ProcessLookupError) is the only failure os.kill
        # raises here — a bare `except:` would also swallow KeyboardInterrupt.
        pass

# TODO: Currently this requires explicitly declaration of the event and signal handler to set
# the event (gloo_run.py:_launch_jobs()). Need to figure out a generalized way to hide this behind
Submodule gloo updated 65 files
+4 −0 CMakeLists.txt
+39 −14 cmake/Dependencies.cmake
+1 −34 cmake/Hip.cmake
+59 −0 cmake/Modules/Findrccl.cmake
+33 −1 gloo/CMakeLists.txt
+7 −2 gloo/allgather.cc
+13 −6 gloo/allgatherv.cc
+348 −28 gloo/allreduce.cc
+99 −54 gloo/allreduce.h
+0 −5 gloo/benchmark/CMakeLists.txt
+1 −0 gloo/benchmark/benchmark.h
+95 −0 gloo/benchmark/main.cc
+8 −6 gloo/benchmark/runner.cc
+11 −3 gloo/common/CMakeLists.txt
+67 −0 gloo/common/aligned_allocator.h
+0 −44 gloo/common/common.h
+6 −0 gloo/config.h.in
+8 −2 gloo/cuda.cu
+10 −2 gloo/reduce.cc
+11 −4 gloo/test/CMakeLists.txt
+46 −52 gloo/test/allgather_test.cc
+9 −7 gloo/test/allgatherv_test.cc
+165 −149 gloo/test/allreduce_test.cc
+17 −11 gloo/test/barrier_test.cc
+58 −10 gloo/test/base_test.h
+87 −102 gloo/test/broadcast_test.cc
+33 −33 gloo/test/buffer_test.cc
+20 −18 gloo/test/context_factory_test.cc
+181 −167 gloo/test/cuda_allreduce_test.cc
+16 −18 gloo/test/cuda_base_test.h
+66 −88 gloo/test/cuda_broadcast_test.cc
+33 −41 gloo/test/gather_test.cc
+1 −1 gloo/test/main.cc
+1 −1 gloo/test/memory_test.cc
+3 −5 gloo/test/multiproc_test.cc
+2 −4 gloo/test/multiproc_test.h
+49 −42 gloo/test/reduce_scatter_test.cc
+10 −18 gloo/test/reduce_test.cc
+9 −7 gloo/test/scatter_test.cc
+355 −295 gloo/test/send_recv_test.cc
+53 −57 gloo/test/transport_test.cc
+10 −3 gloo/transport/CMakeLists.txt
+2 −2 gloo/transport/ibverbs/device.h
+18 −3 gloo/transport/tcp/device.cc
+12 −12 gloo/transport/tcp/pair.cc
+9 −3 gloo/transport/tcp/unbound_buffer.cc
+2 −2 gloo/transport/tcp/unbound_buffer.h
+9 −3 gloo/transport/unbound_buffer.h
+19 −0 gloo/transport/uv/CMakeLists.txt
+21 −0 gloo/transport/uv/LICENSE.uvw
+71 −0 gloo/transport/uv/address.cc
+66 −0 gloo/transport/uv/address.h
+37 −0 gloo/transport/uv/common.h
+196 −0 gloo/transport/uv/context.cc
+244 −0 gloo/transport/uv/context.h
+455 −0 gloo/transport/uv/device.cc
+200 −0 gloo/transport/uv/device.h
+58 −0 gloo/transport/uv/libuv.cc
+689 −0 gloo/transport/uv/libuv.h
+561 −0 gloo/transport/uv/pair.cc
+270 −0 gloo/transport/uv/pair.h
+133 −0 gloo/transport/uv/unbound_buffer.cc
+81 −0 gloo/transport/uv/unbound_buffer.h
+10 −1 tools/amd_build/build_amd.py
+2 −0 tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py

0 comments on commit c5b53ec

Please sign in to comment.
You can’t perform that action at this time.