Skip to content
Permalink
Browse files

Refactored Gloo and MPI components into separate subdirectories, reduced INFO logs to DEBUG, and fixed op order for Hierarchical Allreduce (#1294)
  • Loading branch information...
tgaddair committed Aug 13, 2019
1 parent 529acb2 commit 339f850d7ce83293261c49e3c413e887c08f46f1
@@ -57,9 +57,37 @@ namespace common {
#define GLOO_ALLGATHER "GLOO_ALLGATHER"
#define GLOO_BCAST "GLOO_BCAST"

// Horovod knobs.
// Each macro below, down to HOROVOD_GLOO_IFACE, expands to a string literal
// that is identical to the macro's own name.
// NOTE(review): presumably these are the names of environment variables read
// at startup — confirm against the code that consumes them.
#define HOROVOD_MPI_THREADS_DISABLE "HOROVOD_MPI_THREADS_DISABLE"
#define HOROVOD_TIMELINE "HOROVOD_TIMELINE"
#define HOROVOD_TIMELINE_MARK_CYCLES "HOROVOD_TIMELINE_MARK_CYCLES"
#define HOROVOD_AUTOTUNE "HOROVOD_AUTOTUNE"
#define HOROVOD_AUTOTUNE_LOG "HOROVOD_AUTOTUNE_LOG"
#define HOROVOD_FUSION_THRESHOLD "HOROVOD_FUSION_THRESHOLD"
#define HOROVOD_CYCLE_TIME "HOROVOD_CYCLE_TIME"
#define HOROVOD_STALL_CHECK_DISABLE "HOROVOD_STALL_CHECK_DISABLE"
#define HOROVOD_STALL_CHECK_TIME_SECONDS "HOROVOD_STALL_CHECK_TIME_SECONDS"
#define HOROVOD_STALL_SHUTDOWN_TIME_SECONDS "HOROVOD_STALL_SHUTDOWN_TIME_SECONDS"
#define HOROVOD_HIERARCHICAL_ALLREDUCE "HOROVOD_HIERARCHICAL_ALLREDUCE"
#define HOROVOD_HIERARCHICAL_ALLGATHER "HOROVOD_HIERARCHICAL_ALLGATHER"
#define HOROVOD_CACHE_CAPACITY "HOROVOD_CACHE_CAPACITY"
#define HOROVOD_MLSL_BGT_AFFINITY "HOROVOD_MLSL_BGT_AFFINITY"
#define HOROVOD_NUM_NCCL_STREAMS "HOROVOD_NUM_NCCL_STREAMS"
#define HOROVOD_CPU_OPERATIONS "HOROVOD_CPU_OPERATIONS"
#define HOROVOD_CONTROLLER "HOROVOD_CONTROLLER"
#define HOROVOD_GLOO_IFACE "HOROVOD_GLOO_IFACE"
// Unlike the knobs above, the next three expand to short backend-name strings
// rather than to their own names.
// NOTE(review): presumably the accepted values of HOROVOD_CPU_OPERATIONS and
// HOROVOD_CONTROLLER — confirm against the code that parses those knobs.
#define HOROVOD_MPI "MPI"
#define HOROVOD_MLSL "MLSL"
#define HOROVOD_GLOO "GLOO"

// Default network interface name for Gloo.
// NOTE(review): presumably used when HOROVOD_GLOO_IFACE is not set — confirm.
#define GLOO_DEFAULT_IFACE "eth0"

// The number of elements held by the fusion buffer, and the hierarchical
// allreduce size, are always multiples of FUSION_BUFFER_ATOMIC_UNIT.
#define FUSION_BUFFER_ATOMIC_UNIT 64
// Rank number 0.
// NOTE(review): presumably the coordinator/root rank — confirm at call sites.
#define RANK_ZERO 0

// Device ID used for CPU.
#define CPU_DEVICE_ID (-1)

@@ -26,16 +26,9 @@
#include "tensor_queue.h"
#include "timeline.h"

#if HAVE_GLOO
#include "gloo_context.h"
#endif

namespace horovod {
namespace common {

// Forward declaration: introduces the class name only; its definition is not
// part of this section of the file.
class HorovodGlobalState;

// Table mapping a string key to the list of Request objects stored under
// that key.
// NOTE(review): the key is presumably a tensor name — confirm against the
// code that fills the table.
using MessageTable = std::unordered_map<std::string, std::vector<Request>>;

class Controller : public std::enable_shared_from_this<Controller> {
@@ -14,11 +14,13 @@
// ============================================================================

#include "gloo_context.h"

#include "gloo/rendezvous/context.h"
#include "gloo/rendezvous/file_store.h"
#include "gloo/rendezvous/prefix_store.h"
#include "gloo/transport/tcp/device.h"
#include "rendezvous/http_rendezvous.h"

#include "http_store.h"

#if HAVE_MPI
#include "gloo/mpi/context.h"
@@ -33,9 +35,10 @@ void GlooContext::InitializeFromMPI(MPIContext& mpi_ctx,
if (!enabled_) {
return;
}

// TODO(sihan): Add support for multiple interfaces:
// https://github.com/facebookincubator/gloo/issues/190
gloo::transport::tcp::attr attr;
// TODO(sihan): add interface load balancing after
// https://github.com/facebookincubator/gloo/issues/183 is resolved
attr.iface = gloo_iface;
attr.ai_family = AF_UNSPEC;
auto dev = gloo::transport::tcp::CreateDevice(attr);
@@ -71,8 +74,10 @@ void GlooContext::Initialize(const std::string& gloo_iface) {
if (!enabled_) {
return;
}
// create a tcp device for communication
// TODO: Add support for interface multiplex

// Create a tcp device for communication
// TODO(sihan): Add support for multiple interfaces:
// https://github.com/facebookincubator/gloo/issues/190
gloo::transport::tcp::attr attr;
attr.iface = gloo_iface;

@@ -16,13 +16,15 @@
#ifndef HOROVOD_GLOO_CONTEXT_H
#define HOROVOD_GLOO_CONTEXT_H

#include "common.h"
#include "gloo/context.h"
#include "logging.h"

#include "../common.h"
#include "../logging.h"

#if HAVE_MPI
#include "mpi_context.h"
#include "../mpi/mpi_context.h"
#endif

namespace horovod {
namespace common {

@@ -51,18 +53,20 @@ struct GlooContext {

// Marks the Gloo context as enabled by setting enabled_ and logs that fact.
// NOTE(review): the same message is emitted at both INFO and DEBUG. The
// commit description says INFO logs were reduced to DEBUG, so the INFO line
// looks like the pre-change line carried over from the diff — confirm that
// only the DEBUG line is meant to remain.
void Enable() {
enabled_ = true;
LOG(INFO) << "Gloo context enabled.";
LOG(DEBUG) << "Gloo context enabled.";
}

// Returns the current value of the enabled_ flag.
bool IsEnabled() {
  const bool currently_enabled = enabled_;
  return currently_enabled;
}

// Returns a shared pointer to a Gloo context for the given communicator.
// NOTE(review): only the declaration is visible here; presumably it selects
// among ctx, cross_ctx and local_ctx — confirm against the definition.
std::shared_ptr<gloo::Context> GetGlooContext(Communicator communicator);

// Flag indicating whether gloo is enabled.
bool enabled_ = false;
std::shared_ptr<gloo::Context> ctx = nullptr; // Global context
// NOTE(review): presumably the cross-node and node-local contexts,
// respectively — confirm against the initialization code.
std::shared_ptr<gloo::Context> cross_ctx = nullptr;
std::shared_ptr<gloo::Context> local_ctx = nullptr;

private:
// Flag indicating whether gloo is enabled.
// NOTE(review): enabled_ is also declared earlier in this member list; the
// two declarations look like the before/after sides of a diff that moved the
// flag into the private section — confirm that only one is meant to remain.
bool enabled_ = false;
};

} // namespace common
@@ -13,20 +13,20 @@
// limitations under the License.
// =============================================================================

#include <cstring>

#include "gloo_controller.h"

#include <cstring>

#include "gloo/allgather.h"
#include "gloo/allgatherv.h"
#include "gloo/allreduce.h"
#include "gloo/barrier.h"
#include "gloo/broadcast.h"
#include "gloo/gather.h"

#include "gloo_context.h"
#include "logging.h"
#include "operations.h"
#include "ops/gloo_operations.h"
#include "../logging.h"
#include "../ops/gloo_operations.h"

namespace horovod {
namespace common {
@@ -36,7 +36,7 @@ void GlooController::Initialize() {
size_ = gloo_context_.ctx->size;
is_coordinator_ = rank_ == 0;
if (is_coordinator_) {
LOG(INFO) << "Started Horovod with " << size_ << " processes";
LOG(DEBUG) << "Started Horovod with " << size_ << " processes";
}

// Determine the local rank based on whether a local context is present.
@@ -16,8 +16,9 @@
#ifndef HOROVOD_GLOO_CONTROLLER_H
#define HOROVOD_GLOO_CONTROLLER_H

#include "controller.h"
#include "logging.h"
#include "../controller.h"
#include "../logging.h"
#include "gloo_context.h"

namespace horovod {
namespace common {
@@ -29,7 +30,7 @@ class GlooController : public Controller {
GlooContext& gloo_context)
: Controller(response_cache, tensor_queue, timeline, parameter_manager),
gloo_context_(gloo_context) {
LOG(INFO) << "GLOO Controller Initialized.";
LOG(DEBUG) << "GLOO Controller Initialized.";
};

void Initialize() override;
@@ -13,7 +13,7 @@
// limitations under the License.
// ============================================================================

#include "http_rendezvous.h"
#include "http_store.h"

#include <cstring>
#include <iostream>
@@ -22,9 +22,10 @@
#include <string>
#include <thread>

#include "../logging.h"
#include "gloo/common/error.h"

#include "../logging.h"

namespace horovod {
namespace common {

@@ -13,8 +13,8 @@
// limitations under the License.
// ============================================================================

#ifndef HOROVOD_RENDEZVOUS_HTTP_RENDEZVOUS_H_
#define HOROVOD_RENDEZVOUS_HTTP_RENDEZVOUS_H_
#ifndef HOROVOD_GLOO_HTTP_STORE_H
#define HOROVOD_GLOO_HTTP_STORE_H

#include "HTTPRequest.hpp"
#include "gloo/rendezvous/store.h"
@@ -80,4 +80,4 @@ class HTTPStore : public gloo::rendezvous::Store {
} // namespace common
} // namespace horovod

#endif // HOROVOD_RENDEZVOUS_HTTP_RENDEZVOUS_H_
#endif // HOROVOD_GLOO_HTTP_STORE_H
File renamed without changes.
@@ -17,8 +17,8 @@
#define HOROVOD_HOROVOD_COMMON_DDL_MPI_CONTEXT_MANAGER_H_

#include "mpi_context.h"
#include "ops/cuda_operations.h"
#include "ops/ddl_operations.h"
#include "../ops/cuda_operations.h"
#include "../ops/ddl_operations.h"

namespace horovod {
namespace common {
@@ -14,15 +14,15 @@
// limitations under the License.
// =============================================================================

#include "mpi_context.h"

#include <iostream>
#include <memory>
#include <vector>

#include "common.h"
#include "half.h"
#include "logging.h"
#include "mpi_context.h"
#include "operations.h"
#include "../common.h"
#include "../half.h"
#include "../logging.h"

namespace horovod {
namespace common {
@@ -21,11 +21,9 @@
#include <memory>
#include <vector>

#include "common.h"
#include "half.h"
#include "logging.h"
#include "mpi_context.h"
#include "operations.h"
#include "../common.h"
#include "../half.h"
#include "../logging.h"

namespace horovod {
namespace common {
@@ -45,7 +43,7 @@ struct MPIContext {

// Marks the MPI context as enabled by setting enabled_ and logs that fact.
// NOTE(review): the same message is emitted at both INFO and DEBUG. The
// commit description says INFO logs were reduced to DEBUG, so the INFO line
// looks like the pre-change line carried over from the diff — confirm that
// only the DEBUG line is meant to remain.
void Enable() {
enabled_ = true;
LOG(INFO) << "MPI context enabled.";
LOG(DEBUG) << "MPI context enabled.";
};

// Returns the current value of the enabled_ flag.
bool IsEnabled() {
  const bool currently_enabled = enabled_;
  return currently_enabled;
}
@@ -14,8 +14,9 @@
// =============================================================================

#include "mpi_controller.h"
#include "logging.h"
#include "operations.h"

#include "../common.h"
#include "../logging.h"

namespace horovod {
namespace common {
@@ -35,7 +36,7 @@ void MPIController::Initialize() {
MPI_Comm_size(mpi_ctx_.mpi_comm, &size_);

if (is_coordinator_) {
LOG(INFO) << "Started Horovod with " << size_ << " processes";
LOG(DEBUG) << "Started Horovod with " << size_ << " processes";
}

// Determine local rank by querying the local communicator.
@@ -16,8 +16,8 @@
#ifndef HOROVOD_MPI_CONTROLLER_H
#define HOROVOD_MPI_CONTROLLER_H

#include "controller.h"
#include "mpi_context.h"
#include "../controller.h"

namespace horovod {
namespace common {
@@ -29,7 +29,7 @@ class MPIController : public Controller {
MPIContext& mpi_ctx)
: Controller(response_cache, tensor_queue, timeline, parameter_manager),
mpi_ctx_(mpi_ctx) {
LOG(INFO) << "MPI Controller Initialized.";
LOG(DEBUG) << "MPI Controller Initialized.";
}

void Initialize() override;

0 comments on commit 339f850

Please sign in to comment.
You can’t perform that action at this time.