Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Gloo controller #1181

Merged
merged 8 commits into from Aug 10, 2019
Merged
Changes from all commits
Commits
File filter...
Filter file types
Jump to…
Jump to file or symbol
Failed to load files and symbols.

Always

Just for now

@@ -17,6 +17,12 @@ tests=( \
test-cpu-openmpi-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-py3_5-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-gloo-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-gloo-py3_5-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-gloo-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-gloo-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-gloo-py3_5-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-gloo-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-py2_7-tf2_0_0-keras2_2_4-torch1_1_0-mxnet1_5_0-pyspark2_4_0 \
test-cpu-openmpi-py3_5-tf2_0_0-keras2_2_4-torch1_1_0-mxnet1_5_0-pyspark2_4_0 \
test-cpu-openmpi-py3_6-tf2_0_0-keras2_2_4-torch1_1_0-mxnet1_5_0-pyspark2_4_0 \
@@ -28,6 +34,10 @@ tests=( \
test-gpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_1-pyspark2_3_2 \
test-gpu-openmpi-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-openmpi-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-gloo-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-gloo-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-openmpi-gloo-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-openmpi-gloo-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-openmpi-py2_7-tf2_0_0-keras2_2_4-torch1_1_0-mxnet1_5_0-pyspark2_4_0 \
test-gpu-openmpi-py3_6-tf2_0_0-keras2_2_4-torch1_1_0-mxnet1_5_0-pyspark2_4_0 \
test-gpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
@@ -161,6 +171,33 @@ run_all() {
fi
}

# Emit the Gloo test steps for one test image.
#
# Arguments:
#   $1 - test (image) name
#   $2 - queue passed through to run_test
#
# Calls run_test four times, in this order: the PyTest suite launched through
# "horovodrun --gloo", then the Keras, PyTorch and MXNet MNIST examples.
run_gloo() {
  local test=$1
  local queue=$2
  local gloo_launch="horovodrun -np 2 -H localhost:2 --gloo"

  # Spark tests seem to depend on MPI, so when the image name does not mention
  # mpi the spark test files are stripped from the list handed to pytest.
  local spark_filter=""
  case ${test} in
    *mpi*) ;;
    *) spark_filter="| sed 's/[a-z_]*spark[a-z_.]*//g'" ;;
  esac

  run_test "${test}" "${queue}" \
    ":pytest: Run PyTests (${test})" \
    "bash -c \"cd /horovod/test && (echo test_*.py ${spark_filter} | xargs -n 1 ${gloo_launch} pytest -v --capture=no)\""

  # Each entry is "<framework label>=<example script basename>".
  local mnist_example
  for mnist_example in "Keras=keras_mnist_advanced" "PyTorch=pytorch_mnist" "MXNet=mxnet_mnist"; do
    run_test "${test}" "${queue}" \
      ":muscle: Test ${mnist_example%%=*} MNIST (${test})" \
      "${gloo_launch} python /horovod/examples/${mnist_example#*=}.py"
  done
}

build_docs() {
echo "- label: ':book: Build Docs'"
echo " command: 'cd /workdir/docs && pip install -r requirements.txt && make html'"
@@ -198,7 +235,14 @@ fi
# run all the cpu tests
#
# Each cpu image gets only the suites its name asks for. run_all is not called
# unconditionally: gloo-only images are built with MPI_KIND=None, and images
# with MPI would otherwise get their run_all steps emitted twice.
for test in "${tests[@]}"; do
  if [[ ${test} == *-cpu-* ]]; then
    # if gloo is specified, run gloo_test
    if [[ ${test} == *-gloo* ]]; then
      run_gloo "${test}" "cpu"
    fi
    # if mpi is specified, run mpi cpu_test
    if [[ ${test} == *mpi* ]]; then
      run_all "${test}" "cpu"
    fi
  fi
done

@@ -208,6 +252,13 @@ echo "- wait"
# run all the gpu tests
#
# Each gpu image gets only the suites its name asks for. run_all is not called
# unconditionally: gloo-only images are built with MPI_KIND=None, and images
# with MPI would otherwise get their run_all steps emitted twice.
for test in "${tests[@]}"; do
  if [[ ${test} == *-gpu-* ]]; then
    # if gloo is specified, run gloo_test
    if [[ ${test} == *-gloo* ]]; then
      run_gloo "${test}" "gpu"
    fi
    # if mpi is specified, run mpi gpu_test
    if [[ ${test} == *mpi* ]]; then
      run_all "${test}" "gpu"
    fi
  fi
done
@@ -49,3 +49,6 @@
[submodule "third_party/gloo"]
path = third_party/gloo
url = https://github.com/facebookincubator/gloo.git
[submodule "third_party/HTTPRequest"]
path = third_party/HTTPRequest
url = https://github.com/elnormous/HTTPRequest
@@ -78,17 +78,19 @@ RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \
echo "-L/usr/local/mlsl/intel64/lib/thread -lmpi -I/usr/local/mlsl/intel64/include" > /mpicc_mlsl && \
chmod +x /mpicc_mlsl && \
echo "/mpirun_command_script" > /mpirun_command; \
else \
elif [[ ${MPI_KIND} == "MPICH" ]]; then \
apt-get install -y mpich && \
echo "mpirun -np 2" > /mpirun_command; \
fi

# Install mpi4py.
# Skipped entirely when the image is built with MPI_KIND=None (gloo-only
# images). For MLSL, point the build at the MLSL compiler wrapper first.
RUN if [[ ${MPI_KIND} != "None" ]]; then \
        if [[ ${MPI_KIND} == "MLSL" ]]; then \
            export I_MPI_ROOT=/usr/local/mlsl; \
            export MPICC=/usr/local/mlsl/intel64/bin/mpicc; \
        fi; \
        pip install mpi4py; \
    fi

### END OF CACHE ###
COPY . /horovod
@@ -58,7 +58,7 @@ RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \
wget -O /tmp/openmpi-3.0.0-bin.tar.gz https://github.com/horovod/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz && \
cd /usr/local && tar -zxf /tmp/openmpi-3.0.0-bin.tar.gz && ldconfig && \
echo "mpirun -allow-run-as-root -np 2 -H localhost:2 -bind-to none -map-by slot -mca mpi_abort_print_stack 1" > /mpirun_command; \
else \
elif [[ ${MPI_KIND} == "MPICH" ]]; then \
apt-get install -y mpich && \
echo "mpirun -np 2" > /mpirun_command; \
fi
@@ -67,7 +67,9 @@ RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf

# Install mpi4py.
# Skipped when the image is built with MPI_KIND=None (gloo-only images).
RUN if [[ ${MPI_KIND} != "None" ]]; then \
        pip install mpi4py; \
    fi

### END OF CACHE ###
COPY . /horovod
25 NOTICE
@@ -131,3 +131,28 @@
The derived work can be found in the files:

- examples/keras_spark_rossmann.py

elnormous/HTTPRequest
Copyright (c) 2017, Elviss Strazdiņš
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -116,6 +116,80 @@ services:
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
MXNET_PACKAGE: mxnet==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-openmpi-gloo-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-cpu-base
build:
args:
MPI_KIND: OpenMPI
PYTHON_VERSION: 2.7
TENSORFLOW_PACKAGE: tensorflow==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: torch==1.1.0
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp27-cp27mu-linux_x86_64.whl
MXNET_PACKAGE: mxnet==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-openmpi-gloo-py3_5-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-cpu-base
build:
args:
MPI_KIND: OpenMPI
PYTHON_VERSION: 3.5
TENSORFLOW_PACKAGE: tensorflow==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: torch==1.1.0
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp35-cp35m-linux_x86_64.whl
MXNET_PACKAGE: mxnet==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-openmpi-gloo-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-cpu-base
build:
args:
UBUNTU_VERSION: 18.04
MPI_KIND: OpenMPI
PYTHON_VERSION: 3.6
TENSORFLOW_PACKAGE: tensorflow==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: torch==1.1.0
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
MXNET_PACKAGE: mxnet==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-gloo-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-cpu-base
build:
args:
MPI_KIND: None
PYTHON_VERSION: 2.7
TENSORFLOW_PACKAGE: tensorflow==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: torch==1.1.0
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp27-cp27mu-linux_x86_64.whl
MXNET_PACKAGE: mxnet==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-gloo-py3_5-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-cpu-base
build:
args:
MPI_KIND: None
PYTHON_VERSION: 3.5
TENSORFLOW_PACKAGE: tensorflow==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: torch==1.1.0
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp35-cp35m-linux_x86_64.whl
MXNET_PACKAGE: mxnet==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-gloo-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-cpu-base
build:
args:
UBUNTU_VERSION: 18.04
MPI_KIND: None
PYTHON_VERSION: 3.6
TENSORFLOW_PACKAGE: tensorflow==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: torch==1.1.0
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
MXNET_PACKAGE: mxnet==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-cpu-openmpi-py2_7-tf2_0_0-keras2_2_4-torch1_1_0-mxnet1_5_0-pyspark2_4_0:
extends: test-cpu-base
build:
@@ -270,6 +344,56 @@ services:
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
MXNET_PACKAGE: mxnet-cu100==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-gpu-gloo-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-gpu-base
build:
args:
MPI_KIND: None
PYTHON_VERSION: 2.7
TENSORFLOW_PACKAGE: tensorflow-gpu==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: https://download.pytorch.org/whl/cu100/torch-1.1.0-cp27-cp27mu-linux_x86_64.whl
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp27-cp27mu-linux_x86_64.whl
MXNET_PACKAGE: mxnet-cu100==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-gpu-gloo-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-gpu-base
build:
args:
CUDA_DOCKER_VERSION: 10.0-devel-ubuntu18.04
MPI_KIND: None
PYTHON_VERSION: 3.6
TENSORFLOW_PACKAGE: tensorflow-gpu==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
MXNET_PACKAGE: mxnet-cu100==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-gpu-openmpi-gloo-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-gpu-base
build:
args:
MPI_KIND: OpenMPI
PYTHON_VERSION: 2.7
TENSORFLOW_PACKAGE: tensorflow-gpu==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: https://download.pytorch.org/whl/cu100/torch-1.1.0-cp27-cp27mu-linux_x86_64.whl
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp27-cp27mu-linux_x86_64.whl
MXNET_PACKAGE: mxnet-cu100==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-gpu-openmpi-gloo-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0:
extends: test-gpu-base
build:
args:
CUDA_DOCKER_VERSION: 10.0-devel-ubuntu18.04
MPI_KIND: OpenMPI
PYTHON_VERSION: 3.6
TENSORFLOW_PACKAGE: tensorflow-gpu==1.14.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
TORCHVISION_PACKAGE: https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
MXNET_PACKAGE: mxnet-cu100==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
test-gpu-openmpi-py2_7-tf2_0_0-keras2_2_4-torch1_1_0-mxnet1_5_0-pyspark2_4_0:
extends: test-gpu-base
build:
@@ -40,6 +40,11 @@ def init(self, comm=None):
atexit.register(self.shutdown)

if not isinstance(comm, list):
mpi_enabled = self.MPI_LIB_CTYPES.horovod_mpi_enabled()
if not bool(mpi_enabled):
raise ValueError(
'Horovod MPI is not enabled; Please make sure it\'s installed and enabled.')

from mpi4py import MPI
if MPI._sizeof(MPI.Comm) == ctypes.sizeof(ctypes.c_int):
MPI_Comm = ctypes.c_int
@@ -118,8 +123,35 @@ def mpi_threads_supported(self):
Returns:
A boolean value indicating whether MPI multi-threading is supported.
"""
mpi_enabled = self.MPI_LIB_CTYPES.horovod_mpi_enabled()
if not bool(mpi_enabled):
raise ValueError(
'Horovod MPI is not enabled; Please make sure it\'s installed and enabled.')

mpi_threads_supported = self.MPI_LIB_CTYPES.horovod_mpi_threads_supported()
if mpi_threads_supported == -1:
raise ValueError(
'Horovod has not been initialized; use hvd.init().')
return bool(mpi_threads_supported)

def gloo_enabled(self):
    """Report whether Gloo is enabled.

    When Gloo is enabled, it can be used for controller or data transfer
    operations.

    Returns:
        A boolean: the truth value of the flag returned by the library's
        ``horovod_gloo_enabled()`` call.
    """
    return bool(self.MPI_LIB_CTYPES.horovod_gloo_enabled())

def mpi_enabled(self):
    """Report whether MPI is enabled.

    When MPI is enabled, it can be used for controller or data transfer
    operations.

    Returns:
        A boolean: the truth value of the flag returned by the library's
        ``horovod_mpi_enabled()`` call.
    """
    return bool(self.MPI_LIB_CTYPES.horovod_mpi_enabled())
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.