-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
827 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
#!/bin/bash | ||
|
||
# exit immediately on failure, or if an undefined variable is used | ||
set -eu | ||
|
||
# our repository in AWS | ||
repository=823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite | ||
|
||
# list of all the tests | ||
# Each entry is passed to the docker-compose plugin as the service to build, | ||
# push and run (config: docker-compose.test.yml). The name appears to encode | ||
# the configuration under test: device (cpu/gpu/mixed), MPI flavour, Python | ||
# version, then the TensorFlow, Keras, PyTorch, MXNet and PySpark versions. | ||
# The run loop further down pattern-matches on these names ("-cpu-", | ||
# "tf1_1_0", "tf1_6_0"), so keep the naming scheme intact when adding entries. | ||
tests=( \ | ||
test-cpu-openmpi-py2_7-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2 \ | ||
test-cpu-openmpi-py3_5-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2 \ | ||
test-cpu-openmpi-py3_6-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2 \ | ||
test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \ | ||
test-cpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \ | ||
test-cpu-openmpi-py3_6-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \ | ||
test-cpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \ | ||
test-cpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \ | ||
test-cpu-openmpi-py3_6-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \ | ||
test-cpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0 \ | ||
test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0 \ | ||
test-cpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \ | ||
test-gpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \ | ||
test-gpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2 \ | ||
test-gpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \ | ||
test-gpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \ | ||
test-gpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0 \ | ||
test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0 \ | ||
test-gpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \ | ||
test-mixed-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \ | ||
) | ||
|
||
build_test() { | ||
local test=$1 | ||
|
||
echo "- label: ':docker: Build ${test}'" | ||
echo " plugins:" | ||
echo " - docker-compose#6b0df8a98ff97f42f4944dbb745b5b8cbf04b78c:" | ||
echo " build: ${test}" | ||
echo " image-repository: ${repository}" | ||
echo " cache-from: ${test}:${repository}:${BUILDKITE_PIPELINE_SLUG}-${test}-latest" | ||
echo " config: docker-compose.test.yml" | ||
echo " push-retries: 3" | ||
echo " - ecr#v1.2.0:" | ||
echo " login: true" | ||
echo " timeout_in_minutes: 15" | ||
echo " retry:" | ||
echo " automatic: true" | ||
echo " agents:" | ||
echo " queue: cpu" | ||
} | ||
|
||
cache_test() { | ||
local test=$1 | ||
|
||
echo "- label: ':docker: Update ${BUILDKITE_PIPELINE_SLUG}-${test}-latest'" | ||
echo " plugins:" | ||
echo " - docker-compose#v2.6.0:" | ||
echo " push: ${test}:${repository}:${BUILDKITE_PIPELINE_SLUG}-${test}-latest" | ||
echo " config: docker-compose.test.yml" | ||
echo " push-retries: 3" | ||
echo " - ecr#v1.2.0:" | ||
echo " login: true" | ||
echo " timeout_in_minutes: 5" | ||
echo " retry:" | ||
echo " automatic: true" | ||
echo " agents:" | ||
echo " queue: cpu" | ||
} | ||
|
||
#######################################
# Emit the Buildkite step that runs one command inside a test image.
# The step uses the docker-compose plugin to run the given service from
# docker-compose.test.yml and the ecr plugin to log in to the registry first.
# Arguments: $1 - docker-compose service name of the test image
#            $2 - agent queue the step is scheduled on
#            $3 - step label (placed inside single quotes in the YAML)
#            $4 - command line to run; written to the YAML verbatim
# Outputs:   one YAML list item (a pipeline step) on stdout
#######################################
run_test() {
  local test=$1
  local queue=$2
  local label=$3
  local command=$4

  # The heredoc body is the YAML itself: its leading spaces are significant.
  # Expanded values are inserted as-is; the shell does not re-scan them, so a
  # literal "\$(...)" inside the command reaches the YAML unchanged.
  cat <<EOF
- label: '${label}'
  command: ${command}
  plugins:
  - docker-compose#v2.6.0:
      run: ${test}
      config: docker-compose.test.yml
      pull-retries: 3
  - ecr#v1.2.0:
      login: true
  timeout_in_minutes: 5
  retry:
    automatic: true
  agents:
    queue: ${queue}
EOF
}
|
||
# Top-level driver: writes the complete pipeline definition to stdout.
# Order of emitted steps: one build step per test image, a wait barrier,
# optional cache-update steps, then the run steps for every test image.

# begin the pipeline.yml file
echo "steps:"

# build every test container
# (array expansions are quoted so each name stays exactly one word)
for test in "${tests[@]}"; do
  build_test "${test}"
done

# wait for all builds to finish
echo "- wait"

# cache test containers if built from master
# BUILDKITE_BRANCH is read from the environment; the check is currently keyed
# on the "buildkite" branch.
# TODO: replace with master
if [[ "${BUILDKITE_BRANCH}" == "buildkite" ]]; then
  for test in "${tests[@]}"; do
    cache_test "${test}"
  done
fi

# run all the tests
for test in "${tests[@]}"; do
  # names containing "-cpu-" go to the cpu queue; everything else
  # (gpu and mixed images) goes to the gpu queue
  if [[ "${test}" == *-cpu-* ]]; then
    queue=cpu
  else
    queue=gpu
  fi

  # In the commands below, \\\$ leaves a literal \$ in the generated YAML, so
  # $(cat /mpirun_command) is not expanded while this script runs.
  run_test "${test}" "${queue}" \
    ":pytest: Run PyTests (${test})" \
    "bash -c \"cd /horovod/test && (echo test_*.py | xargs -n 1 \\\$(cat /mpirun_command) pytest -v --capture=no)\""

  run_test "${test}" "${queue}" \
    ":muscle: Test TensorFlow MNIST (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/tensorflow_mnist.py\""

  # the eager example is skipped for the tf1_1_0 and tf1_6_0 images
  if [[ "${test}" != *"tf1_1_0"* && "${test}" != *"tf1_6_0"* ]]; then
    run_test "${test}" "${queue}" \
      ":muscle: Test TensorFlow Eager MNIST (${test})" \
      "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/tensorflow_mnist_eager.py\""
  fi

  run_test "${test}" "${queue}" \
    ":muscle: Test Keras MNIST (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/keras_mnist_advanced.py\""

  run_test "${test}" "${queue}" \
    ":muscle: Test PyTorch MNIST (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/pytorch_mnist.py\""

  run_test "${test}" "${queue}" \
    ":muscle: Test MXNet MNIST (${test})" \
    "bash -c \"OMP_NUM_THREADS=1 \\\$(cat /mpirun_command) python /horovod/examples/mxnet_mnist.py\""
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
ARG UBUNTU_VERSION=16.04 | ||
FROM ubuntu:${UBUNTU_VERSION} | ||
|
||
# Arguments for the build. UBUNTU_VERSION needs to be repeated because | ||
# the first usage only applies to the FROM tag. | ||
ARG UBUNTU_VERSION=16.04 | ||
ARG MPI_KIND=OpenMPI | ||
ARG PYTHON_VERSION=2.7 | ||
ARG TENSORFLOW_PACKAGE=tensorflow==1.12.0 | ||
ARG KERAS_PACKAGE=keras==2.2.2 | ||
ARG PYTORCH_PACKAGE=torch==1.0.0 | ||
ARG MXNET_PACKAGE=mxnet-gcc5 | ||
ARG PYSPARK_PACKAGE=pyspark==2.4.0 | ||
|
||
# Set default shell to /bin/bash | ||
SHELL ["/bin/bash", "-cu"] | ||
|
||
# Install essential packages.
# The index refresh and the install share one RUN so that a cached
# "apt-get update" layer is never reused with a changed package list.
# Later RUN steps that call apt-get install reuse the index fetched here.
RUN apt-get update -qq && \
    apt-get install -y wget openssh-client git build-essential
|
||
# Install Python. | ||
RUN if [[ "${PYTHON_VERSION}" == "3.6" ]]; then \ | ||
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-distutils; \ | ||
else \ | ||
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev; \ | ||
fi | ||
RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python | ||
RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py | ||
RUN pip install -U --force pip setuptools requests pytest | ||
|
||
# Install PySpark.
# openjdk-8-jdk-headless is installed first, presumably as the Java runtime
# PySpark needs. apt-get is used, as in the rest of this file, because the
# apt front end does not offer a stable command-line interface for scripts.
RUN apt-get install -y openjdk-8-jdk-headless
RUN pip install ${PYSPARK_PACKAGE}
|
||
# Install MPI. | ||
RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \ | ||
wget -O /tmp/openmpi-3.0.0-bin.tar.gz https://github.com/uber/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz && \ | ||
cd /usr/local && tar -zxf /tmp/openmpi-3.0.0-bin.tar.gz && ldconfig && \ | ||
echo "mpirun -allow-run-as-root -np 2 -H localhost:2 -bind-to none -map-by slot -mca mpi_abort_print_stack 1" > /mpirun_command; \ | ||
else \ | ||
apt-get install -y mpich && \ | ||
echo "mpirun -np 2" > /mpirun_command; \ | ||
fi | ||
|
||
# Install TensorFlow. | ||
RUN pip install ${TENSORFLOW_PACKAGE} | ||
|
||
# Install Keras. | ||
RUN pip install ${KERAS_PACKAGE} h5py scipy pandas | ||
RUN mkdir -p ~/.keras | ||
RUN python -c "from keras.datasets import mnist; mnist.load_data()" | ||
|
||
# Install PyTorch. | ||
RUN if [[ ${PYTORCH_PACKAGE} == "torch-nightly" ]]; then \ | ||
pip install torch_nightly -v -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html && \ | ||
pip install torchvision_nightly; \ | ||
else \ | ||
pip install ${PYTORCH_PACKAGE} torchvision; \ | ||
fi | ||
RUN pip install future typing | ||
|
||
# Install MXNet. | ||
RUN pip install ${MXNET_PACKAGE} | ||
|
||
# Install Horovod. | ||
COPY . /horovod | ||
RUN cd /horovod && python setup.py sdist | ||
RUN pip install -v /horovod/dist/horovod-*.tar.gz | ||
|
||
# Hack for compatibility of MNIST example with TensorFlow 1.1.0. | ||
RUN if [[ ${TENSORFLOW_PACKAGE} == "tensorflow==1.1.0" ]]; then \ | ||
sed -i "s/from tensorflow import keras/from tensorflow.contrib import keras/" /horovod/examples/tensorflow_mnist.py; \ | ||
fi | ||
|
||
# Hack TensorFlow MNIST example to be smaller. | ||
RUN sed -i "s/last_step=20000/last_step=100/" /horovod/examples/tensorflow_mnist.py | ||
|
||
# Hack TensorFlow Eager MNIST example to be smaller. | ||
RUN sed -i "s/dataset.take(20000/dataset.take(100/" /horovod/examples/tensorflow_mnist_eager.py | ||
|
||
# Hack Keras MNIST advanced example to be smaller. | ||
RUN sed -i "s/epochs = .*/epochs = 9/" /horovod/examples/keras_mnist_advanced.py | ||
RUN sed -i "s/model.add(Conv2D(32, kernel_size=(3, 3),/model.add(Conv2D(1, kernel_size=(3, 3),/" /horovod/examples/keras_mnist_advanced.py | ||
RUN sed -i "s/model.add(Conv2D(64, (3, 3), activation='relu'))//" /horovod/examples/keras_mnist_advanced.py | ||
|
||
# Hack PyTorch MNIST example to be smaller. | ||
RUN sed -i "s/'--epochs', type=int, default=10,/'--epochs', type=int, default=2,/" /horovod/examples/pytorch_mnist.py | ||
RUN sed -i "s/self.fc1 = nn.Linear(320, 50)/self.fc1 = nn.Linear(784, 50)/" /horovod/examples/pytorch_mnist.py | ||
RUN sed -i "s/x = F.relu(F.max_pool2d(self.conv1(x), 2))//" /horovod/examples/pytorch_mnist.py | ||
RUN sed -i "s/x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))//" /horovod/examples/pytorch_mnist.py | ||
RUN sed -i "s/x = x.view(-1, 320)/x = x.view(-1, 784)/" /horovod/examples/pytorch_mnist.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
ARG CUDA_DOCKER_VERSION=9.0-devel-ubuntu16.04 | ||
FROM nvidia/cuda:${CUDA_DOCKER_VERSION} | ||
|
||
# Arguments for the build. CUDA_DOCKER_VERSION needs to be repeated because | ||
# the first usage only applies to the FROM tag. | ||
ARG CUDA_DOCKER_VERSION=9.0-devel-ubuntu16.04 | ||
ARG CUDNN_VERSION=7.4.1.5-1+cuda9.0 | ||
ARG NCCL_VERSION_OVERRIDE=2.4.2-1+cuda9.0 | ||
ARG MPI_KIND=OpenMPI | ||
ARG PYTHON_VERSION=2.7 | ||
ARG TENSORFLOW_PACKAGE=tensorflow-gpu==1.12.0 | ||
ARG KERAS_PACKAGE=keras==2.2.2 | ||
ARG PYTORCH_PACKAGE=torch==1.0.0 | ||
ARG MXNET_PACKAGE=https://s3-us-west-2.amazonaws.com/mxnet-python-packages-gcc5/mxnet_cu90_gcc5-1.4.0-py2.py3-none-manylinux1_x86_64.whl | ||
ARG PYSPARK_PACKAGE=pyspark==2.4.0 | ||
ARG HOROVOD_BUILD_FLAGS=HOROVOD_GPU_ALLREDUCE=NCCL | ||
|
||
# Set default shell to /bin/bash | ||
SHELL ["/bin/bash", "-cu"] | ||
|
||
# Install essential packages.
# The index refresh and the install share one RUN so that a cached
# "apt-get update" layer is never reused with a changed package list.
# cuDNN and NCCL are pinned to the versions given by the build arguments;
# --allow-downgrades / --allow-change-held-packages let apt apply those pins.
RUN apt-get update -qq && \
    apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
        wget \
        openssh-client \
        git \
        build-essential \
        libcudnn7=${CUDNN_VERSION} \
        libnccl2=${NCCL_VERSION_OVERRIDE} \
        libnccl-dev=${NCCL_VERSION_OVERRIDE}
|
||
# Install Python. | ||
RUN if [[ ${PYTHON_VERSION} == "3.6" ]]; then \ | ||
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-distutils; \ | ||
else \ | ||
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev; \ | ||
fi | ||
RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python | ||
RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py | ||
RUN pip install -U --force pip setuptools requests pytest | ||
|
||
# Install PySpark.
# openjdk-8-jdk-headless is installed first, presumably as the Java runtime
# PySpark needs. apt-get is used, as in the rest of this file, because the
# apt front end does not offer a stable command-line interface for scripts.
RUN apt-get install -y openjdk-8-jdk-headless
RUN pip install ${PYSPARK_PACKAGE}
|
||
# Install MPI. | ||
RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \ | ||
wget -O /tmp/openmpi-3.0.0-bin.tar.gz https://github.com/uber/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz && \ | ||
cd /usr/local && tar -zxf /tmp/openmpi-3.0.0-bin.tar.gz && ldconfig && \ | ||
echo "mpirun -allow-run-as-root -np 2 -H localhost:2 -bind-to none -map-by slot -mca mpi_abort_print_stack 1" > /mpirun_command; \ | ||
else \ | ||
apt-get install -y mpich && \ | ||
echo "mpirun -np 2" > /mpirun_command; \ | ||
fi | ||
|
||
# Set default NCCL parameters | ||
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf | ||
|
||
# Install TensorFlow. | ||
RUN pip install ${TENSORFLOW_PACKAGE} | ||
|
||
# Install Keras. | ||
RUN pip install ${KERAS_PACKAGE} h5py scipy pandas | ||
RUN mkdir -p ~/.keras | ||
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \ | ||
python -c "from keras.datasets import mnist; mnist.load_data()" && \ | ||
ldconfig | ||
|
||
# Install PyTorch. | ||
RUN if [[ ${PYTORCH_PACKAGE} == "torch-nightly" ]]; then \ | ||
PYTORCH_CUDA=$(echo ${CUDA_DOCKER_VERSION} | awk -F- '{print $1}' | sed 's/\.//'); \ | ||
pip install torch_nightly -v -f https://download.pytorch.org/whl/nightly/cu${PYTORCH_CUDA}/torch_nightly.html && \ | ||
pip install torchvision_nightly; \ | ||
else \ | ||
pip install ${PYTORCH_PACKAGE} torchvision; \ | ||
fi | ||
RUN pip install future typing | ||
|
||
# Install MXNet. | ||
RUN pip install ${MXNET_PACKAGE} | ||
|
||
# Install Horovod. | ||
COPY . /horovod | ||
RUN cd /horovod && python setup.py sdist | ||
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \ | ||
bash -c "${HOROVOD_BUILD_FLAGS} pip install -v /horovod/dist/horovod-*.tar.gz" && \ | ||
ldconfig | ||
|
||
# Hack for compatibility of MNIST example with TensorFlow 1.1.0. | ||
RUN if [[ ${TENSORFLOW_PACKAGE} == "tensorflow-gpu==1.1.0" ]]; then \ | ||
sed -i "s/from tensorflow import keras/from tensorflow.contrib import keras/" /horovod/examples/tensorflow_mnist.py; \ | ||
fi | ||
|
||
# Hack TensorFlow MNIST example to be smaller. | ||
RUN sed -i "s/last_step=20000/last_step=100/" /horovod/examples/tensorflow_mnist.py | ||
|
||
# Hack TensorFlow Eager MNIST example to be smaller. | ||
RUN sed -i "s/dataset.take(20000/dataset.take(100/" /horovod/examples/tensorflow_mnist_eager.py | ||
|
||
# Hack Keras MNIST advanced example to be smaller. | ||
RUN sed -i "s/epochs = .*/epochs = 9/" /horovod/examples/keras_mnist_advanced.py | ||
|
||
# Hack PyTorch MNIST example to be smaller. | ||
RUN sed -i "s/'--epochs', type=int, default=10,/'--epochs', type=int, default=2,/" /horovod/examples/pytorch_mnist.py |
Oops, something went wrong.