Skip to content
Permalink
Browse files

Add Buildkite CI support (#984)

* Buildkite support

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Incorporate recent changes

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Bugfix env variable

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Improved update-alternatives

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Move cache cutover to a point right before framework installation

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Bump build timeout to 20 minutes

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Increase # of push retries to 5

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Fix cpu-gpu error tests

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Bump timeout further

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Downgrade NCCL to 2.3.7

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Add more exclusions to TensorFlow mixed mode tests

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Pin tf-nightly to a version before the breaking change

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Update cache condition

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Replace torchvision_nightly with torchvision

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Add Pillow for torchvision

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Fix test_horovod_allreduce_multi_gpu

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>
  • Loading branch information...
alsrgv committed Apr 4, 2019
1 parent af11495 commit f601a3a0da458b6ff451d65b1b92296a3b1735b0
@@ -0,0 +1,154 @@
#!/bin/bash

# Abort on the first failing command and on any reference to an unset variable.
set -eu

# Image repository (AWS ECR) that built test images are pushed to and cached in.
repository='823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite'

# Every test configuration to build and run. Each name is passed to the
# docker-compose plugin as the service to build/run from docker-compose.test.yml.
# Order is significant: pipeline steps are emitted in this order.
tests=()

# CPU configurations.
tests+=(
  test-cpu-openmpi-py2_7-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2
  test-cpu-openmpi-py3_5-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2
  test-cpu-openmpi-py3_6-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2
  test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-cpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-cpu-openmpi-py3_6-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-cpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-cpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-cpu-openmpi-py3_6-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-cpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0
  test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0
  test-cpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
)

# GPU configurations.
tests+=(
  test-gpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-gpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-gpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-gpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-gpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0
  test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0
  test-gpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
)

# Mixed configuration.
tests+=(
  test-mixed-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
)

#######################################
# Emit the Buildkite step that builds (and pushes) one test image.
# Globals:
#   repository              (read) image repository to push to / cache from
#   BUILDKITE_PIPELINE_SLUG (read) used in the name of the "-latest" cache tag
# Arguments:
#   $1 - docker-compose service name of the test image
# Outputs:
#   One YAML sequence item (a pipeline step) on stdout.
#######################################
build_test() {
  local test=$1

  # Indentation is significant in the emitted YAML: step attributes align with
  # "label", and each plugin's options are nested below the plugin key.
  printf '%s\n' \
    "- label: ':docker: Build ${test}'" \
    "  plugins:" \
    "  - docker-compose#6b0df8a98ff97f42f4944dbb745b5b8cbf04b78c:" \
    "      build: ${test}" \
    "      image-repository: ${repository}" \
    "      cache-from: ${test}:${repository}:${BUILDKITE_PIPELINE_SLUG}-${test}-latest" \
    "      config: docker-compose.test.yml" \
    "      push-retries: 5" \
    "  - ecr#v1.2.0:" \
    "      login: true" \
    "  timeout_in_minutes: 30" \
    "  retry:" \
    "    automatic: true" \
    "  agents:" \
    "    queue: cpu"
}

#######################################
# Emit the Buildkite step that pushes one test image under its "-latest" tag,
# the same tag the build step names in cache-from.
# Globals:
#   repository              (read) image repository to push to
#   BUILDKITE_PIPELINE_SLUG (read) used in the name of the "-latest" tag
# Arguments:
#   $1 - docker-compose service name of the test image
# Outputs:
#   One YAML sequence item (a pipeline step) on stdout.
#######################################
cache_test() {
  local test=$1

  # Indentation is significant in the emitted YAML: step attributes align with
  # "label", and each plugin's options are nested below the plugin key.
  printf '%s\n' \
    "- label: ':docker: Update ${BUILDKITE_PIPELINE_SLUG}-${test}-latest'" \
    "  plugins:" \
    "  - docker-compose#v2.6.0:" \
    "      push: ${test}:${repository}:${BUILDKITE_PIPELINE_SLUG}-${test}-latest" \
    "      config: docker-compose.test.yml" \
    "      push-retries: 3" \
    "  - ecr#v1.2.0:" \
    "      login: true" \
    "  timeout_in_minutes: 5" \
    "  retry:" \
    "    automatic: true" \
    "  agents:" \
    "    queue: cpu"
}

#######################################
# Emit the Buildkite step that runs one command inside a test image.
# Arguments:
#   $1 - docker-compose service name of the test image
#   $2 - agent queue the step is scheduled on
#   $3 - step label (emitted inside single quotes)
#   $4 - command line to run; emitted verbatim as the "command" value
# Outputs:
#   One YAML sequence item (a pipeline step) on stdout.
#######################################
run_test() {
  local test=$1
  local queue=$2
  local label=$3
  local command=$4

  # Indentation is significant in the emitted YAML: step attributes align with
  # "label", and each plugin's options are nested below the plugin key.
  printf '%s\n' \
    "- label: '${label}'" \
    "  command: ${command}" \
    "  plugins:" \
    "  - docker-compose#v2.6.0:" \
    "      run: ${test}" \
    "      config: docker-compose.test.yml" \
    "      pull-retries: 3" \
    "  - ecr#v1.2.0:" \
    "      login: true" \
    "  timeout_in_minutes: 5" \
    "  retry:" \
    "    automatic: true" \
    "  agents:" \
    "    queue: ${queue}"
}

# Pipeline driver: writes the complete pipeline definition to stdout in three
# phases (build all images, optionally refresh the cache tags, run the tests).

# begin the pipeline.yml file
echo "steps:"

# build every test container
# (array expansions are quoted so each name stays exactly one word)
for test in "${tests[@]}"; do
  build_test "${test}"
done

# wait for all builds to finish
echo "- wait"

# cache test containers if built from master
if [[ "${BUILDKITE_BRANCH}" == "master" ]]; then
  for test in "${tests[@]}"; do
    cache_test "${test}"
  done
fi

# run all the tests
for test in "${tests[@]}"; do
  # only names containing "-cpu-" go to the cpu queue; everything else
  # (gpu and mixed) is scheduled on the gpu queue
  if [[ ${test} == *-cpu-* ]]; then
    queue=cpu
  else
    queue=gpu
  fi

  # \\\$ emits a literal "\$" so that $(cat /mpirun_command) is not expanded
  # while generating the pipeline; the file is read when the step runs.
  run_test "${test}" "${queue}" \
    ":pytest: Run PyTests (${test})" \
    "bash -c \"cd /horovod/test && (echo test_*.py | xargs -n 1 \\\$(cat /mpirun_command) pytest -v --capture=no)\""

  run_test "${test}" "${queue}" \
    ":muscle: Test TensorFlow MNIST (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/tensorflow_mnist.py\""

  # the eager example is skipped for the tf1_1_0 and tf1_6_0 configurations
  if [[ ${test} != *"tf1_1_0"* && ${test} != *"tf1_6_0"* ]]; then
    run_test "${test}" "${queue}" \
      ":muscle: Test TensorFlow Eager MNIST (${test})" \
      "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/tensorflow_mnist_eager.py\""
  fi

  run_test "${test}" "${queue}" \
    ":muscle: Test Keras MNIST (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/keras_mnist_advanced.py\""

  run_test "${test}" "${queue}" \
    ":muscle: Test PyTorch MNIST (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/pytorch_mnist.py\""

  run_test "${test}" "${queue}" \
    ":muscle: Test MXNet MNIST (${test})" \
    "bash -c \"OMP_NUM_THREADS=1 \\\$(cat /mpirun_command) python /horovod/examples/mxnet_mnist.py\""

  run_test "${test}" "${queue}" \
    ":muscle: Test Stall (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/test/test_stall.py\""

  # horovodrun is only exercised on the OpenMPI configurations
  if [[ ${test} == *"openmpi"* ]]; then
    run_test "${test}" "${queue}" \
      ":muscle: Test Horovodrun (${test})" \
      "horovodrun -np 2 -H localhost:2 python /horovod/examples/tensorflow_mnist.py"
  fi
done
@@ -38,7 +38,7 @@ env:
- TF_PACKAGE=tensorflow==1.1.0 KERAS_PACKAGE=keras==2.0.0 PYTORCH_PACKAGE=torch==0.4.0 MXNET_PACKAGE=mxnet==1.4.0.post0 MPI=OpenMPI PYSPARK=2.1.2
- TF_PACKAGE=tensorflow==1.6.0 KERAS_PACKAGE=keras==2.1.2 PYTORCH_PACKAGE=torch==0.4.1 MXNET_PACKAGE=mxnet==1.4.0.post0 MPI=OpenMPI PYSPARK=2.3.2
- TF_PACKAGE=tensorflow==1.12.0 KERAS_PACKAGE=keras==2.2.2 PYTORCH_PACKAGE=torch==1.0.0 MXNET_PACKAGE=mxnet==1.4.0.post0 MPI=OpenMPI PYSPARK=2.4.0
- TF_PACKAGE=tf-nightly KERAS_PACKAGE=git+https://github.com/keras-team/keras.git PYTORCH_PACKAGE=torch-nightly MXNET_PACKAGE=mxnet==1.4.0.post0 MPI=OpenMPI PYSPARK=2.4.0
- TF_PACKAGE=tf-nightly==1.14.1.dev20190401 KERAS_PACKAGE=git+https://github.com/keras-team/keras.git PYTORCH_PACKAGE=torch-nightly MXNET_PACKAGE=mxnet==1.4.0.post0 MPI=OpenMPI PYSPARK=2.4.0
- TF_PACKAGE=tensorflow==1.12.0 KERAS_PACKAGE=keras==2.2.2 PYTORCH_PACKAGE=torch==1.0.0 MXNET_PACKAGE=mxnet==1.4.0.post0 MPI=MPICH PYSPARK=2.4.0

matrix:
@@ -53,7 +53,7 @@ matrix:
- python: "3.6"
env: TF_PACKAGE=tensorflow==1.12.0 KERAS_PACKAGE=keras==2.2.2 PYTORCH_PACKAGE=torch==1.0.0 MXNET_PACKAGE=mxnet==1.4.0.post0 MPI=MPICH PYSPARK=2.4.0
- python: "3.5"
env: TF_PACKAGE=tf-nightly KERAS_PACKAGE=git+https://github.com/keras-team/keras.git PYTORCH_PACKAGE=torch-nightly MXNET_PACKAGE=mxnet==1.4.0.post0 MPI=OpenMPI PYSPARK=2.4.0
env: TF_PACKAGE=tf-nightly==1.14.1.dev20190401 KERAS_PACKAGE=git+https://github.com/keras-team/keras.git PYTORCH_PACKAGE=torch-nightly MXNET_PACKAGE=mxnet==1.4.0.post0 MPI=OpenMPI PYSPARK=2.4.0

install:
- |
@@ -79,12 +79,11 @@ install:
# PyTorch
- |
if [[ ${PYTORCH_PACKAGE} == "torch-nightly" ]]; then
docker exec ${CONTAINER} /bin/sh -c "pip install torchvision"
docker exec ${CONTAINER} /bin/sh -c "pip uninstall -y torch"
docker exec ${CONTAINER} /bin/sh -c "pip install torch_nightly -v -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html"
else
docker exec ${CONTAINER} /bin/sh -c "pip install ${PYTORCH_PACKAGE} torchvision"
docker exec ${CONTAINER} /bin/sh -c "pip install ${PYTORCH_PACKAGE}"
fi
docker exec ${CONTAINER} /bin/sh -c "pip install torchvision Pillow --no-deps"
# MXNet
- docker exec ${CONTAINER} /bin/sh -c "pip install ${MXNET_PACKAGE}"
@@ -0,0 +1,122 @@
# Image definition: installs Python, PySpark, MPI, the ML frameworks selected
# by the build arguments below, and Horovod built from the build context.
ARG UBUNTU_VERSION=16.04
FROM ubuntu:${UBUNTU_VERSION}

# Arguments for the build. UBUNTU_VERSION needs to be repeated because
# the first usage only applies to the FROM tag.
ARG UBUNTU_VERSION=16.04
ARG MPI_KIND=OpenMPI
ARG PYTHON_VERSION=2.7
ARG TENSORFLOW_PACKAGE=tensorflow==1.12.0
ARG KERAS_PACKAGE=keras==2.2.2
ARG PYTORCH_PACKAGE=torch==1.0.0
ARG MXNET_PACKAGE=mxnet==1.4.0.post0
ARG PYSPARK_PACKAGE=pyspark==2.4.0

# Set default shell to /bin/bash
# The [[ ... ]] tests in later RUN steps need bash; -u additionally makes a
# reference to an unset variable an error.
SHELL ["/bin/bash", "-cu"]

# Install essential packages.
# The package index refresh and the install share one RUN so that a cached
# "apt-get update" layer can never be paired with a newer install list.
# The downloaded index is deliberately left in the image: later steps run
# "apt-get install" without refreshing it again.
RUN apt-get update -qq && \
    apt-get install -y --no-install-recommends \
        wget \
        ca-certificates \
        openssh-client \
        git \
        build-essential \
        gcc-4.9 \
        g++-4.9 \
        gcc-4.9-base \
        software-properties-common

# Install Python.
# For 3.6 the deadsnakes PPA is added (and the package index refreshed) first;
# other versions are installed from the apt sources already configured.
RUN if [[ "${PYTHON_VERSION}" == "3.6" ]]; then \
add-apt-repository ppa:deadsnakes/ppa && apt-get update -qq; \
fi
RUN apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev
# Make the selected interpreter the default "python" (-f replaces any existing link).
RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python
# Bootstrap pip for that interpreter, then upgrade the packaging tools and
# install the test runner.
RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py
RUN pip install -U --force pip setuptools requests pytest

# Install PySpark.
# The JDK is installed with apt-get, like every other package in this file;
# the "apt" front end does not offer a stable command-line interface for scripts.
RUN apt-get install -y openjdk-8-jdk-headless
RUN pip install ${PYSPARK_PACKAGE}

# Install MPI.
# Either unpacks a prebuilt OpenMPI 3.0.0 tarball under /usr/local or installs
# MPICH from apt. In both cases the launcher prefix (2 processes) is written to
# /mpirun_command, so a caller can prefix a command with $(cat /mpirun_command)
# without knowing which MPI flavour the image carries.
RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \
wget -O /tmp/openmpi-3.0.0-bin.tar.gz https://github.com/uber/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz && \
cd /usr/local && tar -zxf /tmp/openmpi-3.0.0-bin.tar.gz && ldconfig && \
echo "mpirun -allow-run-as-root -np 2 -H localhost:2 -bind-to none -map-by slot -mca mpi_abort_print_stack 1" > /mpirun_command; \
else \
apt-get install -y mpich && \
echo "mpirun -np 2" > /mpirun_command; \
fi

# Install mpi4py.
RUN pip install mpi4py

### END OF CACHE ###
# The build context is copied in here, so every layer from this point on is
# rebuilt whenever any file in the context changes; the layers above depend
# only on the base image and the build arguments.
COPY . /horovod

# Install TensorFlow.
RUN pip install ${TENSORFLOW_PACKAGE}

# Install Keras.
RUN pip install ${KERAS_PACKAGE} h5py scipy pandas
RUN mkdir -p ~/.keras
# NOTE(review): presumably run at build time so the MNIST data is already
# stored in the image when the Keras example runs - confirm.
RUN python -c "from keras.datasets import mnist; mnist.load_data()"

# Install PyTorch.
# "torch-nightly" is a sentinel value: it selects the CPU nightly wheel from
# the PyTorch nightly index instead of a regular package spec.
RUN if [[ ${PYTORCH_PACKAGE} == "torch-nightly" ]]; then \
pip install torch_nightly -v -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; \
else \
pip install ${PYTORCH_PACKAGE}; \
fi
# NOTE(review): --no-deps presumably keeps torchvision from pulling in its own
# torch over the build selected above; Pillow is then listed explicitly - confirm.
RUN pip install torchvision Pillow --no-deps
RUN pip install future typing

# Install MXNet.
RUN pip install ${MXNET_PACKAGE}

# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet.
# Backup existing GCC installation as priority 100, so that it can be recovered later.
# The first RUN registers the currently installed compilers (resolved through
# their symlinks) for all four command names; the second registers 4.9 for the
# same names with the higher priority.
RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \
update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200

# Install Horovod.
# Built as an sdist from the copied sources and installed from that tarball,
# while the GCC 4.9 pin above is still in effect.
RUN cd /horovod && python setup.py sdist
RUN pip install -v /horovod/dist/horovod-*.tar.gz

# Remove GCC pinning
# Only the priority-200 entries are removed; the priority-100 entries
# registered earlier stay in place.
RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \
update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \
update-alternatives --remove g++ /usr/bin/g++-4.9 && \
update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9

# The steps below patch the copied example scripts in place with sed.
# NOTE(review): sed exits successfully even when a pattern matches nothing, so
# if an example's source text changes, the matching hack silently stops applying.

# Hack for compatibility of MNIST example with TensorFlow 1.1.0.
RUN if [[ ${TENSORFLOW_PACKAGE} == "tensorflow==1.1.0" ]]; then \
sed -i "s/from tensorflow import keras/from tensorflow.contrib import keras/" /horovod/examples/tensorflow_mnist.py; \
fi

# Hack TensorFlow MNIST example to be smaller.
RUN sed -i "s/last_step=20000/last_step=100/" /horovod/examples/tensorflow_mnist.py

# Hack TensorFlow Eager MNIST example to be smaller.
RUN sed -i "s/dataset.take(20000/dataset.take(100/" /horovod/examples/tensorflow_mnist_eager.py

# Hack Keras MNIST advanced example to be smaller.
# Sets the epoch count to 9, shrinks the first Conv2D to a single filter and
# deletes the second Conv2D layer.
RUN sed -i "s/epochs = .*/epochs = 9/" /horovod/examples/keras_mnist_advanced.py
RUN sed -i "s/model.add(Conv2D(32, kernel_size=(3, 3),/model.add(Conv2D(1, kernel_size=(3, 3),/" /horovod/examples/keras_mnist_advanced.py
RUN sed -i "s/model.add(Conv2D(64, (3, 3), activation='relu'))//" /horovod/examples/keras_mnist_advanced.py

# Hack PyTorch MNIST example to be smaller.
# Lowers the default epoch count to 2 and removes both convolution/pooling
# lines from the forward pass; fc1 and the reshape are widened from 320 to 784
# to match (presumably the 28*28 raw input pixels - confirm).
RUN sed -i "s/'--epochs', type=int, default=10,/'--epochs', type=int, default=2,/" /horovod/examples/pytorch_mnist.py
RUN sed -i "s/self.fc1 = nn.Linear(320, 50)/self.fc1 = nn.Linear(784, 50)/" /horovod/examples/pytorch_mnist.py
RUN sed -i "s/x = F.relu(F.max_pool2d(self.conv1(x), 2))//" /horovod/examples/pytorch_mnist.py
RUN sed -i "s/x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))//" /horovod/examples/pytorch_mnist.py
RUN sed -i "s/x = x.view(-1, 320)/x = x.view(-1, 784)/" /horovod/examples/pytorch_mnist.py
Oops, something went wrong.

0 comments on commit f601a3a

Please sign in to comment.
You can’t perform that action at this time.