Skip to content

Commit

Permalink
Buildkite support
Browse files Browse the repository at this point in the history
  • Loading branch information
alsrgv committed Feb 23, 2019
1 parent ca3c1e3 commit a5fb7a2
Show file tree
Hide file tree
Showing 11 changed files with 827 additions and 84 deletions.
145 changes: 145 additions & 0 deletions .buildkite/gen-pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#!/bin/bash

# Abort as soon as a command fails or an unset variable is referenced.
set -o errexit -o nounset

# Image repository (AWS ECR) used for the test containers.
repository=823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite

# Every test configuration, one docker-compose service name per entry.
tests=(
  test-cpu-openmpi-py2_7-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2
  test-cpu-openmpi-py3_5-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2
  test-cpu-openmpi-py3_6-tf1_1_0-keras2_0_0-torch0_4_0-mxnet1_4_0-pyspark2_1_2
  test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-cpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-cpu-openmpi-py3_6-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-cpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-cpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-cpu-openmpi-py3_6-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-cpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0
  test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0
  test-cpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-gpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-gpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_0-pyspark2_3_2
  test-gpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-gpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-gpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0
  test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnet1_4_0-pyspark2_4_0
  test-gpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
  test-mixed-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0
)

#######################################
# Emit the Buildkite step that builds one test container.
# Globals:
#   repository (read), BUILDKITE_PIPELINE_SLUG (read)
# Arguments:
#   $1 - test name, used as the docker-compose "build" value and in the
#        cache image tag
# Outputs:
#   Writes the YAML step to stdout.
#######################################
build_test() {
  local test=$1

  # Nested keys must be indented past the "- " of the sequence entry, and the
  # plugin settings one level below the plugin name; otherwise the emitted
  # document is not a valid YAML mapping for this step.
  printf '%s\n' \
    "- label: ':docker: Build ${test}'" \
    "  plugins:" \
    "  - docker-compose#6b0df8a98ff97f42f4944dbb745b5b8cbf04b78c:" \
    "      build: ${test}" \
    "      image-repository: ${repository}" \
    "      cache-from: ${test}:${repository}:${BUILDKITE_PIPELINE_SLUG}-${test}-latest" \
    "      config: docker-compose.test.yml" \
    "      push-retries: 3" \
    "  - ecr#v1.2.0:" \
    "      login: true" \
    "  timeout_in_minutes: 15" \
    "  retry:" \
    "    automatic: true" \
    "  agents:" \
    "    queue: cpu"
}

cache_test() {
local test=$1

echo "- label: ':docker: Update ${BUILDKITE_PIPELINE_SLUG}-${test}-latest'"
echo " plugins:"
echo " - docker-compose#v2.6.0:"
echo " push: ${test}:${repository}:${BUILDKITE_PIPELINE_SLUG}-${test}-latest"
echo " config: docker-compose.test.yml"
echo " push-retries: 3"
echo " - ecr#v1.2.0:"
echo " login: true"
echo " timeout_in_minutes: 5"
echo " retry:"
echo " automatic: true"
echo " agents:"
echo " queue: cpu"
}

#######################################
# Emit one Buildkite step that runs a command through the docker-compose
# plugin.
# Arguments:
#   $1 - test name, used as the docker-compose "run" value
#   $2 - value for the step's agents "queue"
#   $3 - step label
#   $4 - command line for the step (emitted verbatim)
# Outputs:
#   Writes the YAML step to stdout.
#######################################
run_test() {
  local test=$1
  local queue=$2
  local label=$3
  local command=$4

  # Nested keys must be indented past the "- " of the sequence entry, and the
  # plugin settings one level below the plugin name; otherwise the emitted
  # document is not a valid YAML mapping for this step.
  printf '%s\n' \
    "- label: '${label}'" \
    "  command: ${command}" \
    "  plugins:" \
    "  - docker-compose#v2.6.0:" \
    "      run: ${test}" \
    "      config: docker-compose.test.yml" \
    "      pull-retries: 3" \
    "  - ecr#v1.2.0:" \
    "      login: true" \
    "  timeout_in_minutes: 5" \
    "  retry:" \
    "    automatic: true" \
    "  agents:" \
    "    queue: ${queue}"
}

# begin the pipeline.yml file
echo "steps:"

# build every test container
# (the array expansion is quoted so every element stays exactly one word)
for test in "${tests[@]}"; do
  build_test "${test}"
done

# wait for all builds to finish
echo "- wait"

# cache test containers if built from master
# TODO: replace with master
if [[ "${BUILDKITE_BRANCH}" == "buildkite" ]]; then
  for test in "${tests[@]}"; do
    cache_test "${test}"
  done
fi

# run all the tests
for test in "${tests[@]}"; do
  # names containing "-cpu-" go to the cpu queue; every other test
  # (gpu and mixed) is sent to the gpu queue
  if [[ "${test}" == *-cpu-* ]]; then
    queue=cpu
  else
    queue=gpu
  fi

  # In the commands below, \\\$ emits a literal "\$" so that
  # $(cat /mpirun_command) is not substituted while generating the pipeline.
  run_test "${test}" "${queue}" \
    ":pytest: Run PyTests (${test})" \
    "bash -c \"cd /horovod/test && (echo test_*.py | xargs -n 1 \\\$(cat /mpirun_command) pytest -v --capture=no)\""

  run_test "${test}" "${queue}" \
    ":muscle: Test TensorFlow MNIST (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/tensorflow_mnist.py\""

  # the eager example is skipped for the tf1_1_0 and tf1_6_0 configurations
  if [[ "${test}" != *"tf1_1_0"* && "${test}" != *"tf1_6_0"* ]]; then
    run_test "${test}" "${queue}" \
      ":muscle: Test TensorFlow Eager MNIST (${test})" \
      "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/tensorflow_mnist_eager.py\""
  fi

  run_test "${test}" "${queue}" \
    ":muscle: Test Keras MNIST (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/keras_mnist_advanced.py\""

  run_test "${test}" "${queue}" \
    ":muscle: Test PyTorch MNIST (${test})" \
    "bash -c \"\\\$(cat /mpirun_command) python /horovod/examples/pytorch_mnist.py\""

  run_test "${test}" "${queue}" \
    ":muscle: Test MXNet MNIST (${test})" \
    "bash -c \"OMP_NUM_THREADS=1 \\\$(cat /mpirun_command) python /horovod/examples/mxnet_mnist.py\""
done
92 changes: 92 additions & 0 deletions Dockerfile.test.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# CPU test image: installs Python, PySpark, MPI, TensorFlow, Keras, PyTorch
# and MXNet at the versions given by the build arguments, then builds and
# installs Horovod from the build context and shrinks the bundled examples.
ARG UBUNTU_VERSION=16.04
FROM ubuntu:${UBUNTU_VERSION}

# Arguments for the build. UBUNTU_VERSION needs to be repeated because
# the first usage only applies to the FROM tag.
ARG UBUNTU_VERSION=16.04
ARG MPI_KIND=OpenMPI
ARG PYTHON_VERSION=2.7
ARG TENSORFLOW_PACKAGE=tensorflow==1.12.0
ARG KERAS_PACKAGE=keras==2.2.2
ARG PYTORCH_PACKAGE=torch==1.0.0
ARG MXNET_PACKAGE=mxnet-gcc5
ARG PYSPARK_PACKAGE=pyspark==2.4.0

# Set default shell to /bin/bash
SHELL ["/bin/bash", "-cu"]

# Install essential packages.
# "apt-get update" runs in the same layer as every "apt-get install" so that a
# cached layer can never supply stale package lists to a later install.
RUN apt-get update -qq && \
    apt-get install -y wget openssh-client git build-essential

# Install Python.
RUN apt-get update -qq && \
    if [[ "${PYTHON_VERSION}" == "3.6" ]]; then \
        apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-distutils; \
    else \
        apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev; \
    fi
RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py
RUN pip install -U --force pip setuptools requests pytest

# Install PySpark.
# apt-get is used instead of apt, whose command-line interface is not meant
# for scripts.
RUN apt-get update -qq && \
    apt-get install -y openjdk-8-jdk-headless
RUN pip install ${PYSPARK_PACKAGE}

# Install MPI.
# The mpirun invocation for the chosen MPI is stored in /mpirun_command.
RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \
        wget -O /tmp/openmpi-3.0.0-bin.tar.gz https://github.com/uber/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz && \
            cd /usr/local && tar -zxf /tmp/openmpi-3.0.0-bin.tar.gz && ldconfig && \
            echo "mpirun -allow-run-as-root -np 2 -H localhost:2 -bind-to none -map-by slot -mca mpi_abort_print_stack 1" > /mpirun_command; \
    else \
        apt-get update -qq && \
            apt-get install -y mpich && \
            echo "mpirun -np 2" > /mpirun_command; \
    fi

# Install TensorFlow.
RUN pip install ${TENSORFLOW_PACKAGE}

# Install Keras.
RUN pip install ${KERAS_PACKAGE} h5py scipy pandas
RUN mkdir -p ~/.keras
RUN python -c "from keras.datasets import mnist; mnist.load_data()"

# Install PyTorch.
RUN if [[ ${PYTORCH_PACKAGE} == "torch-nightly" ]]; then \
        pip install torch_nightly -v -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html && \
            pip install torchvision_nightly; \
    else \
        pip install ${PYTORCH_PACKAGE} torchvision; \
    fi
RUN pip install future typing

# Install MXNet.
RUN pip install ${MXNET_PACKAGE}

# Install Horovod.
COPY . /horovod
RUN cd /horovod && python setup.py sdist
RUN pip install -v /horovod/dist/horovod-*.tar.gz

# Hack for compatibility of MNIST example with TensorFlow 1.1.0.
RUN if [[ ${TENSORFLOW_PACKAGE} == "tensorflow==1.1.0" ]]; then \
        sed -i "s/from tensorflow import keras/from tensorflow.contrib import keras/" /horovod/examples/tensorflow_mnist.py; \
    fi

# Hack TensorFlow MNIST example to be smaller.
RUN sed -i "s/last_step=20000/last_step=100/" /horovod/examples/tensorflow_mnist.py

# Hack TensorFlow Eager MNIST example to be smaller.
RUN sed -i "s/dataset.take(20000/dataset.take(100/" /horovod/examples/tensorflow_mnist_eager.py

# Hack Keras MNIST advanced example to be smaller.
RUN sed -i "s/epochs = .*/epochs = 9/" /horovod/examples/keras_mnist_advanced.py
RUN sed -i "s/model.add(Conv2D(32, kernel_size=(3, 3),/model.add(Conv2D(1, kernel_size=(3, 3),/" /horovod/examples/keras_mnist_advanced.py
RUN sed -i "s/model.add(Conv2D(64, (3, 3), activation='relu'))//" /horovod/examples/keras_mnist_advanced.py

# Hack PyTorch MNIST example to be smaller.
RUN sed -i "s/'--epochs', type=int, default=10,/'--epochs', type=int, default=2,/" /horovod/examples/pytorch_mnist.py
RUN sed -i "s/self.fc1 = nn.Linear(320, 50)/self.fc1 = nn.Linear(784, 50)/" /horovod/examples/pytorch_mnist.py
RUN sed -i "s/x = F.relu(F.max_pool2d(self.conv1(x), 2))//" /horovod/examples/pytorch_mnist.py
RUN sed -i "s/x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))//" /horovod/examples/pytorch_mnist.py
RUN sed -i "s/x = x.view(-1, 320)/x = x.view(-1, 784)/" /horovod/examples/pytorch_mnist.py
104 changes: 104 additions & 0 deletions Dockerfile.test.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# GPU test image: installs cuDNN/NCCL, Python, PySpark, MPI, TensorFlow,
# Keras, PyTorch and MXNet at the versions given by the build arguments, then
# builds and installs Horovod from the build context with HOROVOD_BUILD_FLAGS.
ARG CUDA_DOCKER_VERSION=9.0-devel-ubuntu16.04
FROM nvidia/cuda:${CUDA_DOCKER_VERSION}

# Arguments for the build. CUDA_DOCKER_VERSION needs to be repeated because
# the first usage only applies to the FROM tag.
ARG CUDA_DOCKER_VERSION=9.0-devel-ubuntu16.04
ARG CUDNN_VERSION=7.4.1.5-1+cuda9.0
ARG NCCL_VERSION_OVERRIDE=2.4.2-1+cuda9.0
ARG MPI_KIND=OpenMPI
ARG PYTHON_VERSION=2.7
ARG TENSORFLOW_PACKAGE=tensorflow-gpu==1.12.0
ARG KERAS_PACKAGE=keras==2.2.2
ARG PYTORCH_PACKAGE=torch==1.0.0
ARG MXNET_PACKAGE=https://s3-us-west-2.amazonaws.com/mxnet-python-packages-gcc5/mxnet_cu90_gcc5-1.4.0-py2.py3-none-manylinux1_x86_64.whl
ARG PYSPARK_PACKAGE=pyspark==2.4.0
ARG HOROVOD_BUILD_FLAGS=HOROVOD_GPU_ALLREDUCE=NCCL

# Set default shell to /bin/bash
SHELL ["/bin/bash", "-cu"]

# Install essential packages.
# "apt-get update" runs in the same layer as every "apt-get install" so that a
# cached layer can never supply stale package lists to a later install.
RUN apt-get update -qq && \
    apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
        wget \
        openssh-client \
        git \
        build-essential \
        libcudnn7=${CUDNN_VERSION} \
        libnccl2=${NCCL_VERSION_OVERRIDE} \
        libnccl-dev=${NCCL_VERSION_OVERRIDE}

# Install Python.
RUN apt-get update -qq && \
    if [[ ${PYTHON_VERSION} == "3.6" ]]; then \
        apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-distutils; \
    else \
        apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev; \
    fi
RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py
RUN pip install -U --force pip setuptools requests pytest

# Install PySpark.
# apt-get is used instead of apt, whose command-line interface is not meant
# for scripts.
RUN apt-get update -qq && \
    apt-get install -y openjdk-8-jdk-headless
RUN pip install ${PYSPARK_PACKAGE}

# Install MPI.
# The mpirun invocation for the chosen MPI is stored in /mpirun_command.
RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \
        wget -O /tmp/openmpi-3.0.0-bin.tar.gz https://github.com/uber/horovod/files/1596799/openmpi-3.0.0-bin.tar.gz && \
            cd /usr/local && tar -zxf /tmp/openmpi-3.0.0-bin.tar.gz && ldconfig && \
            echo "mpirun -allow-run-as-root -np 2 -H localhost:2 -bind-to none -map-by slot -mca mpi_abort_print_stack 1" > /mpirun_command; \
    else \
        apt-get update -qq && \
            apt-get install -y mpich && \
            echo "mpirun -np 2" > /mpirun_command; \
    fi

# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf

# Install TensorFlow.
RUN pip install ${TENSORFLOW_PACKAGE}

# Install Keras.
# The CUDA stubs directory is added to the linker cache only for the duration
# of this step and removed again by the trailing ldconfig.
RUN pip install ${KERAS_PACKAGE} h5py scipy pandas
RUN mkdir -p ~/.keras
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
    python -c "from keras.datasets import mnist; mnist.load_data()" && \
    ldconfig

# Install PyTorch.
# For nightly builds the CUDA version is derived from CUDA_DOCKER_VERSION
# (text before the first "-", with the first "." removed, e.g. 9.0 -> 90).
RUN if [[ ${PYTORCH_PACKAGE} == "torch-nightly" ]]; then \
        PYTORCH_CUDA=$(echo ${CUDA_DOCKER_VERSION} | awk -F- '{print $1}' | sed 's/\.//'); \
        pip install torch_nightly -v -f https://download.pytorch.org/whl/nightly/cu${PYTORCH_CUDA}/torch_nightly.html && \
            pip install torchvision_nightly; \
    else \
        pip install ${PYTORCH_PACKAGE} torchvision; \
    fi
RUN pip install future typing

# Install MXNet.
RUN pip install ${MXNET_PACKAGE}

# Install Horovod.
COPY . /horovod
RUN cd /horovod && python setup.py sdist
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
    bash -c "${HOROVOD_BUILD_FLAGS} pip install -v /horovod/dist/horovod-*.tar.gz" && \
    ldconfig

# Hack for compatibility of MNIST example with TensorFlow 1.1.0.
RUN if [[ ${TENSORFLOW_PACKAGE} == "tensorflow-gpu==1.1.0" ]]; then \
        sed -i "s/from tensorflow import keras/from tensorflow.contrib import keras/" /horovod/examples/tensorflow_mnist.py; \
    fi

# Hack TensorFlow MNIST example to be smaller.
RUN sed -i "s/last_step=20000/last_step=100/" /horovod/examples/tensorflow_mnist.py

# Hack TensorFlow Eager MNIST example to be smaller.
RUN sed -i "s/dataset.take(20000/dataset.take(100/" /horovod/examples/tensorflow_mnist_eager.py

# Hack Keras MNIST advanced example to be smaller.
RUN sed -i "s/epochs = .*/epochs = 9/" /horovod/examples/keras_mnist_advanced.py

# Hack PyTorch MNIST example to be smaller.
RUN sed -i "s/'--epochs', type=int, default=10,/'--epochs', type=int, default=2,/" /horovod/examples/pytorch_mnist.py
Loading

0 comments on commit a5fb7a2

Please sign in to comment.