Skip to content
Permalink
Browse files

Update Dockerfile & tests to TF 1.14.0 + RDMA (#1159)

* Update Dockerfile & tests to TF 1.14.0 + RDMA

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Fix CPU Ubuntu 18.04 builds

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Install python3.6-distutils

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Switch to master torchvision for nightly builds

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Install future/typing before torchvision@master

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Change default to CUDA 10 and fix MXNet packages

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Bugfix, switch to Python 3.6 for single-Python tests

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Remove redundant labels & environment variables

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>

* Fix PyTorch CU9/CU10

Signed-off-by: Alex Sergeev <alsrgv@users.noreply.github.com>
  • Loading branch information...
alsrgv committed Jun 28, 2019
1 parent f87d763 commit fda07a628e687102928a090bfa6814e4bf7c827c
Showing with 177 additions and 103 deletions.
  1. +10 −10 .buildkite/gen-pipeline.sh
  2. +46 −9 Dockerfile
  3. +10 −18 Dockerfile.test.cpu
  4. +16 −13 Dockerfile.test.gpu
  5. +1 −1 build-docker-images.sh
  6. +94 −52 docker-compose.test.yml
@@ -14,21 +14,21 @@ tests=( \
test-cpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_1-pyspark2_3_2 \
test-cpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_1-pyspark2_3_2 \
test-cpu-openmpi-py3_6-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_1-pyspark2_3_2 \
test-cpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-py3_6-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-py3_5-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
test-cpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
test-cpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-mlsl-py3_6-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_0-pyspark2_4_0 \
test-cpu-mpich-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-cpu-mlsl-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-openmpi-py2_7-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_1-pyspark2_3_2 \
test-gpu-openmpi-py3_5-tf1_6_0-keras2_1_2-torch0_4_1-mxnet1_4_1-pyspark2_3_2 \
test-gpu-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-openmpi-py3_5-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-openmpi-py2_7-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-openmpi-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-openmpi-py2_7-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
test-gpu-openmpi-py3_6-tfhead-kerashead-torchhead-mxnethead-pyspark2_4_0 \
test-gpu-mpich-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_1-pyspark2_4_0 \
test-mixed-openmpi-py2_7-tf1_12_0-keras2_2_2-torch1_0_0-mxnet1_4_1-pyspark2_4_0 \
test-gpu-mpich-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
test-mixed-openmpi-py3_6-tf1_14_0-keras2_2_4-torch1_1_0-mxnet1_4_1-pyspark2_4_0 \
)

build_test() {
@@ -123,7 +123,7 @@ run_all() {

# tests that should be executed only with the latest release since they don't test
# framework-specific functionality
if [[ ${test} == *"tf1_12_0"* ]]; then
if [[ ${test} == *"tf1_14_0"* ]]; then
run_test "${test}" "${queue}" \
":muscle: Test Stall (${test})" \
"bash -c \"\\\$(cat /mpirun_command) python /horovod/test/test_stall.py\""
@@ -1,20 +1,27 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
FROM nvidia/cuda:10.0-devel-ubuntu18.04

# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV TENSORFLOW_VERSION=1.12.0
ENV TENSORFLOW_VERSION=1.14.0
ENV PYTORCH_VERSION=1.1.0
ENV TORCHVISION_VERSION=0.2.2.post3
ENV CUDNN_VERSION=7.4.1.5-1+cuda9.0
ENV NCCL_VERSION=2.3.7-1+cuda9.0
ENV TORCHVISION_VERSION=0.3.0
ENV CUDNN_VERSION=7.6.0.64-1+cuda10.0
ENV NCCL_VERSION=2.4.7-1+cuda10.0
ENV MXNET_VERSION=1.4.1

# Python 2.7 or 3.5 is supported by Ubuntu Xenial out of the box
# Python 2.7 or 3.6 is supported by Ubuntu Bionic out of the box
ARG python=2.7
ENV PYTHON_VERSION=${python}

# We need gcc-4.9, which is only available in Ubuntu Xenial, to build plugins for TensorFlow & PyTorch
RUN echo deb http://archive.ubuntu.com/ubuntu xenial main universe | tee -a /etc/apt/sources.list

RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
build-essential \
cmake \
gcc-4.9 \
g++-4.9 \
gcc-4.9-base \
software-properties-common \
git \
curl \
vim \
@@ -26,16 +33,29 @@ RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-
libjpeg-dev \
libpng-dev \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev
python${PYTHON_VERSION}-dev \
librdmacm1 \
libibverbs1 \
ibverbs-providers

# Install the separate distutils package when building for Python 3.6.
# RUN executes under Docker's default shell (/bin/sh -c) unless a SHELL
# instruction overrides it, and /bin/sh on Ubuntu is dash, which has no `[[`
# builtin: the bash-style test fails with "[[: not found", the condition is
# treated as false, and the package is silently skipped. The POSIX `[ ... = ... ]`
# form behaves the same under both sh and bash.
RUN if [ "${PYTHON_VERSION}" = "3.6" ]; then \
apt-get install -y python${PYTHON_VERSION}-distutils; \
fi
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python

RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py

# Install TensorFlow, Keras, PyTorch and MXNet
RUN pip install 'numpy<1.15.0' tensorflow-gpu==${TENSORFLOW_VERSION} keras h5py torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION} mxnet-cu90==${MXNET_VERSION}
RUN pip install future typing
RUN pip install numpy \
tensorflow-gpu==${TENSORFLOW_VERSION} \
keras \
h5py
RUN pip install https://download.pytorch.org/whl/cu100/torch-${PYTORCH_VERSION}-$(python -c "import wheel.pep425tags as w; print('-'.join(w.get_supported()[0]))").whl \
https://download.pytorch.org/whl/cu100/torchvision-${TORCHVISION_VERSION}-$(python -c "import wheel.pep425tags as w; print('-'.join(w.get_supported()[0]))").whl
RUN pip install mxnet-cu100==${MXNET_VERSION}

# Install Open MPI
RUN mkdir /tmp/openmpi && \
@@ -49,11 +69,28 @@ RUN mkdir /tmp/openmpi && \
ldconfig && \
rm -rf /tmp/openmpi

# Pin GCC to 4.9 (priority 200) to compile correctly against TensorFlow, PyTorch, and MXNet.
# Back up the existing GCC installation as priority 100, so that it can be recovered later.
RUN update-alternatives --install /usr/bin/gcc gcc $(readlink -f $(which gcc)) 100 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc $(readlink -f $(which gcc)) 100 && \
update-alternatives --install /usr/bin/g++ g++ $(readlink -f $(which g++)) 100 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ $(readlink -f $(which g++)) 100
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 200 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-gcc x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 200 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 200 && \
update-alternatives --install /usr/bin/x86_64-linux-gnu-g++ x86_64-linux-gnu-g++ /usr/bin/g++-4.9 200

# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITH_MXNET=1 pip install --no-cache-dir horovod && \
ldconfig

# Remove GCC pinning
RUN update-alternatives --remove gcc /usr/bin/gcc-4.9 && \
update-alternatives --remove x86_64-linux-gnu-gcc /usr/bin/gcc-4.9 && \
update-alternatives --remove g++ /usr/bin/g++-4.9 && \
update-alternatives --remove x86_64-linux-gnu-g++ /usr/bin/g++-4.9

# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
mkdir -p /var/run/sshd
@@ -6,16 +6,19 @@ FROM ubuntu:${UBUNTU_VERSION}
ARG UBUNTU_VERSION=16.04
ARG MPI_KIND=OpenMPI
ARG PYTHON_VERSION=2.7
ARG TENSORFLOW_PACKAGE=tensorflow==1.12.0
ARG KERAS_PACKAGE=keras==2.2.2
ARG PYTORCH_PACKAGE=torch==1.0.0
ARG TORCHVISION_PACKAGE=torchvision==0.2.2.post3
ARG TENSORFLOW_PACKAGE=tensorflow==1.14.0
ARG KERAS_PACKAGE=keras==2.2.4
ARG PYTORCH_PACKAGE=torch==1.1.0
ARG TORCHVISION_PACKAGE=https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp27-cp27mu-linux_x86_64.whl
ARG MXNET_PACKAGE=mxnet==1.4.1
ARG PYSPARK_PACKAGE=pyspark==2.4.0

# Set default shell to /bin/bash
SHELL ["/bin/bash", "-cu"]

# We need gcc-4.9, which is only available in Ubuntu Xenial, to build plugins for TensorFlow & PyTorch
RUN echo deb http://archive.ubuntu.com/ubuntu xenial main universe | tee -a /etc/apt/sources.list

# Install essential packages.
RUN apt-get update -qq
RUN apt-get install -y --no-install-recommends \
@@ -30,10 +33,10 @@ RUN apt-get install -y --no-install-recommends \
software-properties-common

# Install Python.
RUN apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev
RUN if [[ "${PYTHON_VERSION}" == "3.6" ]]; then \
add-apt-repository ppa:deadsnakes/ppa && apt-get update -qq; \
apt-get install -y python${PYTHON_VERSION}-distutils; \
fi
RUN apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev
RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py
RUN pip install -U --force pip setuptools requests pytest
@@ -64,12 +67,6 @@ RUN if [[ ${MPI_KIND} == "OpenMPI" ]]; then \
chmod +x /usr/local/mlsl/intel64/bin/mpigxx && \
wget https://raw.githubusercontent.com/AlekseyMarchuk/MLSL/master/mpirt_2019/lib/libmpicxx.so -P /usr/local/mlsl/intel64/lib && \
chmod +x /usr/local/mlsl/intel64/lib/libmpicxx.so && \
#wget https://raw.githubusercontent.com/AlekseyMarchuk/MLSL/master/mpirt_2019/bin/mpicc -P /usr/local/bin && \
#chmod +x /usr/local/bin/mpicc && \
#wget https://raw.githubusercontent.com/AlekseyMarchuk/MLSL/master/mpirt_2019/bin/mpicxx -P /usr/local/bin && \
#chmod +x /usr/local/bin/mpicxx && \
#wget https://raw.githubusercontent.com/AlekseyMarchuk/MLSL/master/mpirt_2019/bin/mpigcc -P /usr/local/bin && \
#chmod +x /usr/local/bin/mpigcc && \
echo ". /usr/local/mlsl/intel64/bin/mlslvars.sh \"thread\"; \
echo \"mpirun is \$(which mpirun)\"; \
echo \"this file is \$(cat /mpirun_command_script)\"; \
@@ -104,13 +101,13 @@ RUN mkdir -p ~/.keras
RUN python -c "from keras.datasets import mnist; mnist.load_data()"

# Install PyTorch.
RUN pip install future typing
RUN if [[ ${PYTORCH_PACKAGE} == "torch-nightly" ]]; then \
pip install torch_nightly -v -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; \
else \
pip install ${PYTORCH_PACKAGE}; \
fi
RUN pip install ${TORCHVISION_PACKAGE} Pillow --no-deps
RUN pip install future typing

# Install MXNet.
RUN pip install ${MXNET_PACKAGE}
@@ -143,11 +140,6 @@ RUN if [[ ${MPI_KIND} == "MLSL" ]]; then \
cd /horovod && python setup.py sdist; \
fi

#RUN if [[ ${MPI_KIND} == "MLSL" ]]; then \
# source /usr/local/mlsl/intel64/bin/mlslvars.sh "thread"; \
# fi; \
# pip install -v /horovod/dist/horovod-*.tar.gz

RUN if [[ ${MPI_KIND} == "MLSL" ]]; then \
if [ -z "${LD_LIBRARY_PATH:-}" ]; then \
export LD_LIBRARY_PATH=""; \
@@ -1,25 +1,28 @@
ARG CUDA_DOCKER_VERSION=9.0-devel-ubuntu16.04
ARG CUDA_DOCKER_VERSION=10.0-devel-ubuntu16.04
FROM nvidia/cuda:${CUDA_DOCKER_VERSION}

# Arguments for the build. CUDA_DOCKER_VERSION needs to be repeated becaus
# Arguments for the build. CUDA_DOCKER_VERSION needs to be repeated because
# the first usage only applies to the FROM tag.
ARG CUDA_DOCKER_VERSION=9.0-devel-ubuntu16.04
ARG CUDNN_VERSION=7.4.1.5-1+cuda9.0
ARG NCCL_VERSION_OVERRIDE=2.3.7-1+cuda9.0
ARG CUDA_DOCKER_VERSION=10.0-devel-ubuntu16.04
ARG CUDNN_VERSION=7.6.0.64-1+cuda10.0
ARG NCCL_VERSION_OVERRIDE=2.4.7-1+cuda10.0
ARG MPI_KIND=OpenMPI
ARG PYTHON_VERSION=2.7
ARG TENSORFLOW_PACKAGE=tensorflow-gpu==1.12.0
ARG KERAS_PACKAGE=keras==2.2.2
ARG PYTORCH_PACKAGE=torch==1.0.0
ARG TORCHVISION_PACKAGE=torchvision==0.2.2.post3
ARG MXNET_PACKAGE=mxnet-cu90==1.4.1
ARG TENSORFLOW_PACKAGE=tensorflow-gpu==1.14.0
ARG KERAS_PACKAGE=keras==2.2.4
ARG PYTORCH_PACKAGE=https://download.pytorch.org/whl/cu100/torch-1.1.0-cp27-cp27mu-linux_x86_64.whl
ARG TORCHVISION_PACKAGE=https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp27-cp27mu-linux_x86_64.whl
ARG MXNET_PACKAGE=mxnet-cu100==1.4.1
ARG PYSPARK_PACKAGE=pyspark==2.4.0
ARG HOROVOD_BUILD_FLAGS=HOROVOD_GPU_ALLREDUCE=NCCL
ARG HOROVOD_MIXED_INSTALL=0

# Set default shell to /bin/bash
SHELL ["/bin/bash", "-cu"]

# We need gcc-4.9, which is only available in Ubuntu Xenial, to build plugins for TensorFlow & PyTorch
RUN echo deb http://archive.ubuntu.com/ubuntu xenial main universe | tee -a /etc/apt/sources.list

# Install essential packages.
RUN apt-get update -qq
RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
@@ -37,10 +40,10 @@ RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-inst
libnccl-dev=${NCCL_VERSION_OVERRIDE}

# Install Python.
RUN apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev
RUN if [[ "${PYTHON_VERSION}" == "3.6" ]]; then \
add-apt-repository ppa:deadsnakes/ppa && apt-get update -qq; \
apt-get install -y python${PYTHON_VERSION}-distutils; \
fi
RUN apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev
RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN wget https://bootstrap.pypa.io/get-pip.py && python get-pip.py && rm get-pip.py
RUN pip install -U --force pip setuptools requests pytest
@@ -79,14 +82,14 @@ RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs && \
ldconfig

# Install PyTorch.
RUN pip install future typing
RUN if [[ ${PYTORCH_PACKAGE} == "torch-nightly" ]]; then \
PYTORCH_CUDA=$(echo ${CUDA_DOCKER_VERSION} | awk -F- '{print $1}' | sed 's/\.//'); \
pip install torch_nightly -v -f https://download.pytorch.org/whl/nightly/cu${PYTORCH_CUDA}/torch_nightly.html; \
else \
pip install ${PYTORCH_PACKAGE}; \
fi
RUN pip install ${TORCHVISION_PACKAGE} Pillow --no-deps
RUN pip install future typing

# Install MXNet.
RUN pip install ${MXNET_PACKAGE}
@@ -22,7 +22,7 @@ docker rmi $(cat Dockerfile | grep FROM | awk '{print $2}') || true

# build for py2 and py3
build_one 2.7
build_one 3.5
build_one 3.6

# print recent images
docker images horovod/horovod

0 comments on commit fda07a6

Please sign in to comment.
You can’t perform that action at this time.