Skip to content

Commit

Permalink
Update min framework versions, test that Horovod builds with them (#3452)

Browse files Browse the repository at this point in the history

Signed-off-by: Enrico Minack <github@enrico.minack.dev>
  • Loading branch information
EnricoMi committed Mar 11, 2022
1 parent a23732e commit f8c9649
Show file tree
Hide file tree
Showing 9 changed files with 210 additions and 16 deletions.
10 changes: 7 additions & 3 deletions .buildkite/gen-pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ tests=$(if [[ -n "${PIPELINE_MODE:-}" ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${B
# our baseline again
# printf "test-cpu-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1 "
printf "test-cpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_1 "
# these are the lowest framework versions that Horovod compiles with, but they are not tested
printf "test-cpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin "
# then we vary the frameworks for gpu
# we need CUDA 10.0 as tensorflow-gpu==1.15.5 is compiled against and linked to CUDA 10.0
Expand All @@ -60,6 +62,8 @@ tests=$(if [[ -n "${PIPELINE_MODE:-}" ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${B
printf "test-gpu-gloo-py3_8-tf2_7_1-keras2_7_0-torch1_10_2-mxnet1_8_0_p0-pyspark3_2_1 "
printf "test-gpu-openmpi-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1 "
printf "test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_1 "
# these are the lowest framework versions that Horovod compiles with, but they are not tested
printf "test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin "
# and one final test with mixed cpu+gpu
printf "test-mixed-openmpi-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1 "
Expand Down Expand Up @@ -414,7 +418,7 @@ oneccl_cmd_mpi="${oneccl_env}:echo:'/mpirun_command_mpi':>:/mpirun_command:&&"
# run all the cpu unit tests and integration tests
for test in ${tests[@]-}; do
if [[ ${test} == *-cpu-* ]]; then
if [[ ${test} == *-cpu-* && ${test} != *min-* ]]; then
# if gloo is specified, run gloo cpu unit tests and integration tests
if [[ ${test} == *-gloo* ]]; then
run_gloo ${test} ${cpu_queue}
Expand Down Expand Up @@ -453,7 +457,7 @@ echo "- wait"
# run 4x gpu unit tests
for test in ${tests[@]-}; do
if [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]]; then
if ( [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]] ) && [[ ${test} != *min-* ]]; then
# if gloo is specified, run gloo gpu unit tests
if [[ ${test} == *-gloo* ]]; then
run_gloo_pytest ${test} ${gpux4_queue}
Expand All @@ -471,7 +475,7 @@ echo "- wait"
# run 2x gpu integration tests
for test in ${tests[@]-}; do
if [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]]; then
if ( [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]] ) && [[ ${test} != *min-* ]]; then
# if gloo is specified, run gloo gpu integration tests
if [[ ${test} == *-gloo* ]]; then
run_gloo_integration ${test} ${gpux2_queue}
Expand Down
23 changes: 14 additions & 9 deletions .github/gen-workflow-ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,12 +270,14 @@ def build_and_test_images(id: str,
name: str,
needs: List[str],
images: List[str],
parallel_images: str,
tests_per_image: Dict[str, Set[str]],
tests: Dict[str, Dict],
parallel_images: int = None,
attempts: int = 3) -> str:
if 'init-workflow' not in needs:
needs.insert(0, 'init-workflow')
if parallel_images is None:
parallel_images = len(images)
failure = "'failure'"
return (f' {id}:\n'
f' name: "{name} (${{{{ matrix.image }}}})"\n'
Expand All @@ -286,13 +288,13 @@ def build_and_test_images(id: str,
f' runs-on: ubuntu-latest\n'
f'\n'
f' strategy:\n'
f' max-parallel: {len([image for image in images if parallel_images in image])}\n'
f' max-parallel: {parallel_images}\n'
f' fail-fast: false\n'
f' matrix:\n'
f' include:\n' +
'\n'.join([f' - image: {image}\n' +
f''.join([f' {test}: true\n'
for test in sorted(list(tests_per_image[image]))]) +
for test in sorted(list(tests_per_image.get(image, [])))]) +
f' build_timeout: {30 if "-cpu-" in image else 40}\n'
for image in sorted(images)
# oneccl does not compile on GitHub Workflows:
Expand Down Expand Up @@ -603,7 +605,7 @@ def publish_docker_images(needs: List[str], images: List[str]) -> str:
return (f' docker-config:\n'
f' name: Configure docker build\n'
f' needs: [{", ".join(needs)}]\n'
f" # build-and-test-cpu, build-gpu and buildkite might have been skipped (! needs.init-workflow.outputs.run-builds-and-tests)\n"
f" # build-and-test and buildkite might have been skipped (! needs.init-workflow.outputs.run-builds-and-tests)\n"
f' # buildkite might have been skipped (workflow runs for a fork PR),\n'
f' # we still want to build docker images (though we might not want to push them)\n'
f' if: >\n'
Expand Down Expand Up @@ -795,16 +797,19 @@ def sync_files(needs: List[str]) -> str:
f' fi\n')

with open(path.joinpath('workflows', 'ci.yaml').absolute(), 'wt') as w:
mins = ['tfmin', 'torchmin', 'mxnetmin']
heads = ['tfhead', 'torchhead', 'mxnethead']
release_images = [image for image in images if not all(head in image for head in heads)]
allmin_images = [image for image in images if all(min in image for min in mins)]
allhead_images = [image for image in images if all(head in image for head in heads)]
release_images = [image for image in images if image not in allhead_images + allmin_images]
cpu_release_images = [image for image in release_images if '-cpu-' in image]
gpu_release_images = [image for image in release_images if '-gpu-' in image or '-mixed-' in image]
allhead_images = [image for image in images if all(head in image for head in heads)]
workflow = workflow_header() + jobs(
init_workflow_job(),
# changing these names require changes in the workflow-conclusion step in ci-fork.yaml
build_and_test_images(id='build-and-test', name='Build and Test', needs=['init-workflow'], images=release_images, parallel_images='-cpu-', tests_per_image=tests_per_image, tests=tests),
build_and_test_images(id='build-and-test-heads', name='Build and Test heads', needs=['build-and-test'], images=allhead_images, parallel_images='', tests_per_image=tests_per_image, tests=tests),
# changing these names require changes in the workflow-conclusion step in ci-results.yaml
build_and_test_images(id='build-and-test', name='Build and Test', needs=['init-workflow'], images=release_images, parallel_images=len(cpu_release_images), tests_per_image=tests_per_image, tests=tests),
build_and_test_images(id='build-and-test-heads', name='Build and Test heads', needs=['build-and-test'], images=allhead_images, tests_per_image=tests_per_image, tests=tests),
build_and_test_images(id='build-mins', name='Build mins', needs=['build-and-test'], images=allmin_images, tests_per_image=tests_per_image, tests={}),
build_and_test_macos(id='build-and-test-macos', name='Build and Test macOS', needs=['build-and-test']),
trigger_buildkite_job(id='buildkite', name='Build and Test GPU (on Builtkite)', needs=['build-and-test'], mode='GPU NON HEADS'),
trigger_buildkite_job(id='buildkite-heads', name='Build and Test GPU heads (on Builtkite)', needs=['build-and-test'], mode='GPU HEADS'),
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci-results.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ jobs:

steps:
- name: Fetch workflow conclusion
# fetch conclusion of steps building and testing CPU and building GPU
# ignores steps building heads and mins, building and testing macOS, building and testing GPU via Buildkite
id: workflow-conclusion
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Expand Down
136 changes: 135 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3353,6 +3353,140 @@ jobs:
docker image ls | head
shell: bash

build-mins:
name: "Build mins (${{ matrix.image }})"
needs: [init-workflow, build-and-test]
if: >
needs.init-workflow.outputs.run-at-all == 'true' &&
needs.init-workflow.outputs.run-builds-and-tests == 'true'
runs-on: ubuntu-latest

strategy:
max-parallel: 2
fail-fast: false
matrix:
include:
- image: test-cpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
build_timeout: 30

- image: test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
build_timeout: 40

steps:
- name: Clean up disk space
# deleting these paths frees 38 GB disk space:
# sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
# but this sometimes takes 3-4 minutes
# so we delete only some sub-paths which are known to be quick (10s) and 20 GB
run: |
echo ::group::Disk space before clean up
df -h
echo ::endgroup::
for dir in /usr/share/dotnet/sdk/\*/nuGetPackagesArchive.lzma \
/usr/share/dotnet/shared \
/usr/local/lib/android/sdk/ndk \
/usr/local/lib/android/sdk/build-tools \
/opt/ghc
do
echo ::group::Deleting "$dir"
sudo du -hsc $dir | tail -n1 || true
sudo rm -rf $dir
echo ::endgroup::
done
echo ::group::Disk space after clean up
df -h
echo ::endgroup::
- name: Checkout
uses: actions/checkout@v2
with:
submodules: recursive

- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.8

- name: Setup docker-compose
run: pip install docker-compose

- name: Configure AWS credentials
id: aws
uses: aws-actions/configure-aws-credentials@v1
# AWS credentials are used to authenticate against AWS ECR to pull and push test images
# We can only authenticate when running on Horovod repo (not a fork)
if: >
github.repository == 'horovod/horovod' &&
( github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository )
continue-on-error: true
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: us-east-1

- name: Login to Amazon ECR
id: ecr
if: steps.aws.outcome == 'success'
continue-on-error: true
uses: aws-actions/amazon-ecr-login@v1

- name: Add cache_from to docker-compose YAML
if: steps.ecr.outcome == 'success'
run: |
cat > docker-compose.test.override.yml <<EOF
version: '2.3'
services:
${{ matrix.image }}:
build:
cache_from:
- ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
EOF
cat docker-compose.test.override.yml
shell: bash

- name: Pull latest test image
if: steps.ecr.outcome == 'success'
continue-on-error: true
run: |
docker pull ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
env:
DOCKER_BUILDKIT: 1

- name: Build
id: build
run: |
override_yaml=""
if [ -e docker-compose.test.override.yml ]; then override_yaml="-f docker-compose.test.override.yml"; fi
.github/timeout-and-retry.sh ${{ matrix.build_timeout }}m 3 10 docker-compose -f docker-compose.test.yml $override_yaml build --pull ${{ matrix.image }}
env:
COMPOSE_DOCKER_CLI_BUILD: 1
DOCKER_BUILDKIT: 1


- name: Upload Test Results
uses: actions/upload-artifact@v2
if: always() && contains(matrix.image, '-cpu-')
with:
name: Unit Test Results - ${{ matrix.image }}
path: artifacts/${{ matrix.image }}/**/*.xml

- name: Push test image
# We push test image to AWS ECR on push to Horovod master (not a fork)
if: >
github.event_name == 'push' &&
github.ref == 'refs/heads/master' &&
github.repository == 'horovod/horovod' &&
steps.ecr.outcome == 'success'
continue-on-error: true
run: |
docker image ls | head
docker tag horovod_${{ matrix.image }} ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
docker push ${{ steps.ecr.outputs.registry }}/buildkite:horovod-${{ matrix.image }}-latest
docker image ls | head
shell: bash

build-and-test-macos:
name: "Build and Test macOS (${{ matrix.image }}-macos)"
needs: [init-workflow, build-and-test]
Expand Down Expand Up @@ -3604,7 +3738,7 @@ jobs:
docker-config:
name: Configure docker build
needs: [init-workflow, build-and-test, buildkite]
# build-and-test-cpu, build-gpu and buildkite might have been skipped (! needs.init-workflow.outputs.run-builds-and-tests)
# build-and-test and buildkite might have been skipped (! needs.init-workflow.outputs.run-builds-and-tests)
# buildkite might have been skipped (workflow runs for a fork PR),
# we still want to build docker images (though we might not want to push them)
if: >
Expand Down
32 changes: 32 additions & 0 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,21 @@ services:
TORCHVISION_PACKAGE: torchvision
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
MXNET_PACKAGE: mxnet-nightly
# these are the lowest framework versions that Horovod compiles with, but they are not tested
test-cpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin:
extends: test-cpu-base
build:
args:
UBUNTU_VERSION: 18.04
PYTHON_VERSION: 3.7
TENSORFLOW_PACKAGE: tensorflow-cpu==1.15.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: torch==1.5.0+cpu
PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==0.7.3
TORCHVISION_PACKAGE: torchvision==0.6.0+cpu
MXNET_PACKAGE: mxnet==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
SPARK_PACKAGE: spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz

test-cpu-gloo-py3_7-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark2_4_8:
extends: test-cpu-base
Expand Down Expand Up @@ -213,6 +228,23 @@ services:
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
TORCHVISION_PACKAGE: torchvision
MXNET_PACKAGE: mxnet-nightly-cu112
# these are the lowest framework versions that Horovod compiles with, but they are not tested
test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin:
extends: test-gpu-base
build:
args:
CUDA_DOCKER_VERSION: 10.0-devel-ubuntu18.04
CUDNN_VERSION: 7.6.5.32-1+cuda10.1
NCCL_VERSION_OVERRIDE: 2.7.8-1+cuda10.1
PYTHON_VERSION: 3.7
TENSORFLOW_PACKAGE: tensorflow-gpu==1.15.0
KERAS_PACKAGE: keras==2.2.4
PYTORCH_PACKAGE: torch==1.5.0+cu101
PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==0.7.3
TORCHVISION_PACKAGE: torchvision==0.6.0+cu101
MXNET_PACKAGE: mxnet-cu100==1.4.1
PYSPARK_PACKAGE: pyspark==2.4.0
SPARK_PACKAGE: spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz

test-mixed-openmpi-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1:
extends: test-gpu-base
Expand Down
2 changes: 1 addition & 1 deletion docs/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Requirements
- Python >= 3.6
- `g++-5` or above, or another compiler supporting C++14
- CMake 3.13 or newer
- TensorFlow, PyTorch, or MXNet
- TensorFlow (>=1.15.0), PyTorch (>=1.5.0), or MXNet (>=1.4.1)
- (Optional) MPI

For best performance on GPU:
Expand Down
2 changes: 1 addition & 1 deletion horovod/mxnet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ set(Mxnet_REQUIRED "")
if ("$ENV{HOROVOD_WITH_MXNET}" STREQUAL "1")
set(Mxnet_REQUIRED "REQUIRED")
endif ()
find_package(Mxnet "1.4.0" ${Mxnet_REQUIRED})
find_package(Mxnet "1.4.1" ${Mxnet_REQUIRED})
if(NOT MXNET_FOUND)
return()
endif()
Expand Down
2 changes: 1 addition & 1 deletion horovod/torch/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ set(PYTORCH_REQUIRED "")
if ("$ENV{HOROVOD_WITH_PYTORCH}" STREQUAL "1")
set(PYTORCH_REQUIRED "REQUIRED")
endif ()
find_package(Pytorch "1.2.0" ${PYTORCH_REQUIRED})
find_package(Pytorch "1.5.0" ${PYTORCH_REQUIRED})
if(NOT PYTORCH_FOUND)
return()
endif()
Expand Down
17 changes: 17 additions & 0 deletions test/single/data/expected_buildkite_gpu_non_heads_pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,23 @@ steps:
automatic: true
agents:
queue: cpu-v572
- label: ':docker: Build test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin'
env:
COMPOSE_HTTP_TIMEOUT: 300
plugins:
- docker-compose#v3.5.0:
build: test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin
image-repository: 823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite
cache-from: test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin:823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite:SLUG-test-gpu-gloo-py3_7-tfmin-kerasmin-torchmin-mxnetmin-pysparkmin-latest
config: docker-compose.test.yml
push-retries: 5
- ecr#v1.2.0:
login: true
timeout_in_minutes: 40
retry:
automatic: true
agents:
queue: cpu-v572
- label: ':docker: Build test-mixed-openmpi-gloo-py3_8-tf2_8_0-keras2_8_0-torch1_11_0-mxnet1_9_0-pyspark3_2_1'
env:
COMPOSE_HTTP_TIMEOUT: 300
Expand Down

0 comments on commit f8c9649

Please sign in to comment.