Skip to content

Commit

Permalink
Add pytorch 1.10.0 to test space, remove 1.6.0 (#3291)
Browse files: browse the repository at this point in the history
* Adding PyTorch 1.10.0 to test space, upgrading to 1.9.1 while removing 1.6.0
* Skip test_delta_optimizer with PyTorch 1.10
* Harmonize macOS tests with docker-compose.test.yml
* Latest mxnet does not compile for macOS

Signed-off-by: Enrico Minack <github@enrico.minack.dev>
Co-authored-by: Max H. Gerlach <git@maxgerlach.de>
  • Loading branch information
EnricoMi and maxhgerlach committed Dec 11, 2021
1 parent be3b72d commit df18797
Show file tree
Hide file tree
Showing 6 changed files with 275 additions and 267 deletions.
36 changes: 18 additions & 18 deletions .buildkite/gen-pipeline.sh
Expand Up @@ -7,7 +7,7 @@ set -eu
repository=823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite

# our baseline test is
baseline="test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0"
baseline="test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0"
# run_gloo_integration runs the 'Elastic Spark * Tests' for this baseline,
# so the baseline has to use the Gloo mpi kind

Expand All @@ -17,48 +17,48 @@ code_files=$(python "$dir/get_changed_code_files.py" || echo failure)
tests=$(if [[ -n "${PIPELINE_MODE:-}" ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${BUILDKITE_PIPELINE_DEFAULT_BRANCH:-}" ]] || [[ -n "$code_files" ]] ); then
# we vary the baseline along the Python and PySpark dimensions together
# run_gloo_integration expects these to have Gloo mpi kind to run 'Elastic Spark * Tests'
printf "test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark2_4_8 "
printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2 "
printf "test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark2_4_8 "
printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_1_2 "
# our baseline
printf "$baseline "
# then we vary the baseline along mpi kinds dimension
# our baseline again
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-oneccl-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-oneccl-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
# note: we test openmpi-gloo mpi kind in this variation in each of [cpu, gpu, mixed]
printf "test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
# then we vary the baseline along the framework dimensions all together
# some frameworks are not available for our baseline Python version 3.8, so we use Python 3.7
# run_gloo_integration expects tf1 to have Gloo mpi kind to run 'Elastic Spark * Tests'
printf "test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0 "
# there is no mxnet-1.6.0.post0 and mxnet-1.6.0 does not work with horovod
# https://github.com/apache/incubator-mxnet/issues/16193
# however, there is an mxnet-cu101-1.6.0.post0, so we test this with gpu instead of cpu
#printf "test-cpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_7_0_p2-pyspark3_2_0 "
#printf "test-cpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_7_0_p2-pyspark3_2_0 "
# our baseline again
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0 "
# then we vary the frameworks for gpu
printf "test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0 "
# this is required as we cannot test mxnet-1.6.0.post0 with cpu
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0 "
# we additionally test the previous framework combination (CUDA 10.x) with mxnet 1.7.x
# as mxnet 1.7.x only supports CUDA 10.x, but next framework combination targets CUDA 11.x
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_7_0_p1-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_7_0_p1-pyspark3_2_0 "
# we deviate from mxnet1_7_0_p2 here as other frameworks target CUDA 11.x and
# mxnet 1.7.x only supports CUDA 10.x, with mxnet 1.8.x we have CUDA 11.x packages
printf "test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0 "
# and one final test with mixed cpu+gpu
printf "test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
fi | if [[ "${PIPELINE_MODE:-}" == "GPU"* ]]; then sed -E "s/[^ ]*-cpu-[^ ]*//g"; else cat; fi \
| if [[ "${PIPELINE_MODE:-}" == "GPU HEADS" ]]; then sed -E "s/ /\n/g" | grep -e "-tfhead-keras_none-torchhead-mxnethead-" | paste -s -d " " -; else cat; fi \
| if [[ "${PIPELINE_MODE:-}" == "GPU NON HEADS" ]]; then sed -E "s/[^ ]*-tfhead-keras_none-torchhead-mxnethead-[^ ]*//g"; else cat; fi)
Expand Down
21 changes: 11 additions & 10 deletions .github/gen-workflow-ci.py
Expand Up @@ -441,35 +441,36 @@ def build_and_test_macos(id: str, name: str, needs: List[str], attempts: int = 3
f' matrix:\n'
f' include:\n'
f''
f' - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_0\n'
f' - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0\n'
f' HOROVOD_WITH_MPI: 1\n'
f' HOROVOD_WITHOUT_GLOO: 1\n'
f' TENSORFLOW: 1.15.0\n'
f' KERAS: 2.2.4\n'
f' PYTORCH: 1.6.0\n'
f' PYTORCH_LIGHTNING: 1.3.8\n'
f' TORCHVISION: 0.7.0\n'
f' MXNET: 1.5.0\n'
f' MXNET: 1.5.1.post0\n'
f'\n'
f' - image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_5_0\n'
f' - image: test-cpu-gloo-py3_8-tf2_5_1-keras2_5_0rc0-torch1_9_1-mxnet1_6_0\n'
f' HOROVOD_WITHOUT_MPI: 1\n'
f' HOROVOD_WITH_GLOO: 1\n'
f' TENSORFLOW: 2.5.1\n'
f' KERAS: 2.5.0rc0\n'
f' PYTORCH: 1.8.1\n'
f' PYTORCH: 1.9.1\n'
f' PYTORCH_LIGHTNING: 1.3.8\n'
f' TORCHVISION: 0.9.1\n'
f' MXNET: 1.5.0\n'
f' TORCHVISION: 0.10.1\n'
f' MXNET: 1.6.0\n'
f'\n'
f' - image: test-openmpi-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_5_0\n'
f'' # latest mxnet version 1.8.0.post0 does not compile for macos due to missing dnnl_config.h
f' - image: test-openmpi-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_7_0_p2\n'
f' HOROVOD_WITH_MPI: 1\n'
f' HOROVOD_WITH_GLOO: 1\n'
f' TENSORFLOW: 2.6.0\n'
f' KERAS: 2.6.0\n'
f' PYTORCH: 1.9.0\n'
f' PYTORCH: 1.10.0\n'
f' PYTORCH_LIGHTNING: 1.3.8\n'
f' TORCHVISION: 0.10.0\n'
f' MXNET: 1.5.0\n'
f' TORCHVISION: 0.11.0\n'
f' MXNET: 1.7.0.post2\n'
f'\n'
f' steps:\n'
f' - name: Checkout\n'
Expand Down
48 changes: 24 additions & 24 deletions .github/workflows/ci.yaml
Expand Up @@ -164,7 +164,7 @@ jobs:
fail-fast: false
matrix:
include:
- image: test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0
- image: test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0
Elastic_Spark_TensorFlow_Tests_2: true
Elastic_Tests_2: true
Gloo_Cluster_PyTests: true
Expand All @@ -185,7 +185,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark2_4_8
- image: test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark2_4_8
Elastic_Spark_TensorFlow_Tests_1: true
Elastic_Spark_Torch_Tests: true
Elastic_Tests_1: true
Expand All @@ -203,7 +203,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_7_0_p2-pyspark3_2_0
- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_7_0_p2-pyspark3_2_0
Elastic_Tests_1: true
Gloo_Cluster_PyTests: true
Gloo_MXNet_MNIST: true
Expand All @@ -219,7 +219,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2
- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_1_2
Elastic_Spark_TensorFlow_Tests_1: true
Elastic_Spark_Torch_Tests: true
Elastic_Tests_1: true
Expand All @@ -237,7 +237,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
Elastic_Spark_TensorFlow_Tests_1: true
Elastic_Spark_Torch_Tests: true
Elastic_Tests_1: true
Expand All @@ -255,7 +255,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
MPI_Cluster_PyTests: true
MPI_MXNet_MNIST: true
MPI_Parallel_PyTests: true
Expand All @@ -267,7 +267,7 @@ jobs:
Single_PyTorch_MNIST: true
build_timeout: 30

- image: test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
Elastic_Tests_1: true
Gloo_Cluster_PyTests: true
Gloo_MXNet_MNIST: true
Expand All @@ -291,7 +291,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
MPI_Cluster_PyTests: true
MPI_MXNet_MNIST: true
MPI_Parallel_PyTests: true
Expand All @@ -307,22 +307,22 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0
- image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0
build_timeout: 40

- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0
- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0
build_timeout: 40

- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_7_0_p1-pyspark3_2_0
- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_7_0_p1-pyspark3_2_0
build_timeout: 40

- image: test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_8_0_p0-pyspark3_2_0
- image: test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_8_0_p0-pyspark3_2_0
build_timeout: 40

- image: test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
build_timeout: 40

- image: test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
build_timeout: 40

steps:
Expand Down Expand Up @@ -3352,35 +3352,35 @@ jobs:
fail-fast: false
matrix:
include:
- image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_0
- image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0
HOROVOD_WITH_MPI: 1
HOROVOD_WITHOUT_GLOO: 1
TENSORFLOW: 1.15.0
KERAS: 2.2.4
PYTORCH: 1.6.0
PYTORCH_LIGHTNING: 1.3.8
TORCHVISION: 0.7.0
MXNET: 1.5.0
MXNET: 1.5.1.post0

- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_5_0
- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_5_0rc0-torch1_9_1-mxnet1_6_0
HOROVOD_WITHOUT_MPI: 1
HOROVOD_WITH_GLOO: 1
TENSORFLOW: 2.5.1
KERAS: 2.5.0rc0
PYTORCH: 1.8.1
PYTORCH: 1.9.1
PYTORCH_LIGHTNING: 1.3.8
TORCHVISION: 0.9.1
MXNET: 1.5.0
TORCHVISION: 0.10.1
MXNET: 1.6.0

- image: test-openmpi-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_5_0
- image: test-openmpi-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_7_0_p2
HOROVOD_WITH_MPI: 1
HOROVOD_WITH_GLOO: 1
TENSORFLOW: 2.6.0
KERAS: 2.6.0
PYTORCH: 1.9.0
PYTORCH: 1.10.0
PYTORCH_LIGHTNING: 1.3.8
TORCHVISION: 0.10.0
MXNET: 1.5.0
TORCHVISION: 0.11.0
MXNET: 1.7.0.post2

steps:
- name: Checkout
Expand Down

0 comments on commit df18797

Please sign in to comment.