Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pytorch 1.10.0 to test space, remove 1.6.0 #3291

Merged
merged 6 commits on Dec 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
36 changes: 18 additions & 18 deletions .buildkite/gen-pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -eu
repository=823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite

# our baseline test is
baseline="test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0"
baseline="test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0"
# in run_gloo_integration we run 'Elastic Spark * Tests' for this baseline
# so it has to have Gloo mpi kind

Expand All @@ -17,48 +17,48 @@ code_files=$(python "$dir/get_changed_code_files.py" || echo failure)
tests=$(if [[ -n "${PIPELINE_MODE:-}" ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${BUILDKITE_PIPELINE_DEFAULT_BRANCH:-}" ]] || [[ -n "$code_files" ]] ); then
# we vary the baseline along the Python dimension and PySpark together
# run_gloo_integration expects these to have Gloo mpi kind to run 'Elastic Spark * Tests'
printf "test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark2_4_8 "
printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2 "
printf "test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark2_4_8 "
printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_1_2 "
# our baseline
printf "$baseline "

# then we vary the baseline along mpi kinds dimension
# our baseline again
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-oneccl-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-oneccl-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
# note: we test openmpi-gloo mpi kind in this variation in each of [cpu, gpu, mixed]
printf "test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "

# then we vary the baseline along the framework dimensions all together
# some frameworks are not available for our baseline Python version 3.8, so we use Python 3.7
# run_gloo_integration expects tf1 to have Gloo mpi kind to run 'Elastic Spark * Tests'
printf "test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0 "
# there is no mxnet-1.6.0.post0 and mxnet-1.6.0 does not work with horovod
# https://github.com/apache/incubator-mxnet/issues/16193
# however, there is an mxnet-cu101-1.6.0.post0, so we test this with gpu instead of cpu
#printf "test-cpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_7_0_p2-pyspark3_2_0 "
#printf "test-cpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_7_0_p2-pyspark3_2_0 "
# our baseline again
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0 "

# then we vary the frameworks for gpu
printf "test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0 "
# this is required as we cannot test mxnet-1.6.0.post0 with cpu
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0 "
# we additionally test the previous framework combination (CUDA 10.x) with mxnet 1.7.x
# as mxnet 1.7.x only supports CUDA 10.x, but next framework combination targets CUDA 11.x
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_7_0_p1-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_7_0_p1-pyspark3_2_0 "
# we deviate from mxnet1_7_0_p2 here as other frameworks target CUDA 11.x and
# mxnet 1.7.x only supports CUDA 10.x, with mxnet 1.8.x we have CUDA 11.x packages
printf "test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0 "

# and one final test with mixed cpu+gpu
printf "test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
fi | if [[ "${PIPELINE_MODE:-}" == "GPU"* ]]; then sed -E "s/[^ ]*-cpu-[^ ]*//g"; else cat; fi \
| if [[ "${PIPELINE_MODE:-}" == "GPU HEADS" ]]; then sed -E "s/ /\n/g" | grep -e "-tfhead-keras_none-torchhead-mxnethead-" | paste -s -d " " -; else cat; fi \
| if [[ "${PIPELINE_MODE:-}" == "GPU NON HEADS" ]]; then sed -E "s/[^ ]*-tfhead-keras_none-torchhead-mxnethead-[^ ]*//g"; else cat; fi)
Expand Down
21 changes: 11 additions & 10 deletions .github/gen-workflow-ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,35 +441,36 @@ def build_and_test_macos(id: str, name: str, needs: List[str], attempts: int = 3
f' matrix:\n'
f' include:\n'
f''
f' - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_0\n'
f' - image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0\n'
f' HOROVOD_WITH_MPI: 1\n'
f' HOROVOD_WITHOUT_GLOO: 1\n'
f' TENSORFLOW: 1.15.0\n'
f' KERAS: 2.2.4\n'
f' PYTORCH: 1.6.0\n'
f' PYTORCH_LIGHTNING: 1.3.8\n'
f' TORCHVISION: 0.7.0\n'
f' MXNET: 1.5.0\n'
f' MXNET: 1.5.1.post0\n'
f'\n'
f' - image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_5_0\n'
f' - image: test-cpu-gloo-py3_8-tf2_5_1-keras2_5_0rc0-torch1_9_1-mxnet1_6_0\n'
f' HOROVOD_WITHOUT_MPI: 1\n'
f' HOROVOD_WITH_GLOO: 1\n'
f' TENSORFLOW: 2.5.1\n'
f' KERAS: 2.5.0rc0\n'
f' PYTORCH: 1.8.1\n'
f' PYTORCH: 1.9.1\n'
f' PYTORCH_LIGHTNING: 1.3.8\n'
f' TORCHVISION: 0.9.1\n'
f' MXNET: 1.5.0\n'
f' TORCHVISION: 0.10.1\n'
f' MXNET: 1.6.0\n'
f'\n'
f' - image: test-openmpi-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_5_0\n'
f'' # latest mxnet version 1.8.0.post0 does not compile for macos due to missing dnnl_config.h
f' - image: test-openmpi-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_7_0_p2\n'
f' HOROVOD_WITH_MPI: 1\n'
f' HOROVOD_WITH_GLOO: 1\n'
f' TENSORFLOW: 2.6.0\n'
f' KERAS: 2.6.0\n'
f' PYTORCH: 1.9.0\n'
f' PYTORCH: 1.10.0\n'
f' PYTORCH_LIGHTNING: 1.3.8\n'
f' TORCHVISION: 0.10.0\n'
f' MXNET: 1.5.0\n'
f' TORCHVISION: 0.11.0\n'
f' MXNET: 1.7.0.post2\n'
f'\n'
f' steps:\n'
f' - name: Checkout\n'
Expand Down
48 changes: 24 additions & 24 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ jobs:
fail-fast: false
matrix:
include:
- image: test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0
- image: test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0
Elastic_Spark_TensorFlow_Tests_2: true
Elastic_Tests_2: true
Gloo_Cluster_PyTests: true
Expand All @@ -185,7 +185,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark2_4_8
- image: test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark2_4_8
Elastic_Spark_TensorFlow_Tests_1: true
Elastic_Spark_Torch_Tests: true
Elastic_Tests_1: true
Expand All @@ -203,7 +203,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_7_0_p2-pyspark3_2_0
- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_7_0_p2-pyspark3_2_0
Elastic_Tests_1: true
Gloo_Cluster_PyTests: true
Gloo_MXNet_MNIST: true
Expand All @@ -219,7 +219,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2
- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_1_2
Elastic_Spark_TensorFlow_Tests_1: true
Elastic_Spark_Torch_Tests: true
Elastic_Tests_1: true
Expand All @@ -237,7 +237,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
Elastic_Spark_TensorFlow_Tests_1: true
Elastic_Spark_Torch_Tests: true
Elastic_Tests_1: true
Expand All @@ -255,7 +255,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
MPI_Cluster_PyTests: true
MPI_MXNet_MNIST: true
MPI_Parallel_PyTests: true
Expand All @@ -267,7 +267,7 @@ jobs:
Single_PyTorch_MNIST: true
build_timeout: 30

- image: test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
Elastic_Tests_1: true
Gloo_Cluster_PyTests: true
Gloo_MXNet_MNIST: true
Expand All @@ -291,7 +291,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
MPI_Cluster_PyTests: true
MPI_MXNet_MNIST: true
MPI_Parallel_PyTests: true
Expand All @@ -307,22 +307,22 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

- image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0
- image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0
build_timeout: 40

- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0
- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0
build_timeout: 40

- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_7_0_p1-pyspark3_2_0
- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_7_0_p1-pyspark3_2_0
build_timeout: 40

- image: test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_8_0_p0-pyspark3_2_0
- image: test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_8_0_p0-pyspark3_2_0
build_timeout: 40

- image: test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
build_timeout: 40

- image: test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
- image: test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
build_timeout: 40

steps:
Expand Down Expand Up @@ -3352,35 +3352,35 @@ jobs:
fail-fast: false
matrix:
include:
- image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_0
- image: test-cpu-openmpi-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0
HOROVOD_WITH_MPI: 1
HOROVOD_WITHOUT_GLOO: 1
TENSORFLOW: 1.15.0
KERAS: 2.2.4
PYTORCH: 1.6.0
PYTORCH_LIGHTNING: 1.3.8
TORCHVISION: 0.7.0
MXNET: 1.5.0
MXNET: 1.5.1.post0

- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_5_0
- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_5_0rc0-torch1_9_1-mxnet1_6_0
HOROVOD_WITHOUT_MPI: 1
HOROVOD_WITH_GLOO: 1
TENSORFLOW: 2.5.1
KERAS: 2.5.0rc0
PYTORCH: 1.8.1
PYTORCH: 1.9.1
PYTORCH_LIGHTNING: 1.3.8
TORCHVISION: 0.9.1
MXNET: 1.5.0
TORCHVISION: 0.10.1
MXNET: 1.6.0

- image: test-openmpi-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_5_0
- image: test-openmpi-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_7_0_p2
HOROVOD_WITH_MPI: 1
HOROVOD_WITH_GLOO: 1
TENSORFLOW: 2.6.0
KERAS: 2.6.0
PYTORCH: 1.9.0
PYTORCH: 1.10.0
PYTORCH_LIGHTNING: 1.3.8
TORCHVISION: 0.10.0
MXNET: 1.5.0
TORCHVISION: 0.11.0
MXNET: 1.7.0.post2

steps:
- name: Checkout
Expand Down