Skip to content

Commit

Permalink
Move buildkite agents to v5.7.2 (#3390)
Browse files (browse the repository at this point in the history)
Signed-off-by: Enrico Minack <github@enrico.minack.dev>
  • Loading branch information
EnricoMi committed Feb 1, 2022
1 parent fcd1af7 commit 215d0af
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 100 deletions.
35 changes: 20 additions & 15 deletions .buildkite/gen-pipeline.sh
@@ -6,6 +6,11 @@ set -eu
# our repository in AWS
repository=823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite

# our queues
cpu_queue="cpu-v572"
gpux2_queue="2x-gpu-v572"
gpux4_queue="4x-gpu-v572"

# our baseline test is
baseline="test-cpu-gloo-py3_8-tf2_7_0-keras2_7_0-torch1_10_1-mxnet1_9_0-pyspark3_2_0"
# in run_gloo_integration we run 'Elastic Spark * Tests' for this baseline
@@ -80,7 +85,7 @@ build_test() {
echo " retry:"
echo " automatic: true"
echo " agents:"
echo " queue: cpu"
echo " queue: ${cpu_queue}"
}
run_test() {
@@ -407,33 +412,33 @@ for test in ${tests[@]-}; do
if [[ ${test} == *-cpu-* ]]; then
# if gloo is specified, run gloo cpu unit tests and integration tests
if [[ ${test} == *-gloo* ]]; then
run_gloo ${test} "cpu"
run_gloo ${test} ${cpu_queue}
fi
# if oneCCL is specified, run some tests twice,
# once with mpirun_command_ofi, and once with mpirun_command_mpi
if [[ ${test} == *oneccl* ]]; then
# run mpi cpu unit tests and integration tests
run_mpi ${test} "cpu" ${oneccl_cmd_mpi}
run_mpi ${test} "cpu" ${oneccl_cmd_ofi}
run_mpi ${test} ${cpu_queue} ${oneccl_cmd_mpi}
run_mpi ${test} ${cpu_queue} ${oneccl_cmd_ofi}
# always run spark tests which use MPI and Gloo
run_spark_integration ${test} "cpu"
run_spark_integration ${test} ${cpu_queue}
# no runner application, world size = 1
run_single_integration ${test} "cpu" ${oneccl_cmd_mpi}
run_single_integration ${test} "cpu" ${oneccl_cmd_ofi}
run_single_integration ${test} ${cpu_queue} ${oneccl_cmd_mpi}
run_single_integration ${test} ${cpu_queue} ${oneccl_cmd_ofi}
else
# run mpi cpu unit tests and integration tests
if [[ ${test} == *mpi* ]]; then
run_mpi ${test} "cpu"
run_mpi ${test} ${cpu_queue}
fi
# always run spark tests which use MPI and Gloo
run_spark_integration ${test} "cpu"
run_spark_integration ${test} ${cpu_queue}
# no runner application, world size = 1
run_single_integration ${test} "cpu"
run_single_integration ${test} ${cpu_queue}
fi
fi
done
@@ -446,12 +451,12 @@ for test in ${tests[@]-}; do
if [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]]; then
# if gloo is specified, run gloo gpu unit tests
if [[ ${test} == *-gloo* ]]; then
run_gloo_pytest ${test} "4x-gpu-v510"
run_gloo_pytest ${test} ${gpux4_queue}
fi
# if mpi is specified, run mpi gpu unit tests
if [[ ${test} == *mpi* ]]; then
run_mpi_pytest ${test} "4x-gpu-v510"
run_mpi_pytest ${test} ${gpux4_queue}
fi
fi
done
@@ -464,14 +469,14 @@ for test in ${tests[@]-}; do
if [[ ${test} == *-gpu-* ]] || [[ ${test} == *-mixed-* ]]; then
# if gloo is specified, run gloo gpu integration tests
if [[ ${test} == *-gloo* ]]; then
run_gloo_integration ${test} "2x-gpu-v510"
run_gloo_integration ${test} ${gpux2_queue}
fi
# if mpi is specified, run mpi gpu integration tests
if [[ ${test} == *mpi* ]]; then
run_mpi_integration ${test} "2x-gpu-v510"
run_mpi_integration ${test} ${gpux2_queue}
fi
run_spark_integration ${test} "2x-gpu-v510"
run_spark_integration ${test} ${gpux2_queue}
fi
done
22 changes: 11 additions & 11 deletions test/single/data/expected_buildkite_gpu_heads_pipeline.yaml
@@ -13,7 +13,7 @@ steps:
retry:
automatic: true
agents:
queue: cpu
queue: cpu-v572
- wait
- wait
- label: ':pytest: Gloo Parallel PyTests (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
@@ -31,7 +31,7 @@ steps:
retry:
automatic: true
agents:
queue: 4x-gpu-v510
queue: 4x-gpu-v572
- label: ':pytest: Gloo Single PyTests (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
command: bash -c "HOROVOD_TEST_GPU=1 cd /horovod/test/single && (ls -1 test_*.py | xargs -n 1 /bin/bash /pytest_standalone.sh gloo)"
artifact_paths: "artifacts/**"
@@ -47,7 +47,7 @@ steps:
retry:
automatic: true
agents:
queue: 4x-gpu-v510
queue: 4x-gpu-v572
- label: ':pytest: Gloo Cluster PyTests (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
command: bash -c "HOROVOD_TEST_GPU=1 /etc/init.d/ssh start && cd /horovod/test/integration && pytest --forked -v --capture=fd --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.static.xml test_static_run.py"
artifact_paths: "artifacts/**"
@@ -63,7 +63,7 @@ steps:
retry:
automatic: true
agents:
queue: 4x-gpu-v510
queue: 4x-gpu-v572
- wait
- label: ':tensorflow: Gloo TensorFlow 2.0 MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
command: horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_mnist.py
@@ -80,7 +80,7 @@ steps:
retry:
automatic: true
agents:
queue: 2x-gpu-v510
queue: 2x-gpu-v572
- label: ':tensorflow: Gloo TensorFlow 2.0 Keras MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
command: horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/tensorflow2/tensorflow2_keras_mnist.py
artifact_paths: "artifacts/**"
@@ -96,7 +96,7 @@ steps:
retry:
automatic: true
agents:
queue: 2x-gpu-v510
queue: 2x-gpu-v572
- label: ':fire: Gloo PyTorch MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
command: horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/pytorch/pytorch_mnist.py --data-dir /data/pytorch_datasets
artifact_paths: "artifacts/**"
@@ -112,7 +112,7 @@ steps:
retry:
automatic: true
agents:
queue: 2x-gpu-v510
queue: 2x-gpu-v572
- label: ':muscle: Gloo MXNet2 MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
command: horovodrun -np 2 -H localhost:2 --gloo python /horovod/examples/mxnet/mxnet2_mnist.py
artifact_paths: "artifacts/**"
@@ -128,7 +128,7 @@ steps:
retry:
automatic: true
agents:
queue: 2x-gpu-v510
queue: 2x-gpu-v572
- label: ':factory: Elastic Tests (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
command: bash -c "cd /horovod/test/integration && HOROVOD_LOG_LEVEL=DEBUG pytest --forked -v --log-cli-level 10 --log-cli-format '[%(asctime)-15s %(levelname)s %(filename)s:%(lineno)d %(funcName)s()] %(message)s' --capture=no --continue-on-collection-errors --junit-xml=/artifacts/junit.gloo.elastic.xml test_elastic_torch.py test_elastic_tensorflow2.py"
artifact_paths: "artifacts/**"
@@ -144,7 +144,7 @@ steps:
retry:
automatic: true
agents:
queue: 2x-gpu-v510
queue: 2x-gpu-v572
- label: ':spark: Spark Torch MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
command: bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
artifact_paths: "artifacts/**"
@@ -160,7 +160,7 @@ steps:
retry:
automatic: true
agents:
queue: 2x-gpu-v510
queue: 2x-gpu-v572
- label: ':spark: Spark Lightning MNIST (test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0)'
command: bash -c "OMP_NUM_THREADS=1 /spark_env.sh python /horovod/examples/spark/pytorch/pytorch_lightning_spark_mnist.py --num-proc 2 --work-dir /work --data-dir /data --epochs 3"
artifact_paths: "artifacts/**"
@@ -176,4 +176,4 @@ steps:
retry:
automatic: true
agents:
queue: 2x-gpu-v510
queue: 2x-gpu-v572

0 comments on commit 215d0af

Please sign in to comment.