Add pytorch 1.10.0 to test space, remove 1.6.0 #3291

Merged: 6 commits, Dec 11, 2021
Changes shown below are from 3 of the 6 commits.
.buildkite/gen-pipeline.sh (18 additions, 18 deletions)

@@ -7,7 +7,7 @@ set -eu
repository=823773083436.dkr.ecr.us-east-1.amazonaws.com/buildkite

# our baseline test is
baseline="test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0"
baseline="test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0"
# in run_gloo_integration we run 'Elastic Spark * Tests' for this baseline
# so it has to have Gloo mpi kind

@@ -17,48 +17,48 @@ code_files=$(python "$dir/get_changed_code_files.py" || echo failure)
tests=$(if [[ -n "${PIPELINE_MODE:-}" ]] && ( [[ "${BUILDKITE_BRANCH:-}" == "${BUILDKITE_PIPELINE_DEFAULT_BRANCH:-}" ]] || [[ -n "$code_files" ]] ); then
# we vary the baseline along the Python dimension and PySpark together
# run_gloo_integration expects these to have Gloo mpi kind to run 'Elastic Spark * Tests'
printf "test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark2_4_8 "
printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2 "
printf "test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark2_4_8 "
printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_1_2 "
# our baseline
printf "$baseline "

# then we vary the baseline along mpi kinds dimension
# our baseline again
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-oneccl-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-oneccl-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
# note: we test openmpi-gloo mpi kind in this variation in each of [cpu, gpu, mixed]
printf "test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "

# then we vary the baseline along the framework dimensions all together
# some frameworks are not available for our baseline Python version 3.8, so we use Python 3.7
# run_gloo_integration expects tf1 to have Gloo mpi kind to run 'Elastic Spark * Tests'
printf "test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0 "
# there is no mxnet-1.6.0.post0 and mxnet-1.6.0 does not work with horovod
# https://github.com/apache/incubator-mxnet/issues/16193
# however, there is an mxnet-cu101-1.6.0.post0, so we test this with gpu instead of cpu
#printf "test-cpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_7_0_p2-pyspark3_2_0 "
#printf "test-cpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_7_0_p2-pyspark3_2_0 "
# our baseline again
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
# printf "test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-cpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0 "

# then we vary the frameworks for gpu
printf "test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0 "
# this is required as we cannot test mxnet-1.6.0.post0 with cpu
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0 "
# we additionally test the previous framework combination (CUDA 10.x) with mxnet 1.7.x
# as mxnet 1.7.x only supports CUDA 10.x, but next framework combination targets CUDA 11.x
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_7_0_p1-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_7_0_p1-pyspark3_2_0 "
# we deviate from mxnet1_7_0_p2 here as other frameworks target CUDA 11.x and
# mxnet 1.7.x only supports CUDA 10.x, with mxnet 1.8.x we have CUDA 11.x packages
printf "test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0 "

# and one final test with mixed cpu+gpu
printf "test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0 "
printf "test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 "
fi | if [[ "${PIPELINE_MODE:-}" == "GPU"* ]]; then sed -E "s/[^ ]*-cpu-[^ ]*//g"; else cat; fi \
| if [[ "${PIPELINE_MODE:-}" == "GPU HEADS" ]]; then sed -E "s/ /\n/g" | grep -e "-tfhead-keras_none-torchhead-mxnethead-" | paste -s -d " " -; else cat; fi \
| if [[ "${PIPELINE_MODE:-}" == "GPU NON HEADS" ]]; then sed -E "s/[^ ]*-tfhead-keras_none-torchhead-mxnethead-[^ ]*//g"; else cat; fi)
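For context on how these image names are consumed: the sed/grep pipeline at the end of the hunk above trims the generated list according to PIPELINE_MODE. A rough, self-contained bash sketch of that filtering follows; the two image names and the "GPU HEADS" mode come from the script above, everything else is purely illustrative and not part of this change.

# Illustrative sketch of the PIPELINE_MODE filtering in gen-pipeline.sh
tests="test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0 "
PIPELINE_MODE="GPU HEADS"
# any GPU mode drops the *-cpu-* images
tests=$(echo "$tests" | sed -E "s/[^ ]*-cpu-[^ ]*//g")
# "GPU HEADS" additionally keeps only the head-version images
tests=$(echo "$tests" | sed -E "s/ /\n/g" | grep -e "-tfhead-keras_none-torchhead-mxnethead-" | paste -s -d " " -)
echo "$tests"  # -> test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0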
.github/workflows/ci.yaml (14 additions, 14 deletions)

@@ -164,7 +164,7 @@ jobs:
fail-fast: false
matrix:
include:
-- image: test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0
+- image: test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0
Elastic_Spark_TensorFlow_Tests_2: true
Elastic_Tests_2: true
Gloo_Cluster_PyTests: true
@@ -185,7 +185,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

-- image: test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark2_4_8
+- image: test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark2_4_8
Elastic_Spark_TensorFlow_Tests_1: true
Elastic_Spark_Torch_Tests: true
Elastic_Tests_1: true
@@ -203,7 +203,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

-- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_7_0_p2-pyspark3_2_0
+- image: test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_7_0_p2-pyspark3_2_0
Elastic_Tests_1: true
Gloo_Cluster_PyTests: true
Gloo_MXNet_MNIST: true
@@ -219,7 +219,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

-- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2
+- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_1_2
Elastic_Spark_TensorFlow_Tests_1: true
Elastic_Spark_Torch_Tests: true
Elastic_Tests_1: true
@@ -237,7 +237,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

-- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
+- image: test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
Elastic_Spark_TensorFlow_Tests_1: true
Elastic_Spark_Torch_Tests: true
Elastic_Tests_1: true
@@ -255,7 +255,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

-- image: test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
+- image: test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
MPI_Cluster_PyTests: true
MPI_MXNet_MNIST: true
MPI_Parallel_PyTests: true
@@ -267,7 +267,7 @@ jobs:
Single_PyTorch_MNIST: true
build_timeout: 30

-- image: test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
+- image: test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
Elastic_Tests_1: true
Gloo_Cluster_PyTests: true
Gloo_MXNet_MNIST: true
@@ -291,7 +291,7 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

-- image: test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
+- image: test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
MPI_Cluster_PyTests: true
MPI_MXNet_MNIST: true
MPI_Parallel_PyTests: true
@@ -307,22 +307,22 @@ jobs:
Spark_Torch_MNIST: true
build_timeout: 30

-- image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0
+- image: test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0
build_timeout: 40

-- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0
+- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0
build_timeout: 40

-- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_7_0_p1-pyspark3_2_0
+- image: test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_7_0_p1-pyspark3_2_0
build_timeout: 40

-- image: test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_8_0_p0-pyspark3_2_0
+- image: test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_8_0_p0-pyspark3_2_0
build_timeout: 40

-- image: test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
+- image: test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
build_timeout: 40

-- image: test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0
+- image: test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
build_timeout: 40

steps:
docker-compose.test.yml (36 additions, 36 deletions)

@@ -11,9 +11,9 @@ services:
PYTHON_VERSION: 3.8
TENSORFLOW_PACKAGE: tensorflow-cpu==2.6.0
KERAS_PACKAGE: keras==2.6.0
-PYTORCH_PACKAGE: torch==1.9.0+cpu
+PYTORCH_PACKAGE: torch==1.10.0+cpu
PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.3.8
-TORCHVISION_PACKAGE: torchvision==0.10.0+cpu
+TORCHVISION_PACKAGE: torchvision==0.11.1+cpu
MXNET_PACKAGE: mxnet==1.8.0.post0
PYSPARK_PACKAGE: pyspark==3.2.0
SPARK_PACKAGE: spark-3.2.0/spark-3.2.0-bin-hadoop2.7.tgz
@@ -22,57 +22,57 @@ services:
shm_size: 8gb

# our baseline first
-test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0:
+test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0:
extends: test-cpu-base
-test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0:
+test-cpu-mpich-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0:
extends: test-cpu-base
build:
args:
MPI_KIND: MPICH
HOROVOD_BUILD_FLAGS: HOROVOD_WITHOUT_GLOO=1
-test-cpu-oneccl-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0:
+test-cpu-oneccl-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0:
extends: test-cpu-base
build:
args:
MPI_KIND: ONECCL
HOROVOD_BUILD_FLAGS: HOROVOD_WITHOUT_GLOO=1
-test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0:
+test-cpu-openmpi-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0:
extends: test-cpu-base
build:
args:
MPI_KIND: OpenMPI
HOROVOD_BUILD_FLAGS: HOROVOD_WITHOUT_GLOO=1
-test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0:
+test-cpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0:
extends: test-cpu-base
build:
args:
MPI_KIND: OpenMPI

-test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0:
+test-cpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0:
extends: test-cpu-base
build:
args:
PYTHON_VERSION: 3.7
# there is no tensorflow-cpu>1.15.0, so we use tensorflow==1.15.5
TENSORFLOW_PACKAGE: tensorflow==1.15.5
KERAS_PACKAGE: keras==2.2.4
-PYTORCH_PACKAGE: torch==1.6.0+cpu
+PYTORCH_PACKAGE: torch==1.7.1+cpu
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
-TORCHVISION_PACKAGE: torchvision==0.7.0+cpu
+TORCHVISION_PACKAGE: torchvision==0.8.2+cpu
MXNET_PACKAGE: mxnet==1.5.1.post0
# there is no mxnet-1.6.0.post0 and mxnet-1.6.0 does not work with horovod
# https://github.com/apache/incubator-mxnet/issues/16193
# however, there is an mxnet-cu101-1.6.0.post0, so we test this with gpu instead of cpu
# this cpu test variation is defined as gpu in gpu frameworks variations below
-# test-cpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0:
-test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_7_0_p2-pyspark3_2_0:
+# test-cpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0:
+test-cpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_7_0_p2-pyspark3_2_0:
extends: test-cpu-base
build:
args:
TENSORFLOW_PACKAGE: tensorflow==2.5.1
KERAS_PACKAGE: keras==2.4.3
-PYTORCH_PACKAGE: torch==1.8.1+cpu
-TORCHVISION_PACKAGE: torchvision==0.9.1
+PYTORCH_PACKAGE: torch==1.9.1+cpu
+TORCHVISION_PACKAGE: torchvision==0.10.1
MXNET_PACKAGE: mxnet==1.7.0.post2
# then our baseline again, omitted ...
test-cpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0:
@@ -86,14 +86,14 @@ services:
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning
MXNET_PACKAGE: mxnet-nightly

-test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark2_4_8:
+test-cpu-gloo-py3_7-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark2_4_8:
extends: test-cpu-base
build:
args:
PYTHON_VERSION: 3.7
PYSPARK_PACKAGE: pyspark==2.4.8
SPARK_PACKAGE: spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
-test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_1_2:
+test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_1_2:
extends: test-cpu-base
build:
args:
Expand Down Expand Up @@ -123,8 +123,8 @@ services:
shm_size: 8gb

# okay to mix cuda 10.0 and 10.1 here as pytorch ships its own cuda libs
-# torch==1.6.0+cu101 requires torchvision==0.7.0+cu101
-test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_6_0-mxnet1_5_1_p0-pyspark3_2_0:
+# torch==1.7.1+cu101 requires torchvision==0.8.2+cu101
+test-gpu-gloo-py3_7-tf1_15_5-keras2_2_4-torch1_7_1-mxnet1_5_1_p0-pyspark3_2_0:
extends: test-gpu-base
build:
args:
Expand All @@ -134,12 +134,12 @@ services:
PYTHON_VERSION: 3.7
TENSORFLOW_PACKAGE: tensorflow-gpu==1.15.5
KERAS_PACKAGE: keras==2.2.4
-PYTORCH_PACKAGE: torch==1.6.0+cu101
+PYTORCH_PACKAGE: torch==1.7.1+cu101
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
-TORCHVISION_PACKAGE: torchvision==0.7.0+cu101
+TORCHVISION_PACKAGE: torchvision==0.8.2+cu101
MXNET_PACKAGE: mxnet-cu100==1.5.1.post0
# this is required as we cannot test mxnet-1.6.0.post0 with cpu
-test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_6_0_p0-pyspark3_2_0:
+test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_6_0_p0-pyspark3_2_0:
extends: test-gpu-base
build:
args:
Expand All @@ -148,13 +148,13 @@ services:
NCCL_VERSION_OVERRIDE: 2.7.8-1+cuda10.1
TENSORFLOW_PACKAGE: tensorflow-gpu==2.4.3
KERAS_PACKAGE: keras==2.3.1
-PYTORCH_PACKAGE: torch==1.7.1+cu101
+PYTORCH_PACKAGE: torch==1.8.1+cu101
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
-TORCHVISION_PACKAGE: torchvision==0.8.2+cu101
+TORCHVISION_PACKAGE: torchvision==0.9.1+cu101
MXNET_PACKAGE: mxnet-cu101==1.6.0.post0
# we additionally test the previous framework combination (CUDA 10.x) with mxnet 1.7.x
-# as mxnet 1.7.x only supports CUDA 10.x, but next framework combination targets CUAA 11.x
-test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_7_1-mxnet1_7_0_p1-pyspark3_2_0:
+# as mxnet 1.7.x only supports CUDA 10.x, but next framework combination targets CUDA 11.x
+test-gpu-gloo-py3_8-tf2_4_3-keras2_3_1-torch1_8_1-mxnet1_7_0_p1-pyspark3_2_0:
extends: test-gpu-base
build:
args:
Expand All @@ -163,13 +163,13 @@ services:
NCCL_VERSION_OVERRIDE: 2.7.8-1+cuda10.1
TENSORFLOW_PACKAGE: tensorflow-gpu==2.4.3
KERAS_PACKAGE: keras==2.3.1
-PYTORCH_PACKAGE: torch==1.7.1+cu101
+PYTORCH_PACKAGE: torch==1.8.1+cu101
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
-TORCHVISION_PACKAGE: torchvision==0.8.2+cu101
+TORCHVISION_PACKAGE: torchvision==0.9.1+cu101
MXNET_PACKAGE: mxnet-cu101==1.7.0.post1
# we deviate from mxnet1_7_0_p2 here as other frameworks target CUDA 11.x and
# mxnet 1.7.x only supports CUDA 10.x, with mxnet 1.8.x we have CUDA 11.x packages
-test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_8_1-mxnet1_8_0_p0-pyspark3_2_0:
+test-gpu-gloo-py3_8-tf2_5_1-keras2_4_3-torch1_9_1-mxnet1_8_0_p0-pyspark3_2_0:
extends: test-gpu-base
build:
args:
Expand All @@ -178,11 +178,11 @@ services:
NCCL_VERSION_OVERRIDE: 2.8.4-1+cuda11.2
TENSORFLOW_PACKAGE: tensorflow-gpu==2.5.1
KERAS_PACKAGE: keras==2.4.3
-PYTORCH_PACKAGE: torch==1.8.1+cu111
+PYTORCH_PACKAGE: torch==1.9.1+cu111
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
-TORCHVISION_PACKAGE: torchvision==0.9.1+cu111
+TORCHVISION_PACKAGE: torchvision==0.10.1+cu111
MXNET_PACKAGE: mxnet-cu112==1.8.0.post0
-test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0:
+test-gpu-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0:
extends: test-gpu-base
build:
args:
Expand All @@ -192,9 +192,9 @@ services:
MPI_KIND: OpenMPI
TENSORFLOW_PACKAGE: tensorflow-gpu==2.6.0
KERAS_PACKAGE: keras==2.6.0
-PYTORCH_PACKAGE: torch==1.9.0+cu111
+PYTORCH_PACKAGE: torch==1.10.0+cu111
PYTORCH_LIGHTNING_PACKAGE: pytorch-lightning==1.3.8
-TORCHVISION_PACKAGE: torchvision==0.10.0+cu111
+TORCHVISION_PACKAGE: torchvision==0.11.1+cu111
MXNET_PACKAGE: mxnet-cu112==1.8.0.post0
test-gpu-gloo-py3_8-tfhead-keras_none-torchhead-mxnethead-pyspark3_2_0:
extends: test-gpu-base
@@ -210,7 +210,7 @@ services:
TORCHVISION_PACKAGE: torchvision
MXNET_PACKAGE: mxnet-nightly-cu112

-test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_9_0-mxnet1_8_0_p0-pyspark3_2_0:
+test-mixed-openmpi-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0:
extends: test-gpu-base
build:
args:
Expand All @@ -220,9 +220,9 @@ services:
MPI_KIND: OpenMPI
TENSORFLOW_PACKAGE: tensorflow-gpu==2.6.0
KERAS_PACKAGE: keras==2.6.0
-PYTORCH_PACKAGE: torch==1.9.0+cu111
+PYTORCH_PACKAGE: torch==1.10.0+cu111
PYTORCH_LIGHTNING_PACKAGE: pytorch_lightning==1.3.8
-TORCHVISION_PACKAGE: torchvision==0.10.0+cu111
+TORCHVISION_PACKAGE: torchvision==0.11.1+cu111
MXNET_PACKAGE: mxnet-cu112==1.8.0.post0
HOROVOD_BUILD_FLAGS: ""
HOROVOD_MIXED_INSTALL: 1
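To sanity-check the renamed services locally, something along these lines should work. This is only a sketch, not the CI invocation: the actual test entrypoints live in the Buildkite scripts, and the python -c check is just an illustrative smoke test that may need adjusting to how the base service defines its entrypoint.

# Sketch: build the new CPU baseline service and confirm the torch version inside it
docker-compose -f docker-compose.test.yml build test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0
docker-compose -f docker-compose.test.yml run --rm test-cpu-gloo-py3_8-tf2_6_0-keras2_6_0-torch1_10_0-mxnet1_8_0_p0-pyspark3_2_0 \
  python -c "import torch; print(torch.__version__)"  # expected: 1.10.0+cpu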