E2E: Replace outdated images with latest ones

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
kubeflow · Apr 25, 2024
commit 68a6741 (1 parent: 7345e33)
Show file tree

Hide file tree

Showing 7 changed files with 47 additions and 48 deletions.
diff --git a/examples/mpi/tensorflow-mnist.yaml b/examples/mpi/tensorflow-mnist.yaml
@@ -12,7 +12,7 @@ spec:
       template:
         spec:
           containers:
-          - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
+          - image: horovod/horovod:0.28.1
             name: mpi
             command:
             - mpirun
@@ -35,7 +35,7 @@ spec:
             - btl
             - ^openib
             - python
-            - /examples/tensorflow2_mnist.py
+            - /horovod/examples/tensorflow2/tensorflow2_mnist.py
             resources:
               limits:
                 cpu: 1
@@ -45,7 +45,7 @@ spec:
       template:
         spec:
           containers:
-          - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
+          - image: horovod/horovod:0.28.1
             name: mpi
             resources:
               limits:

diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml
@@ -11,10 +11,10 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "gloo"]
               # Comment out the below resources to use the CPU.
-              resources: 
+              resources:
                 limits:
                   nvidia.com/gpu: 1
     Worker:
@@ -24,9 +24,9 @@ spec:
         spec:
           containers: 
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "gloo"]
               # Comment out the below resources to use the CPU.
-              resources: 
+              resources:
                 limits:
                   nvidia.com/gpu: 1
diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml
@@ -11,7 +11,7 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "mpi"]
               # Comment out the below resources to use the CPU.
               resources: 
@@ -24,7 +24,7 @@ spec:
         spec:
           containers: 
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "mpi"]
               # Comment out the below resources to use the CPU.
               resources: 

diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml
@@ -11,7 +11,7 @@ spec:
         spec:
           containers:
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "nccl"]
               resources: 
                 limits:
@@ -23,7 +23,7 @@ spec:
         spec:
           containers: 
             - name: pytorch
-              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
+              image: kubeflow/pytorch-dist-mnist:latest
               args: ["--backend", "nccl"]
               resources: 
                 limits:

diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py
@@ -284,27 +284,27 @@ def get_tfjob_template(
 
     # Add Chief, PS, and Worker replicas to the TFJob.
     if num_chief_replicas is not None:
-        tfjob.spec.tf_replica_specs[constants.REPLICA_TYPE_CHIEF] = (
-            models.KubeflowOrgV1ReplicaSpec(
-                replicas=num_chief_replicas,
-                template=pod_template_spec,
-            )
+        tfjob.spec.tf_replica_specs[
+            constants.REPLICA_TYPE_CHIEF
+        ] = models.KubeflowOrgV1ReplicaSpec(
+            replicas=num_chief_replicas,
+            template=pod_template_spec,
         )
 
     if num_ps_replicas is not None:
-        tfjob.spec.tf_replica_specs[constants.REPLICA_TYPE_PS] = (
-            models.KubeflowOrgV1ReplicaSpec(
-                replicas=num_ps_replicas,
-                template=pod_template_spec,
-            )
+        tfjob.spec.tf_replica_specs[
+            constants.REPLICA_TYPE_PS
+        ] = models.KubeflowOrgV1ReplicaSpec(
+            replicas=num_ps_replicas,
+            template=pod_template_spec,
         )
 
     if num_workers is not None:
-        tfjob.spec.tf_replica_specs[constants.REPLICA_TYPE_WORKER] = (
-            models.KubeflowOrgV1ReplicaSpec(
-                replicas=num_workers,
-                template=pod_template_spec,
-            )
+        tfjob.spec.tf_replica_specs[
+            constants.REPLICA_TYPE_WORKER
+        ] = models.KubeflowOrgV1ReplicaSpec(
+            replicas=num_workers,
+            template=pod_template_spec,
         )
 
     return tfjob
@@ -343,19 +343,19 @@ def get_pytorchjob_template(
 
     # Create Master replica if that is set.
     if master_pod_template_spec:
-        pytorchjob.spec.pytorch_replica_specs[constants.REPLICA_TYPE_MASTER] = (
-            models.KubeflowOrgV1ReplicaSpec(
-                replicas=1,
-                template=master_pod_template_spec,
-            )
+        pytorchjob.spec.pytorch_replica_specs[
+            constants.REPLICA_TYPE_MASTER
+        ] = models.KubeflowOrgV1ReplicaSpec(
+            replicas=1,
+            template=master_pod_template_spec,
         )
     # If we don't define Master template, use the Worker template.
     else:
-        pytorchjob.spec.pytorch_replica_specs[constants.REPLICA_TYPE_MASTER] = (
-            models.KubeflowOrgV1ReplicaSpec(
-                replicas=1,
-                template=worker_pod_template_spec,
-            )
+        pytorchjob.spec.pytorch_replica_specs[
+            constants.REPLICA_TYPE_MASTER
+        ] = models.KubeflowOrgV1ReplicaSpec(
+            replicas=1,
+            template=worker_pod_template_spec,
         )
 
     # Create Worker with num_workers - 1 replicas.
@@ -364,11 +364,11 @@ def get_pytorchjob_template(
     # doesn't set RANK and WORLD_SIZE for PyTorchJob.
     # Ref issue: https://github.com/kubeflow/training-operator/issues/1991
     if num_workers > 1:
-        pytorchjob.spec.pytorch_replica_specs[constants.REPLICA_TYPE_WORKER] = (
-            models.KubeflowOrgV1ReplicaSpec(
-                replicas=num_workers - 1,
-                template=worker_pod_template_spec,
-            )
+        pytorchjob.spec.pytorch_replica_specs[
+            constants.REPLICA_TYPE_WORKER
+        ] = models.KubeflowOrgV1ReplicaSpec(
+            replicas=num_workers - 1,
+            template=worker_pod_template_spec,
         )
 
     return pytorchjob

diff --git a/sdk/python/test/e2e/test_e2e_mpijob.py b/sdk/python/test/e2e/test_e2e_mpijob.py
@@ -182,7 +182,7 @@ def generate_mpijob(
 def generate_containers() -> Tuple[V1Container, V1Container]:
     launcher_container = V1Container(
         name=CONTAINER_NAME,
-        image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
+        image="horovod/horovod:0.28.1",
         command=["mpirun"],
         args=[
             "-np",
@@ -202,9 +202,8 @@ def generate_containers() -> Tuple[V1Container, V1Container]:
             "-mca",
             "btl",
             "^openib",
-            # "python", "/examples/tensorflow2_mnist.py"]
             "python",
-            "/examples/pytorch_mnist.py",
+            "/horovod/examples/pytorch/pytorch_mnist.py",
             "--epochs",
             "1",
         ],
@@ -213,7 +212,7 @@ def generate_containers() -> Tuple[V1Container, V1Container]:
 
     worker_container = V1Container(
         name="mpi",
-        image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
+        image="horovod/horovod:0.28.1",
         resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
     )
 

diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py
@@ -264,7 +264,7 @@ def generate_pytorchjob(
 def generate_container() -> V1Container:
     return V1Container(
         name=CONTAINER_NAME,
-        image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
-        args=["--backend", "gloo"],
-        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
+        image="kubeflow/pytorch-dist-mnist:latest",
+        args=["--backend", "gloo", "--epochs", "1"],
+        resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "1"}),
     )