Skip to content

Commit

Permalink
E2E: Replace outdated images with latest ones
Browse files: browse the repository at this point in the history
Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
  • Loading branch information
tenzen-y committed Apr 25, 2024
1 parent 7345e33 commit 68a6741
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 48 deletions.
6 changes: 3 additions & 3 deletions examples/mpi/tensorflow-mnist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
template:
spec:
containers:
- image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
- image: horovod/horovod:0.28.1
name: mpi
command:
- mpirun
Expand All @@ -35,7 +35,7 @@ spec:
- btl
- ^openib
- python
- /examples/tensorflow2_mnist.py
- /horovod/examples/tensorflow2/tensorflow2_mnist.py
resources:
limits:
cpu: 1
Expand All @@ -45,7 +45,7 @@ spec:
template:
spec:
containers:
- image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
- image: horovod/horovod:0.28.1
name: mpi
resources:
limits:
Expand Down
8 changes: 4 additions & 4 deletions examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "gloo"]
# Comment out the below resources to use the CPU.
resources:
resources:
limits:
nvidia.com/gpu: 1
Worker:
Expand All @@ -24,9 +24,9 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "gloo"]
# Comment out the below resources to use the CPU.
resources:
resources:
limits:
nvidia.com/gpu: 1
4 changes: 2 additions & 2 deletions examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "mpi"]
# Comment out the below resources to use the CPU.
resources:
Expand All @@ -24,7 +24,7 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "mpi"]
# Comment out the below resources to use the CPU.
resources:
Expand Down
4 changes: 2 additions & 2 deletions examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "nccl"]
resources:
limits:
Expand All @@ -23,7 +23,7 @@ spec:
spec:
containers:
- name: pytorch
image: gcr.io/<your_project>/pytorch_dist_mnist:latest
image: kubeflow/pytorch-dist-mnist:latest
args: ["--backend", "nccl"]
resources:
limits:
Expand Down
60 changes: 30 additions & 30 deletions sdk/python/kubeflow/training/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,27 +284,27 @@ def get_tfjob_template(

# Add Chief, PS, and Worker replicas to the TFJob.
if num_chief_replicas is not None:
tfjob.spec.tf_replica_specs[constants.REPLICA_TYPE_CHIEF] = (
models.KubeflowOrgV1ReplicaSpec(
replicas=num_chief_replicas,
template=pod_template_spec,
)
tfjob.spec.tf_replica_specs[
constants.REPLICA_TYPE_CHIEF
] = models.KubeflowOrgV1ReplicaSpec(
replicas=num_chief_replicas,
template=pod_template_spec,
)

if num_ps_replicas is not None:
tfjob.spec.tf_replica_specs[constants.REPLICA_TYPE_PS] = (
models.KubeflowOrgV1ReplicaSpec(
replicas=num_ps_replicas,
template=pod_template_spec,
)
tfjob.spec.tf_replica_specs[
constants.REPLICA_TYPE_PS
] = models.KubeflowOrgV1ReplicaSpec(
replicas=num_ps_replicas,
template=pod_template_spec,
)

if num_workers is not None:
tfjob.spec.tf_replica_specs[constants.REPLICA_TYPE_WORKER] = (
models.KubeflowOrgV1ReplicaSpec(
replicas=num_workers,
template=pod_template_spec,
)
tfjob.spec.tf_replica_specs[
constants.REPLICA_TYPE_WORKER
] = models.KubeflowOrgV1ReplicaSpec(
replicas=num_workers,
template=pod_template_spec,
)

return tfjob
Expand Down Expand Up @@ -343,19 +343,19 @@ def get_pytorchjob_template(

# Create Master replica if that is set.
if master_pod_template_spec:
pytorchjob.spec.pytorch_replica_specs[constants.REPLICA_TYPE_MASTER] = (
models.KubeflowOrgV1ReplicaSpec(
replicas=1,
template=master_pod_template_spec,
)
pytorchjob.spec.pytorch_replica_specs[
constants.REPLICA_TYPE_MASTER
] = models.KubeflowOrgV1ReplicaSpec(
replicas=1,
template=master_pod_template_spec,
)
# If we don't define Master template, use the Worker template.
else:
pytorchjob.spec.pytorch_replica_specs[constants.REPLICA_TYPE_MASTER] = (
models.KubeflowOrgV1ReplicaSpec(
replicas=1,
template=worker_pod_template_spec,
)
pytorchjob.spec.pytorch_replica_specs[
constants.REPLICA_TYPE_MASTER
] = models.KubeflowOrgV1ReplicaSpec(
replicas=1,
template=worker_pod_template_spec,
)

# Create Worker with num_workers - 1 replicas.
Expand All @@ -364,11 +364,11 @@ def get_pytorchjob_template(
# doesn't set RANK and WORLD_SIZE for PyTorchJob.
# Ref issue: https://github.com/kubeflow/training-operator/issues/1991
if num_workers > 1:
pytorchjob.spec.pytorch_replica_specs[constants.REPLICA_TYPE_WORKER] = (
models.KubeflowOrgV1ReplicaSpec(
replicas=num_workers - 1,
template=worker_pod_template_spec,
)
pytorchjob.spec.pytorch_replica_specs[
constants.REPLICA_TYPE_WORKER
] = models.KubeflowOrgV1ReplicaSpec(
replicas=num_workers - 1,
template=worker_pod_template_spec,
)

return pytorchjob
Expand Down
7 changes: 3 additions & 4 deletions sdk/python/test/e2e/test_e2e_mpijob.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def generate_mpijob(
def generate_containers() -> Tuple[V1Container, V1Container]:
launcher_container = V1Container(
name=CONTAINER_NAME,
image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
image="horovod/horovod:0.28.1",
command=["mpirun"],
args=[
"-np",
Expand All @@ -202,9 +202,8 @@ def generate_containers() -> Tuple[V1Container, V1Container]:
"-mca",
"btl",
"^openib",
# "python", "/examples/tensorflow2_mnist.py"]
"python",
"/examples/pytorch_mnist.py",
"/horovod/examples/pytorch/pytorch_mnist.py",
"--epochs",
"1",
],
Expand All @@ -213,7 +212,7 @@ def generate_containers() -> Tuple[V1Container, V1Container]:

worker_container = V1Container(
name="mpi",
image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
image="horovod/horovod:0.28.1",
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
)

Expand Down
6 changes: 3 additions & 3 deletions sdk/python/test/e2e/test_e2e_pytorchjob.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def generate_pytorchjob(
def generate_container() -> V1Container:
return V1Container(
name=CONTAINER_NAME,
image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
args=["--backend", "gloo"],
resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
image="kubeflow/pytorch-dist-mnist:latest",
args=["--backend", "gloo", "--epochs", "1"],
resources=V1ResourceRequirements(limits={"memory": "2Gi", "cpu": "1"}),
)

0 comments on commit 68a6741

Please sign in to comment.