From 5a5f92dcef17f5664ce2483cb3a25fe888d92654 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Tue, 14 Feb 2023 00:06:24 +0530 Subject: [PATCH] Cherry pick commits (#1757) * Add validation for verifying that the CustomJob (e.g., TFJob) name meets DNS1035 (#1748) Signed-off-by: Yuki Iwai * Fix the success condition of the job in PyTorchJob's Elastic mode. (#1752) Signed-off-by: Syulin7 <735122171@qq.com> * Set the default value of CleanPodPolicy to None (#1754) Signed-off-by: Syulin7 <735122171@qq.com> * Update mpijob_controller.go (#1755) fix typo TFJob, should be MPIJob --------- Signed-off-by: Yuki Iwai Signed-off-by: Syulin7 <735122171@qq.com> Co-authored-by: Yuki Iwai Co-authored-by: yu lin <37265556+Syulin7@users.noreply.github.com> Co-authored-by: Yasser Shalabi --- pkg/apis/kubeflow.org/v1/defaulting_utils.go | 4 + pkg/apis/kubeflow.org/v1/mxnet_defaults.go | 4 +- .../kubeflow.org/v1/mxnet_defaults_test.go | 35 ++- pkg/apis/kubeflow.org/v1/mxnet_validation.go | 17 +- .../kubeflow.org/v1/mxnet_validation_test.go | 184 +++++++++++---- .../v1/paddlepaddle_validation.go | 18 +- .../v1/paddlepaddle_validation_test.go | 155 ++++++++++--- .../kubeflow.org/v1/pytorch_validation.go | 17 +- .../v1/pytorch_validation_test.go | 186 +++++++++++---- .../kubeflow.org/v1/tensorflow_defaults.go | 4 +- .../v1/tensorflow_defaults_test.go | 13 +- .../kubeflow.org/v1/tensorflow_validation.go | 21 +- .../v1/tensorflow_validation_test.go | 163 +++++++++---- pkg/apis/kubeflow.org/v1/xgboost_defaults.go | 4 +- .../kubeflow.org/v1/xgboost_defaults_test.go | 34 ++- .../kubeflow.org/v1/xgboost_validation.go | 17 +- .../v1/xgboost_validation_test.go | 216 +++++++++++++----- pkg/controller.v1/mpi/mpijob_controller.go | 2 +- pkg/controller.v1/mxnet/mxjob_controller.go | 2 +- .../paddlepaddle/paddlepaddle_controller.go | 2 +- .../pytorch/pytorchjob_controller.go | 7 +- .../tensorflow/tfjob_controller.go | 2 +- .../xgboost/xgboostjob_controller.go | 2 +- 23 files changed, 840 insertions(+), 269 deletions(-) diff --git a/pkg/apis/kubeflow.org/v1/defaulting_utils.go b/pkg/apis/kubeflow.org/v1/defaulting_utils.go index 538eb82d66..8898b5dc07 100644 --- a/pkg/apis/kubeflow.org/v1/defaulting_utils.go +++ b/pkg/apis/kubeflow.org/v1/defaulting_utils.go @@ -58,3 +58,7 @@ func setTypeNameToCamelCase(replicaSpecs map[commonv1.ReplicaType]*commonv1.Repl } } } + +func cleanPodPolicyPointer(cleanPodPolicy commonv1.CleanPodPolicy) *commonv1.CleanPodPolicy { + return &cleanPodPolicy +} diff --git a/pkg/apis/kubeflow.org/v1/mxnet_defaults.go b/pkg/apis/kubeflow.org/v1/mxnet_defaults.go index 235fcf8cad..2c684c9668 100644 --- a/pkg/apis/kubeflow.org/v1/mxnet_defaults.go +++ b/pkg/apis/kubeflow.org/v1/mxnet_defaults.go @@ -46,9 +46,9 @@ func setMXNetTypeNamesToCamelCase(mxJob *MXJob) { // SetDefaults_MXJob sets any unspecified values to defaults. func SetDefaults_MXJob(mxjob *MXJob) { - // Set default cleanpod policy to All. + // Set default cleanpod policy to None. if mxjob.Spec.RunPolicy.CleanPodPolicy == nil { - all := commonv1.CleanPodPolicyAll + all := commonv1.CleanPodPolicyNone mxjob.Spec.RunPolicy.CleanPodPolicy = &all } diff --git a/pkg/apis/kubeflow.org/v1/mxnet_defaults_test.go b/pkg/apis/kubeflow.org/v1/mxnet_defaults_test.go index 7d2a277e92..0e5ba92581 100644 --- a/pkg/apis/kubeflow.org/v1/mxnet_defaults_test.go +++ b/pkg/apis/kubeflow.org/v1/mxnet_defaults_test.go @@ -96,7 +96,7 @@ func TestSetDefaults_MXJob(t *testing.T) { }, }, }, - expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort), + expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort), }, "Set spec with restart policy": { original: &MXJob{ @@ -118,7 +118,7 @@ func TestSetDefaults_MXJob(t *testing.T) { }, }, }, - expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, commonv1.RestartPolicyOnFailure, 1, MXJobDefaultPortName, MXJobDefaultPort), + expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, commonv1.RestartPolicyOnFailure, 1, MXJobDefaultPortName, MXJobDefaultPort), }, "Set spec with replicas": { original: &MXJob{ @@ -140,7 +140,7 @@ func TestSetDefaults_MXJob(t *testing.T) { }, }, }, - expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 3, MXJobDefaultPortName, MXJobDefaultPort), + expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, MXJobDefaultRestartPolicy, 3, MXJobDefaultPortName, MXJobDefaultPort), }, "Set spec with default node port name and port": { @@ -168,7 +168,7 @@ func TestSetDefaults_MXJob(t *testing.T) { }, }, }, - expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort), + expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort), }, "Set spec with node port": { @@ -196,7 +196,32 @@ func TestSetDefaults_MXJob(t *testing.T) { }, }, }, - expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, 9999), + expected: expectedMXNetJob(commonv1.CleanPodPolicyNone, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, 9999), + }, + + "set spec with cleanpod policy": { + original: &MXJob{ + Spec: MXJobSpec{ + RunPolicy: commonv1.RunPolicy{ + CleanPodPolicy: cleanPodPolicyPointer(commonv1.CleanPodPolicyAll), + }, + MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + MXJobReplicaTypeWorker: &commonv1.ReplicaSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + corev1.Container{ + Name: MXJobDefaultContainerName, + Image: testImage, + }, + }, + }, + }, + }, + }, + }, + }, + expected: expectedMXNetJob(commonv1.CleanPodPolicyAll, MXJobDefaultRestartPolicy, 1, MXJobDefaultPortName, MXJobDefaultPort), }, } diff --git a/pkg/apis/kubeflow.org/v1/mxnet_validation.go b/pkg/apis/kubeflow.org/v1/mxnet_validation.go index b88e3e2dac..943678af40 100644 --- a/pkg/apis/kubeflow.org/v1/mxnet_validation.go +++ b/pkg/apis/kubeflow.org/v1/mxnet_validation.go @@ -16,14 +16,21 @@ package v1 import ( "fmt" - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" log "github.com/sirupsen/logrus" + apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation" ) -// ValidateV1MXJobSpec checks that the kubeflowv1.MXJobSpec is valid. -func ValidateV1MXJobSpec(c *MXJobSpec) error { - return validateMXNetReplicaSpecs(c.MXReplicaSpecs) +// ValidateV1MXJob checks that the kubeflowv1.MXJobSpec is valid. +func ValidateV1MXJob(mxJob *MXJob) error { + if errors := apimachineryvalidation.NameIsDNS1035Label(mxJob.ObjectMeta.Name, false); errors != nil { + return fmt.Errorf("MXJob name is invalid: %v", errors) + } + if err := validateMXReplicaSpecs(mxJob.Spec.MXReplicaSpecs); err != nil { + return err + } + return nil } // IsScheduler returns true if the type is Scheduler. @@ -31,7 +38,7 @@ func IsScheduler(typ commonv1.ReplicaType) bool { return typ == MXJobReplicaTypeScheduler } -func validateMXNetReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { +func validateMXReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { if specs == nil { return fmt.Errorf("MXJobSpec is not valid") } diff --git a/pkg/apis/kubeflow.org/v1/mxnet_validation_test.go b/pkg/apis/kubeflow.org/v1/mxnet_validation_test.go index 94ea5244cd..addfd9273a 100644 --- a/pkg/apis/kubeflow.org/v1/mxnet_validation_test.go +++ b/pkg/apis/kubeflow.org/v1/mxnet_validation_test.go @@ -18,72 +18,176 @@ import ( "testing" commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - v1 "k8s.io/api/core/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" ) -func TestValidateV1MXJobSpec(t *testing.T) { - testCases := []MXJobSpec{ - { - MXReplicaSpecs: nil, +func TestValidateV1MXJob(t *testing.T) { + validMXReplicaSpecs := map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + MXJobReplicaTypeScheduler: { + Replicas: pointer.Int32(1), + RestartPolicy: commonv1.RestartPolicyNever, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "mxnet", + Image: "mxjob/mxnet", + }}, + }, + }, + }, + MXJobReplicaTypeServer: { + Replicas: pointer.Int32(1), + RestartPolicy: commonv1.RestartPolicyNever, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "mxnet", + Image: "mxjob/mxnet", + }}, + }, + }, }, - { - MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - MXJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{}, + MXJobReplicaTypeWorker: { + Replicas: pointer.Int32(1), + RestartPolicy: commonv1.RestartPolicyNever, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "mxnet", + Image: "mxjob/mxnet", + Command: []string{"python"}, + Args: []string{ + "/incubator-mxnet/example/image-classification/train_mnist.py", + "--num-epochs=10", + "--num-layers=2", + "--kv-store=dist_device_sync", }, - }, + }}, + }, + }, + }, + } + + testCases := map[string]struct { + MXJob *MXJob + wantErr bool + }{ + "valid mxJob": { + MXJob: &MXJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: MXJobSpec{ + MXReplicaSpecs: validMXReplicaSpecs, + }, + }, + wantErr: false, + }, + "mxReplicaSpecs is nil": { + MXJob: &MXJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", }, }, + wantErr: true, }, - { - MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - MXJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - v1.Container{ - Image: "", + "mxJob name does not meet DNS1035": { + MXJob: &MXJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "10test", + }, + Spec: MXJobSpec{ + MXReplicaSpecs: validMXReplicaSpecs, + }, + }, + wantErr: true, + }, + "no containers": { + MXJob: &MXJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: MXJobSpec{ + MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + MXJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{}, }, }, }, }, }, }, + wantErr: true, }, - { - MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - MXJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - v1.Container{ - Name: "", - Image: "mxjob/mxnet:gpu", + "image is empty": { + MXJob: &MXJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: MXJobSpec{ + MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + MXJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "mxnet", + Image: "", + }}, }, }, }, }, }, }, + wantErr: true, }, - { - MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - MXJobReplicaTypeScheduler: &commonv1.ReplicaSpec{ - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{}, + "mxnet default container name doesn't find": { + MXJob: &MXJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: MXJobSpec{ + MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + MXJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "", + Image: "mxjob/mxnet:gpu", + }}, + }, + }, }, }, }, }, + wantErr: true, + }, + "replicaSpec is nil": { + MXJob: &MXJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: MXJobSpec{ + MXReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + MXJobReplicaTypeScheduler: nil, + }, + }, + }, + wantErr: true, }, } - for _, c := range testCases { - err := ValidateV1MXJobSpec(&c) - if err.Error() != "MXJobSpec is not valid" { - t.Error("Failed validate the alpha2.MXJobSpec") - } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + got := ValidateV1MXJob(tc.MXJob) + if (got != nil) != tc.wantErr { + t.Fatalf("ValidateV1MXJob() error = %v, wantErr %v", got, tc.wantErr) + } + }) } } diff --git a/pkg/apis/kubeflow.org/v1/paddlepaddle_validation.go b/pkg/apis/kubeflow.org/v1/paddlepaddle_validation.go index 8969bda032..57310710ce 100644 --- a/pkg/apis/kubeflow.org/v1/paddlepaddle_validation.go +++ b/pkg/apis/kubeflow.org/v1/paddlepaddle_validation.go @@ -18,13 +18,24 @@ import ( "fmt" commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation" ) -func ValidateV1PaddleJobSpec(c *PaddleJobSpec) error { - if c.PaddleReplicaSpecs == nil { +func ValidateV1PaddleJob(paddleJob *PaddleJob) error { + if errors := apimachineryvalidation.NameIsDNS1035Label(paddleJob.ObjectMeta.Name, false); errors != nil { + return fmt.Errorf("PaddleJob name is invalid: %v", errors) + } + if err := validatePaddleReplicaSpecs(paddleJob.Spec.PaddleReplicaSpecs); err != nil { + return err + } + return nil +} + +func validatePaddleReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { + if specs == nil { return fmt.Errorf("PaddleJobSpec is not valid") } - for rType, value := range c.PaddleReplicaSpecs { + for rType, value := range specs { if value == nil || len(value.Template.Spec.Containers) == 0 { return fmt.Errorf("PaddleJobSpec is not valid: containers definition expected in %v", rType) } @@ -63,5 +74,4 @@ func ValidateV1PaddleJobSpec(c *PaddleJobSpec) error { } return nil - } diff --git a/pkg/apis/kubeflow.org/v1/paddlepaddle_validation_test.go b/pkg/apis/kubeflow.org/v1/paddlepaddle_validation_test.go index a09002582e..cdc06d8bb9 100644 --- a/pkg/apis/kubeflow.org/v1/paddlepaddle_validation_test.go +++ b/pkg/apis/kubeflow.org/v1/paddlepaddle_validation_test.go @@ -18,61 +18,150 @@ import ( "testing" commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - v1 "k8s.io/api/core/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" ) -func TestValidateV1PaddleJobSpec(t *testing.T) { - testCases := []PaddleJobSpec{ - { - PaddleReplicaSpecs: nil, +func TestValidateV1PaddleJob(t *testing.T) { + validPaddleReplicaSpecs := map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PaddleJobReplicaTypeWorker: { + Replicas: pointer.Int32(2), + RestartPolicy: commonv1.RestartPolicyNever, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "paddle", + Image: "registry.baidubce.com/paddlepaddle/paddle:2.4.0rc0-cpu", + Command: []string{"python"}, + Args: []string{ + "-m", + "paddle.distributed.launch", + "run_check", + }, + Ports: []corev1.ContainerPort{{ + Name: "master", + ContainerPort: int32(37777), + }}, + ImagePullPolicy: corev1.PullAlways, + }}, + }, + }, + }, + } + + testCases := map[string]struct { + paddleJob *PaddleJob + wantErr bool + }{ + "valid paddleJob": { + paddleJob: &PaddleJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PaddleJobSpec{ + PaddleReplicaSpecs: validPaddleReplicaSpecs, + }, + }, + wantErr: false, }, - { - PaddleReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - PaddleJobReplicaTypeWorker: { - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{}, + "paddleJob name does not meet DNS1035": { + paddleJob: &PaddleJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "__test", + }, + Spec: PaddleJobSpec{ + PaddleReplicaSpecs: validPaddleReplicaSpecs, + }, + }, + wantErr: true, + }, + "no containers": { + paddleJob: &PaddleJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PaddleJobSpec{ + PaddleReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PaddleJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{}, + }, + }, }, }, }, }, + wantErr: true, }, - { - PaddleReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - PaddleJobReplicaTypeWorker: { - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Image: "", + "image is empty": { + paddleJob: &PaddleJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PaddleJobSpec{ + PaddleReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PaddleJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "paddle", + Image: "", + }, + }, }, }, }, }, }, }, + wantErr: true, }, - { - PaddleReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - PaddleJobReplicaTypeWorker: { - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "", - Image: "gcr.io/kubeflow-ci/paddle-dist-mnist_test:1.0", + "paddle default container name doesn't find": { + paddleJob: &PaddleJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PaddleJobSpec{ + PaddleReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PaddleJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "", + Image: "gcr.io/kubeflow-ci/paddle-dist-mnist_test:1.0", + }, + }, }, }, }, }, }, }, + wantErr: true, + }, + "replicaSpec is nil": { + paddleJob: &PaddleJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PaddleJobSpec{ + PaddleReplicaSpecs: nil, + }, + }, + wantErr: true, }, } - for _, c := range testCases { - err := ValidateV1PaddleJobSpec(&c) - if err == nil { - t.Error("Failed validate the v1.PaddleJobSpec") - } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + got := ValidateV1PaddleJob(tc.paddleJob) + if (got != nil) != tc.wantErr { + t.Fatalf("ValidateV1PaddleJob() error = %v, wantErr %v", got, tc.wantErr) + } + }) } } diff --git a/pkg/apis/kubeflow.org/v1/pytorch_validation.go b/pkg/apis/kubeflow.org/v1/pytorch_validation.go index 2528c3ecd9..e9869ece1c 100644 --- a/pkg/apis/kubeflow.org/v1/pytorch_validation.go +++ b/pkg/apis/kubeflow.org/v1/pytorch_validation.go @@ -18,13 +18,24 @@ import ( "fmt" commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation" ) -func ValidateV1PyTorchJobSpec(c *PyTorchJobSpec) error { - if c.PyTorchReplicaSpecs == nil { +func ValidateV1PyTorchJob(pytorchJob *PyTorchJob) error { + if errors := apimachineryvalidation.NameIsDNS1035Label(pytorchJob.ObjectMeta.Name, false); errors != nil { + return fmt.Errorf("PyTorchJob name is invalid: %v", errors) + } + if err := validatePyTorchReplicaSpecs(pytorchJob.Spec.PyTorchReplicaSpecs); err != nil { + return err + } + return nil +} + +func validatePyTorchReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { + if specs == nil { return fmt.Errorf("PyTorchJobSpec is not valid") } - for rType, value := range c.PyTorchReplicaSpecs { + for rType, value := range specs { if value == nil || len(value.Template.Spec.Containers) == 0 { return fmt.Errorf("PyTorchJobSpec is not valid: containers definition expected in %v", rType) } diff --git a/pkg/apis/kubeflow.org/v1/pytorch_validation_test.go b/pkg/apis/kubeflow.org/v1/pytorch_validation_test.go index d8a1b20570..6428215d9b 100644 --- a/pkg/apis/kubeflow.org/v1/pytorch_validation_test.go +++ b/pkg/apis/kubeflow.org/v1/pytorch_validation_test.go @@ -15,82 +15,180 @@ package v1 import ( - "k8s.io/utils/pointer" "testing" commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" - v1 "k8s.io/api/core/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" ) -func TestValidateV1PyTorchJobSpec(t *testing.T) { - testCases := []PyTorchJobSpec{ - { - PyTorchReplicaSpecs: nil, +func TestValidateV1PyTorchJob(t *testing.T) { + validPyTorchReplicaSpecs := map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PyTorchJobReplicaTypeMaster: { + Replicas: pointer.Int32(1), + RestartPolicy: commonv1.RestartPolicyOnFailure, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "pytorch", + Image: "docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727", + ImagePullPolicy: corev1.PullAlways, + Command: []string{ + "python3", + "/opt/pytorch-mnist/mnist.py", + "--epochs=1", + }, + }}, + }, + }, + }, + PyTorchJobReplicaTypeWorker: { + Replicas: pointer.Int32(1), + RestartPolicy: commonv1.RestartPolicyOnFailure, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "pytorch", + Image: "docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727", + ImagePullPolicy: corev1.PullAlways, + Command: []string{ + "python3", + "/opt/pytorch-mnist/mnist.py", + "--epochs=1", + }, + }}, + }, + }, + }, + } + + testCases := map[string]struct { + pytorchJob *PyTorchJob + wantErr bool + }{ + "valid PyTorchJob": { + pytorchJob: &PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PyTorchJobSpec{ + PyTorchReplicaSpecs: validPyTorchReplicaSpecs, + }, + }, + wantErr: false, }, - { - PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - PyTorchJobReplicaTypeWorker: { - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{}, + "pytorchJob name does not meet DNS1035": { + pytorchJob: &PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "0-test", + }, + Spec: PyTorchJobSpec{ + PyTorchReplicaSpecs: validPyTorchReplicaSpecs, + }, + }, + wantErr: true, + }, + "no containers": { + pytorchJob: &PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PyTorchJobSpec{ + PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PyTorchJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{}, + }, + }, }, }, }, }, + wantErr: true, }, - { - PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - PyTorchJobReplicaTypeWorker: { - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Image: "", + "image is empty": { + pytorchJob: &PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PyTorchJobSpec{ + PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PyTorchJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "pytorch", + Image: "", + }, + }, }, }, }, }, }, }, + wantErr: true, }, - { - PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - PyTorchJobReplicaTypeWorker: { - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "", - Image: "gcr.io/kubeflow-ci/pytorch-dist-mnist_test:1.0", + "pytorchJob default container name doesn't present": { + pytorchJob: &PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PyTorchJobSpec{ + PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PyTorchJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "", + Image: "gcr.io/kubeflow-ci/pytorch-dist-mnist_test:1.0", + }, + }, }, }, }, }, }, }, + wantErr: true, }, - { - PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - PyTorchJobReplicaTypeMaster: { - Replicas: pointer.Int32(2), - Template: v1.PodTemplateSpec{ - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: "pytorch", - Image: "gcr.io/kubeflow-ci/pytorch-dist-mnist_test:1.0", + "the number of replicas in masterReplica is other than 1": { + pytorchJob: &PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: PyTorchJobSpec{ + PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PyTorchJobReplicaTypeMaster: { + Replicas: pointer.Int32(2), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "pytorch", + Image: "gcr.io/kubeflow-ci/pytorch-dist-mnist_test:1.0", + }, + }, }, }, }, }, }, }, + wantErr: true, }, } - for _, c := range testCases { - err := ValidateV1PyTorchJobSpec(&c) - if err == nil { - t.Error("Failed validate the v1.PyTorchJobSpec") - } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + got := ValidateV1PyTorchJob(tc.pytorchJob) + if (got != nil) != tc.wantErr { + t.Fatalf("ValidateV1PyTorchJob() error = %v, wantErr %v", got, tc.wantErr) + } + }) } } diff --git a/pkg/apis/kubeflow.org/v1/tensorflow_defaults.go b/pkg/apis/kubeflow.org/v1/tensorflow_defaults.go index 9ffa758473..88ae0c1c6b 100644 --- a/pkg/apis/kubeflow.org/v1/tensorflow_defaults.go +++ b/pkg/apis/kubeflow.org/v1/tensorflow_defaults.go @@ -50,9 +50,9 @@ func setTensorflowTypeNamesToCamelCase(tfJob *TFJob) { // SetDefaults_TFJob sets any unspecified values to defaults. func SetDefaults_TFJob(tfJob *TFJob) { - // Set default cleanpod policy to Running. + // Set default cleanpod policy to None. if tfJob.Spec.RunPolicy.CleanPodPolicy == nil { - running := commonv1.CleanPodPolicyRunning + running := commonv1.CleanPodPolicyNone tfJob.Spec.RunPolicy.CleanPodPolicy = &running } // Set default success policy to "". diff --git a/pkg/apis/kubeflow.org/v1/tensorflow_defaults_test.go b/pkg/apis/kubeflow.org/v1/tensorflow_defaults_test.go index 49361c84c0..3fa2236490 100644 --- a/pkg/apis/kubeflow.org/v1/tensorflow_defaults_test.go +++ b/pkg/apis/kubeflow.org/v1/tensorflow_defaults_test.go @@ -151,7 +151,7 @@ func TestSetDefaultTFJob(t *testing.T) { }, }, }, - expected: expectedTFJob(commonv1.CleanPodPolicyRunning, customRestartPolicy, TFJobDefaultPortName, TFJobDefaultPort), + expected: expectedTFJob(commonv1.CleanPodPolicyNone, customRestartPolicy, TFJobDefaultPortName, TFJobDefaultPort), }, "set replicas with default restartpolicy": { original: &TFJob{ @@ -178,7 +178,7 @@ func TestSetDefaultTFJob(t *testing.T) { }, }, }, - expected: expectedTFJob(commonv1.CleanPodPolicyRunning, TFJobDefaultRestartPolicy, TFJobDefaultPortName, TFJobDefaultPort), + expected: expectedTFJob(commonv1.CleanPodPolicyNone, TFJobDefaultRestartPolicy, TFJobDefaultPortName, TFJobDefaultPort), }, "set replicas with default port": { original: &TFJob{ @@ -201,7 +201,7 @@ func TestSetDefaultTFJob(t *testing.T) { }, }, }, - expected: expectedTFJob(commonv1.CleanPodPolicyRunning, customRestartPolicy, "", 0), + expected: expectedTFJob(commonv1.CleanPodPolicyNone, customRestartPolicy, "", 0), }, "set replicas adding default port": { original: &TFJob{ @@ -230,7 +230,7 @@ func TestSetDefaultTFJob(t *testing.T) { }, }, }, - expected: expectedTFJob(commonv1.CleanPodPolicyRunning, customRestartPolicy, customPortName, customPort), + expected: expectedTFJob(commonv1.CleanPodPolicyNone, customRestartPolicy, customPortName, customPort), }, "set custom cleanpod policy": { original: &TFJob{ @@ -273,8 +273,3 @@ func TestSetDefaultTFJob(t *testing.T) { } } } - -func cleanPodPolicyPointer(cleanPodPolicy commonv1.CleanPodPolicy) *commonv1.CleanPodPolicy { - c := cleanPodPolicy - return &c -} diff --git a/pkg/apis/kubeflow.org/v1/tensorflow_validation.go b/pkg/apis/kubeflow.org/v1/tensorflow_validation.go index 32a9c25463..9e06b31e44 100644 --- a/pkg/apis/kubeflow.org/v1/tensorflow_validation.go +++ b/pkg/apis/kubeflow.org/v1/tensorflow_validation.go @@ -17,11 +17,21 @@ package v1 import ( "fmt" - log "github.com/sirupsen/logrus" - commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + log "github.com/sirupsen/logrus" + apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation" ) +func ValidateV1TFJob(tfjob *TFJob) error { + if errors := apimachineryvalidation.NameIsDNS1035Label(tfjob.ObjectMeta.Name, false); errors != nil { + return fmt.Errorf("TFJob name is invalid: %v", errors) + } + if err := validateV1TFReplicaSpecs(tfjob.Spec.TFReplicaSpecs); err != nil { + return err + } + return nil +} + // IsChieforMaster returns true if the type is Master or Chief. func IsChieforMaster(typ commonv1.ReplicaType) bool { return typ == TFJobReplicaTypeChief || typ == TFJobReplicaTypeMaster @@ -37,12 +47,7 @@ func IsEvaluator(typ commonv1.ReplicaType) bool { return typ == TFJobReplicaTypeEval } -// ValidateV1TFJobSpec checks that the v1.TFJobSpec is valid. -func ValidateV1TFJobSpec(c *TFJobSpec) error { - return validateV1ReplicaSpecs(c.TFReplicaSpecs) -} - -func validateV1ReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { +func validateV1TFReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { if specs == nil { return fmt.Errorf("TFJobSpec is not valid") } diff --git a/pkg/apis/kubeflow.org/v1/tensorflow_validation_test.go b/pkg/apis/kubeflow.org/v1/tensorflow_validation_test.go index 4b070881e3..216256a343 100644 --- a/pkg/apis/kubeflow.org/v1/tensorflow_validation_test.go +++ b/pkg/apis/kubeflow.org/v1/tensorflow_validation_test.go @@ -19,79 +19,154 @@ import ( commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" ) -func TestValidateV1TFJobSpec(t *testing.T) { - testCases := []TFJobSpec{ - { - TFReplicaSpecs: nil, - }, - { - TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - TFJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{}, +func TestValidateV1TFJob(t *testing.T) { + validTFReplicaSpecs := map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + TFJobReplicaTypeWorker: { + Replicas: pointer.Int32(2), + RestartPolicy: commonv1.RestartPolicyOnFailure, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "tensorflow", + Image: "kubeflow/tf-mnist-with-summaries:latest", + Command: []string{ + "python", + "/var/tf_mnist/mnist_with_summaries.py", }, - }, + }}, }, }, }, - { - TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - TFJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - corev1.Container{ - Image: "", + } + + testCases := map[string]struct { + tfJob *TFJob + wantErr bool + }{ + "valid tfJob": { + tfJob: &TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: TFJobSpec{ + TFReplicaSpecs: validTFReplicaSpecs, + }, + }, + wantErr: false, + }, + "TFJob name does not meet DNS1035": { + tfJob: &TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "00test", + }, + Spec: TFJobSpec{ + TFReplicaSpecs: validTFReplicaSpecs, + }, + }, + wantErr: true, + }, + "no containers": { + tfJob: &TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: TFJobSpec{ + TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + TFJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{}, }, }, }, }, }, }, + wantErr: true, }, - { - TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - TFJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - corev1.Container{ - Name: "", - Image: "kubeflow/tf-dist-mnist-test:1.0", + "empty image": { + tfJob: &TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: TFJobSpec{ + TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + TFJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "tensorflow", + Image: "", + }}, }, }, }, }, }, }, + wantErr: true, }, - { - TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - TFJobReplicaTypeChief: &commonv1.ReplicaSpec{ - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{}, + "tfJob default container name doesn't present": { + tfJob: &TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: TFJobSpec{ + TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + TFJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "", + Image: "kubeflow/tf-dist-mnist-test:1.0", + }}, + }, + }, }, }, }, - TFJobReplicaTypeMaster: &commonv1.ReplicaSpec{ - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{}, + }, + wantErr: true, + }, + "there are more than 2 masterReplica's or ChiefReplica's": { + tfJob: &TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: TFJobSpec{ + TFReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + TFJobReplicaTypeChief: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{}, + }, + }, + }, + TFJobReplicaTypeMaster: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{}, + }, + }, }, }, }, }, + wantErr: true, }, } - for _, c := range testCases { - err := ValidateV1TFJobSpec(&c) - if err == nil { - t.Error("Expected error got nil") - } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + got := ValidateV1TFJob(tc.tfJob) + if (got != nil) != tc.wantErr { + t.Fatalf("ValidateV1TFJob() error = %v, wantErr %v", got, tc.wantErr) + } + }) } } diff --git a/pkg/apis/kubeflow.org/v1/xgboost_defaults.go b/pkg/apis/kubeflow.org/v1/xgboost_defaults.go index 227bb90fb7..79cddf65c3 100644 --- a/pkg/apis/kubeflow.org/v1/xgboost_defaults.go +++ b/pkg/apis/kubeflow.org/v1/xgboost_defaults.go @@ -46,9 +46,9 @@ func setXGBoostJobTypeNamesToCamelCase(xgboostJob *XGBoostJob) { // SetDefaults_XGBoostJob sets any unspecified values to defaults. func SetDefaults_XGBoostJob(xgboostJob *XGBoostJob) { - // Set default cleanpod policy to All. + // Set default cleanpod policy to None. if xgboostJob.Spec.RunPolicy.CleanPodPolicy == nil { - all := commonv1.CleanPodPolicyAll + all := commonv1.CleanPodPolicyNone xgboostJob.Spec.RunPolicy.CleanPodPolicy = &all } diff --git a/pkg/apis/kubeflow.org/v1/xgboost_defaults_test.go b/pkg/apis/kubeflow.org/v1/xgboost_defaults_test.go index b71093b853..329ad99eda 100644 --- a/pkg/apis/kubeflow.org/v1/xgboost_defaults_test.go +++ b/pkg/apis/kubeflow.org/v1/xgboost_defaults_test.go @@ -96,7 +96,7 @@ func TestSetDefaults_XGBoostJob(t *testing.T) { }, }, }, - expected: expectedXGBoostJob(commonv1.CleanPodPolicyAll, XGBoostJobDefaultRestartPolicy, 1, XGBoostJobDefaultPortName, XGBoostJobDefaultPort), + expected: expectedXGBoostJob(commonv1.CleanPodPolicyNone, XGBoostJobDefaultRestartPolicy, 1, XGBoostJobDefaultPortName, XGBoostJobDefaultPort), }, "Set spec with restart policy": { original: &XGBoostJob{ @@ -118,7 +118,7 @@ func TestSetDefaults_XGBoostJob(t *testing.T) { }, }, }, - expected: expectedXGBoostJob(commonv1.CleanPodPolicyAll, commonv1.RestartPolicyOnFailure, 1, XGBoostJobDefaultPortName, XGBoostJobDefaultPort), + expected: expectedXGBoostJob(commonv1.CleanPodPolicyNone, commonv1.RestartPolicyOnFailure, 1, XGBoostJobDefaultPortName, XGBoostJobDefaultPort), }, "Set spec with replicas": { original: &XGBoostJob{ @@ -140,7 +140,7 @@ func TestSetDefaults_XGBoostJob(t *testing.T) { }, }, }, - expected: expectedXGBoostJob(commonv1.CleanPodPolicyAll, XGBoostJobDefaultRestartPolicy, 3, XGBoostJobDefaultPortName, XGBoostJobDefaultPort), + expected: expectedXGBoostJob(commonv1.CleanPodPolicyNone, XGBoostJobDefaultRestartPolicy, 3, XGBoostJobDefaultPortName, XGBoostJobDefaultPort), }, "Set spec with default node port name and port": { @@ -168,7 +168,7 @@ func TestSetDefaults_XGBoostJob(t *testing.T) { }, }, }, - expected: expectedXGBoostJob(commonv1.CleanPodPolicyAll, XGBoostJobDefaultRestartPolicy, 1, XGBoostJobDefaultPortName, XGBoostJobDefaultPort), + expected: expectedXGBoostJob(commonv1.CleanPodPolicyNone, XGBoostJobDefaultRestartPolicy, 1, XGBoostJobDefaultPortName, XGBoostJobDefaultPort), }, "Set spec with node port": { @@ -196,7 +196,31 @@ func TestSetDefaults_XGBoostJob(t *testing.T) { }, }, }, - expected: expectedXGBoostJob(commonv1.CleanPodPolicyAll, XGBoostJobDefaultRestartPolicy, 1, XGBoostJobDefaultPortName, 9999), + expected: expectedXGBoostJob(commonv1.CleanPodPolicyNone, XGBoostJobDefaultRestartPolicy, 1, XGBoostJobDefaultPortName, 9999), + }, + "set spec with cleanpod policy": { + original: &XGBoostJob{ + Spec: XGBoostJobSpec{ + RunPolicy: commonv1.RunPolicy{ + CleanPodPolicy: cleanPodPolicyPointer(commonv1.CleanPodPolicyAll), + }, + XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + XGBoostJobReplicaTypeWorker: &commonv1.ReplicaSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + corev1.Container{ + Name: XGBoostJobDefaultContainerName, + Image: testImage, + }, + }, + }, + }, + }, + }, + }, + }, + expected: expectedXGBoostJob(commonv1.CleanPodPolicyAll, XGBoostJobDefaultRestartPolicy, 1, XGBoostJobDefaultPortName, XGBoostJobDefaultPort), }, } diff --git a/pkg/apis/kubeflow.org/v1/xgboost_validation.go b/pkg/apis/kubeflow.org/v1/xgboost_validation.go index 0091c25582..c7b6186d20 100644 --- a/pkg/apis/kubeflow.org/v1/xgboost_validation.go +++ b/pkg/apis/kubeflow.org/v1/xgboost_validation.go @@ -18,14 +18,25 @@ import ( "fmt" commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" + apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation" ) -func ValidateXGBoostJobSpec(c *XGBoostJobSpec) error { - if c.XGBReplicaSpecs == nil { +func ValidateV1XGBoostJob(xgboostJob *XGBoostJob) error { + if errors := apimachineryvalidation.NameIsDNS1035Label(xgboostJob.ObjectMeta.Name, false); errors != nil { + return fmt.Errorf("XGBoostJob name is invalid: %v", errors) + } + if err := validateXGBoostReplicaSpecs(xgboostJob.Spec.XGBReplicaSpecs); err != nil { + return err + } + return nil +} + +func validateXGBoostReplicaSpecs(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { + if specs == nil { return fmt.Errorf("XGBoostJobSpec is not valid") } masterExists := false - for rType, value := range c.XGBReplicaSpecs { + for rType, value := range specs { if value == nil || len(value.Template.Spec.Containers) == 0 { return fmt.Errorf("XGBoostJobSpec is not valid: containers definition expected in %v", rType) } diff --git a/pkg/apis/kubeflow.org/v1/xgboost_validation_test.go b/pkg/apis/kubeflow.org/v1/xgboost_validation_test.go index 0cb3a82469..b5564b9553 100644 --- a/pkg/apis/kubeflow.org/v1/xgboost_validation_test.go +++ b/pkg/apis/kubeflow.org/v1/xgboost_validation_test.go @@ -15,99 +15,209 @@ package v1 import ( - "k8s.io/utils/pointer" "testing" commonv1 "github.com/kubeflow/common/pkg/apis/common/v1" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/pointer" ) -func TestValidateXGBoostJobSpec(t *testing.T) { - testCases := []XGBoostJobSpec{ - { - XGBReplicaSpecs: nil, +func TestValidateV1XGBoostJob(t *testing.T) { + validXGBoostReplicaSpecs := map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + XGBoostJobReplicaTypeMaster: { + Replicas: pointer.Int32(1), + RestartPolicy: commonv1.RestartPolicyNever, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "xgboost", + Image: "docker.io/merlintang/xgboost-dist-iris:1.1", + Ports: []corev1.ContainerPort{{ + Name: "xgboostjob-port", + ContainerPort: 9991, + }}, + ImagePullPolicy: corev1.PullAlways, + Args: []string{ + "--job_type=Train", + "--xgboost_parameter=objective:multi:softprob,num_class:3", + "--n_estimators=10", + "--learning_rate=0.1", + "--model_path=/tmp/xgboost-model", + "--model_storage_type=local", + }, + }}, + }, + }, }, - { - XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - XGBoostJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{}, + XGBoostJobReplicaTypeWorker: { + Replicas: pointer.Int32(2), + RestartPolicy: commonv1.RestartPolicyExitCode, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "xgboost", + Image: "docker.io/merlintang/xgboost-dist-iris:1.", + Ports: []corev1.ContainerPort{{ + Name: "xgboostjob-port", + ContainerPort: 9991, + }}, + ImagePullPolicy: corev1.PullAlways, + Args: []string{ + "--job_type=Train", + "--xgboost_parameter=objective:multi:softprob,num_class:3", + "--n_estimators=10", + "--learning_rate=0.1", + }, + }}, + }, + }, + }, + } + + testCases := map[string]struct { + xgboostJob *XGBoostJob + wantErr bool + }{ + "valid XGBoostJob": { + xgboostJob: &XGBoostJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: XGBoostJobSpec{ + XGBReplicaSpecs: validXGBoostReplicaSpecs, + }, + }, + wantErr: false, + }, + "XGBoostJob name does not meet DNS1035": { + xgboostJob: &XGBoostJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "-test", + }, + Spec: XGBoostJobSpec{ + XGBReplicaSpecs: validXGBoostReplicaSpecs, + }, + }, + wantErr: true, + }, + "empty containers": { + xgboostJob: &XGBoostJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: XGBoostJobSpec{ + XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + XGBoostJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{}, + }, + }, }, }, }, }, + wantErr: true, }, - { - XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - XGBoostJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - corev1.Container{ - Image: "", + "image is empty": { + xgboostJob: &XGBoostJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: XGBoostJobSpec{ + XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + XGBoostJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "xgboost", + Image: "", + }}, }, }, }, }, }, }, + wantErr: true, }, - { - XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - XGBoostJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - corev1.Container{ - Name: "", - Image: "gcr.io/kubeflow-ci/xgboost-dist-mnist_test:1.0", + "xgboostJob default container name doesn't present": { + xgboostJob: &XGBoostJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: XGBoostJobSpec{ + XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + XGBoostJobReplicaTypeWorker: { + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "", + Image: "gcr.io/kubeflow-ci/xgboost-dist-mnist_test:1.0", + }}, }, }, }, }, }, }, + wantErr: true, }, - { - XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - XGBoostJobReplicaTypeMaster: &commonv1.ReplicaSpec{ - Replicas: pointer.Int32(2), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - corev1.Container{ - Name: "xgboost", - Image: "gcr.io/kubeflow-ci/xgboost-dist-mnist_test:1.0", + "the number of replicas in masterReplica is other than 1": { + xgboostJob: &XGBoostJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: XGBoostJobSpec{ + XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + XGBoostJobReplicaTypeMaster: { + Replicas: pointer.Int32(2), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "xgboost", + Image: "gcr.io/kubeflow-ci/xgboost-dist-mnist_test:1.0", + }}, }, }, }, }, }, }, + wantErr: true, }, - { - XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ - XGBoostJobReplicaTypeWorker: &commonv1.ReplicaSpec{ - Replicas: pointer.Int32(1), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - corev1.Container{ - Name: "xgboost", - Image: "gcr.io/kubeflow-ci/xgboost-dist-mnist_test:1.0", + "masterReplica does not exist": { + xgboostJob: &XGBoostJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: XGBoostJobSpec{ + XGBReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + XGBoostJobReplicaTypeWorker: { + Replicas: pointer.Int32(1), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "xgboost", + Image: "gcr.io/kubeflow-ci/xgboost-dist-mnist_test:1.0", + }}, }, }, }, }, }, }, + wantErr: true, }, } - for _, c := range testCases { - err := ValidateXGBoostJobSpec(&c) - if err == nil { - t.Error("Failed validate the corev1.XGBoostJobSpec") - } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + got := ValidateV1XGBoostJob(tc.xgboostJob) + if (got != nil) != tc.wantErr { + t.Fatalf("ValidateV1XGBoostJob() error = %v, wantErr %v", got, tc.wantErr) + } + }) } } diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index fde68c6ac8..2af6b72565 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -561,7 +561,7 @@ func (jc *MPIJobReconciler) GetPodsForJob(jobObject interface{}) ([]*corev1.Pod, func (jc *MPIJobReconciler) DeleteJob(job interface{}) error { mpiJob, ok := job.(*kubeflowv1.MPIJob) if !ok { - return fmt.Errorf("%v is not a type of TFJob", mpiJob) + return fmt.Errorf("%v is not a type of MPIJob", mpiJob) } log := commonutil.LoggerForJob(mpiJob) diff --git a/pkg/controller.v1/mxnet/mxjob_controller.go b/pkg/controller.v1/mxnet/mxjob_controller.go index b4ca986048..470e4cbf7a 100644 --- a/pkg/controller.v1/mxnet/mxjob_controller.go +++ b/pkg/controller.v1/mxnet/mxjob_controller.go @@ -128,7 +128,7 @@ func (r *MXJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl return ctrl.Result{}, client.IgnoreNotFound(err) } - if err = kubeflowv1.ValidateV1MXJobSpec(&mxjob.Spec); err != nil { + if err = kubeflowv1.ValidateV1MXJob(mxjob); err != nil { logger.Error(err, "MXJob failed validation") r.Recorder.Eventf(mxjob, corev1.EventTypeWarning, commonutil.JobFailedValidationReason, "MXJob failed validation because %s", err) return ctrl.Result{}, err diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go index b0d051e74e..bc8ac7feaf 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go @@ -129,7 +129,7 @@ func (r *PaddleJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, client.IgnoreNotFound(err) } - if err = kubeflowv1.ValidateV1PaddleJobSpec(&paddlejob.Spec); err != nil { + if err = kubeflowv1.ValidateV1PaddleJob(paddlejob); err != nil { logger.Error(err, "PaddleJob failed validation") r.Recorder.Eventf(paddlejob, corev1.EventTypeWarning, commonutil.JobFailedValidationReason, "PaddleJob failed validation because %s", err) return ctrl.Result{}, err diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index dcd929381b..550a008429 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -129,7 +129,7 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, client.IgnoreNotFound(err) } - if err = kubeflowv1.ValidateV1PyTorchJobSpec(&pytorchjob.Spec); err != nil { + if err = kubeflowv1.ValidateV1PyTorchJob(pytorchjob); err != nil { logger.Error(err, "PyTorchJob failed validation") r.Recorder.Eventf(pytorchjob, corev1.EventTypeWarning, commonutil.JobFailedValidationReason, "PyTorchJob failed validation because %s", err) return ctrl.Result{}, err @@ -434,7 +434,10 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, } else { if rtype == kubeflowv1.PyTorchJobReplicaTypeWorker { // TODO(gaocegege): Support SuccessPolicy - if expected == 0 { + // Leave a succeeded condition for the following two cases: + // 1. If all workers are succeeded. + // 2. If `ElasticPolicy` is not nil and any worker has completed. + if expected == 0 || (pytorchjob.Spec.ElasticPolicy != nil && succeeded > 0) { msg := fmt.Sprintf("PyTorchJob %s/%s successfully completed.", pytorchjob.Namespace, pytorchjob.Name) r.recorder.Event(pytorchjob, corev1.EventTypeNormal, commonutil.JobSucceededReason, msg) diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go b/pkg/controller.v1/tensorflow/tfjob_controller.go index b3b9c38b69..da06a0ce7d 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller.go @@ -147,7 +147,7 @@ func (r *TFJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl return ctrl.Result{}, client.IgnoreNotFound(err) } - if err = kubeflowv1.ValidateV1TFJobSpec(&tfjob.Spec); err != nil { + if err = kubeflowv1.ValidateV1TFJob(tfjob); err != nil { logger.Error(err, "TFJob failed validation") r.Recorder.Eventf(tfjob, corev1.EventTypeWarning, commonutil.JobFailedValidationReason, "TFJob failed validation because %s", err) return ctrl.Result{}, err diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller.go b/pkg/controller.v1/xgboost/xgboostjob_controller.go index 659527ecc5..b78c91e9b1 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller.go @@ -139,7 +139,7 @@ func (r *XGBoostJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, client.IgnoreNotFound(err) } - if err = kubeflowv1.ValidateXGBoostJobSpec(&xgboostjob.Spec); err != nil { + if err = kubeflowv1.ValidateV1XGBoostJob(xgboostjob); err != nil { logger.Error(err, "XGBoostJob failed validation") r.Recorder.Eventf(xgboostjob, corev1.EventTypeWarning, commonutil.JobFailedValidationReason, "XGBoostJob failed validation because %s", err) return ctrl.Result{}, err