From 6871b2f657c4ec50f682b12b60029d9d046b2ce8 Mon Sep 17 00:00:00 2001 From: Andreas Neumann Date: Tue, 3 Mar 2020 17:03:06 +0100 Subject: [PATCH] Error out a task if a job reaches terminal state (#1370) * Return a fatal engine error if a job never gets healthy but reaches a failed terminal state Signed-off-by: Andreas Neumann --- pkg/engine/health/health.go | 24 ++++++++++++++++++ pkg/engine/task/task.go | 1 + pkg/engine/task/task_apply.go | 14 ++++++++++- pkg/engine/task/task_pipe.go | 3 +++ test/e2e/terminal-failed-job/00-assert.yaml | 11 ++++++++ test/e2e/terminal-failed-job/00-install.yaml | 4 +++ test/e2e/terminal-failed-job/01-assert.yaml | 11 ++++++++ .../01-install-timeout.yaml | 5 ++++ .../failjob-operator/operator.yaml | 25 +++++++++++++++++++ .../failjob-operator/params.yaml | 2 ++ .../failjob-operator/templates/failjob.yaml | 14 +++++++++++ .../job-timeout-operator/operator.yaml | 25 +++++++++++++++++++ .../job-timeout-operator/params.yaml | 2 ++ .../templates/timeoutjob.yaml | 15 +++++++++++ 14 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 test/e2e/terminal-failed-job/00-assert.yaml create mode 100644 test/e2e/terminal-failed-job/00-install.yaml create mode 100644 test/e2e/terminal-failed-job/01-assert.yaml create mode 100644 test/e2e/terminal-failed-job/01-install-timeout.yaml create mode 100644 test/e2e/terminal-failed-job/failjob-operator/operator.yaml create mode 100644 test/e2e/terminal-failed-job/failjob-operator/params.yaml create mode 100644 test/e2e/terminal-failed-job/failjob-operator/templates/failjob.yaml create mode 100644 test/e2e/terminal-failed-job/job-timeout-operator/operator.yaml create mode 100644 test/e2e/terminal-failed-job/job-timeout-operator/params.yaml create mode 100644 test/e2e/terminal-failed-job/job-timeout-operator/templates/timeoutjob.yaml diff --git a/pkg/engine/health/health.go b/pkg/engine/health/health.go index 86a1dfd2f..f7b8e6169 100644 --- a/pkg/engine/health/health.go +++ b/pkg/engine/health/health.go @@ -16,6 +16,30 @@ import ( kudov1beta1 "github.com/kudobuilder/kudo/pkg/apis/kudo/v1beta1" ) +// IsTerminallyFailed returns whether an object is in a terminal failed state and has no chance to reach healthy +func IsTerminallyFailed(obj runtime.Object) (bool, string) { + if obj == nil { + return false, "" + } + + switch obj := obj.(type) { + case *batchv1.Job: + return isJobTerminallyFailed(obj) + default: + return false, "" + } +} + +func isJobTerminallyFailed(job *batchv1.Job) (bool, string) { + for _, c := range job.Status.Conditions { + if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue { + log.Printf("HealthUtil: Job \"%v\" has failed: %s", job.Name, c.Message) + return true, c.Message + } + } + return false, "" +} + // IsHealthy returns whether an object is healthy. Must be implemented for each type. func IsHealthy(obj runtime.Object) error { if obj == nil { diff --git a/pkg/engine/task/task.go b/pkg/engine/task/task.go index f49a2d425..e94391a27 100644 --- a/pkg/engine/task/task.go +++ b/pkg/engine/task/task.go @@ -53,6 +53,7 @@ var ( dummyTaskError = "DummyTaskError" resourceUnmarshalError = "ResourceUnmarshalError" resourceValidationError = "ResourceValidationError" + failedTerminalState = "FailedTerminalStateError" ) // Build factory method takes an v1beta1.Task and returns a corresponding Tasker object diff --git a/pkg/engine/task/task_apply.go b/pkg/engine/task/task_apply.go index 27eb32bb2..d709e9960 100644 --- a/pkg/engine/task/task_apply.go +++ b/pkg/engine/task/task_apply.go @@ -58,7 +58,9 @@ func (at ApplyTask) Run(ctx Context) (bool, error) { // 4. - Check health for all resources - err = isHealthy(applied) if err != nil { - // so far we do not distinguish between unhealthy resources and other errors that might occur during a health check + if fatal := isTerminallyFailed(applied); fatal != nil { + return false, fatalExecutionError(fatal, failedTerminalState, ctx.Meta) + } // an error during a health check is not treated task execution error log.Printf("TaskExecution: %v", err) return false, nil @@ -237,6 +239,16 @@ func isHealthy(ro []runtime.Object) error { return nil } +func isTerminallyFailed(ro []runtime.Object) error { + for _, r := range ro { + if failed, msg := health.IsTerminallyFailed(r); failed { + key, _ := client.ObjectKeyFromObject(r) + return fmt.Errorf("object %s/%s has failed: %s", key.Namespace, key.Name, msg) + } + } + return nil +} + // copy from k8s.io/kubectl@v0.16.6/pkg/util/apply.go, with adjustments // GetOriginalConfiguration retrieves the original configuration of the object // from the annotation. diff --git a/pkg/engine/task/task_pipe.go b/pkg/engine/task/task_pipe.go index 310369be9..c86e211f7 100644 --- a/pkg/engine/task/task_pipe.go +++ b/pkg/engine/task/task_pipe.go @@ -97,6 +97,9 @@ func (pt PipeTask) Run(ctx Context) (bool, error) { // once the pod is Ready, it means that its initContainer finished successfully and we can copy // out the generated files. An error during a health check is not treated as task execution error if err != nil { + if fatal := isTerminallyFailed(podObj); fatal != nil { + return false, fatalExecutionError(fatal, failedTerminalState, ctx.Meta) + } return false, nil } diff --git a/test/e2e/terminal-failed-job/00-assert.yaml b/test/e2e/terminal-failed-job/00-assert.yaml new file mode 100644 index 000000000..1186b631e --- /dev/null +++ b/test/e2e/terminal-failed-job/00-assert.yaml @@ -0,0 +1,11 @@ +apiVersion: kudo.dev/v1beta1 +kind: Instance +metadata: + labels: + kudo.dev/operator: failjob-operator +spec: + operatorVersion: + name: failjob-operator-0.1.0 +status: + aggregatedStatus: + status: FATAL_ERROR \ No newline at end of file diff --git a/test/e2e/terminal-failed-job/00-install.yaml b/test/e2e/terminal-failed-job/00-install.yaml new file mode 100644 index 000000000..7b15d080d --- /dev/null +++ b/test/e2e/terminal-failed-job/00-install.yaml @@ -0,0 +1,4 @@ +apiVersion: kudo.dev/v1beta1 +kind: TestStep +kubectl: +- kudo install --instance failjob-operator-instance ./failjob-operator \ No newline at end of file diff --git a/test/e2e/terminal-failed-job/01-assert.yaml b/test/e2e/terminal-failed-job/01-assert.yaml new file mode 100644 index 000000000..ce3f42dbe --- /dev/null +++ b/test/e2e/terminal-failed-job/01-assert.yaml @@ -0,0 +1,11 @@ +apiVersion: kudo.dev/v1beta1 +kind: Instance +metadata: + labels: + kudo.dev/operator: job-timeout-operator +spec: + operatorVersion: + name: job-timeout-operator-0.1.0 +status: + aggregatedStatus: + status: FATAL_ERROR diff --git a/test/e2e/terminal-failed-job/01-install-timeout.yaml b/test/e2e/terminal-failed-job/01-install-timeout.yaml new file mode 100644 index 000000000..ee09f5dd7 --- /dev/null +++ b/test/e2e/terminal-failed-job/01-install-timeout.yaml @@ -0,0 +1,5 @@ +apiVersion: kudo.dev/v1beta1 +kind: TestStep +kubectl: +- kudo uninstall --instance failjob-operator-instance +- kudo install --instance job-timout-operator-instance ./job-timeout-operator \ No newline at end of file diff --git a/test/e2e/terminal-failed-job/failjob-operator/operator.yaml b/test/e2e/terminal-failed-job/failjob-operator/operator.yaml new file mode 100644 index 000000000..92b498be7 --- /dev/null +++ b/test/e2e/terminal-failed-job/failjob-operator/operator.yaml @@ -0,0 +1,25 @@ +apiVersion: kudo.dev/v1beta1 +name: "failjob-operator" +operatorVersion: "0.1.0" +appVersion: "1.7.9" +kubernetesVersion: 1.13.0 +maintainers: + - name: Your name + email: +url: https://kudo.dev +tasks: + - name: app + kind: Apply + spec: + resources: + - failjob.yaml +plans: + deploy: + strategy: serial + phases: + - name: main + strategy: parallel + steps: + - name: everything + tasks: + - app diff --git a/test/e2e/terminal-failed-job/failjob-operator/params.yaml b/test/e2e/terminal-failed-job/failjob-operator/params.yaml new file mode 100644 index 000000000..4200b5c94 --- /dev/null +++ b/test/e2e/terminal-failed-job/failjob-operator/params.yaml @@ -0,0 +1,2 @@ +apiVersion: kudo.dev/v1beta1 +parameters: \ No newline at end of file diff --git a/test/e2e/terminal-failed-job/failjob-operator/templates/failjob.yaml b/test/e2e/terminal-failed-job/failjob-operator/templates/failjob.yaml new file mode 100644 index 000000000..ae1a473f0 --- /dev/null +++ b/test/e2e/terminal-failed-job/failjob-operator/templates/failjob.yaml @@ -0,0 +1,14 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: failing-job + namespace: {{ .Namespace }} +spec: + template: + spec: + containers: + - name: main + image: alpine + command: ["false"] + restartPolicy: Never + backoffLimit: 0 diff --git a/test/e2e/terminal-failed-job/job-timeout-operator/operator.yaml b/test/e2e/terminal-failed-job/job-timeout-operator/operator.yaml new file mode 100644 index 000000000..005fc3e2a --- /dev/null +++ b/test/e2e/terminal-failed-job/job-timeout-operator/operator.yaml @@ -0,0 +1,25 @@ +apiVersion: kudo.dev/v1beta1 +name: "job-timeout-operator" +operatorVersion: "0.1.0" +appVersion: "1.7.9" +kubernetesVersion: 1.13.0 +maintainers: + - name: Your name + email: +url: https://kudo.dev +tasks: + - name: app + kind: Apply + spec: + resources: + - timeoutjob.yaml +plans: + deploy: + strategy: serial + phases: + - name: main + strategy: parallel + steps: + - name: everything + tasks: + - app diff --git a/test/e2e/terminal-failed-job/job-timeout-operator/params.yaml b/test/e2e/terminal-failed-job/job-timeout-operator/params.yaml new file mode 100644 index 000000000..4200b5c94 --- /dev/null +++ b/test/e2e/terminal-failed-job/job-timeout-operator/params.yaml @@ -0,0 +1,2 @@ +apiVersion: kudo.dev/v1beta1 +parameters: \ No newline at end of file diff --git a/test/e2e/terminal-failed-job/job-timeout-operator/templates/timeoutjob.yaml b/test/e2e/terminal-failed-job/job-timeout-operator/templates/timeoutjob.yaml new file mode 100644 index 000000000..bf615ddfc --- /dev/null +++ b/test/e2e/terminal-failed-job/job-timeout-operator/templates/timeoutjob.yaml @@ -0,0 +1,15 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: timeout-job + namespace: {{ .Namespace }} +spec: + template: + spec: + containers: + - name: main + image: alpine + command: ["sleep", "1000"] + restartPolicy: Never + backoffLimit: 3 + activeDeadlineSeconds: 60