Skip to content

Commit

Permalink
Error out a task if a job reaches terminal state (#1370)
Browse files Browse the repository at this point in the history
* Return a fatal engine error if a job never gets healthy but reaches a failed terminal state

Signed-off-by: Andreas Neumann <aneumann@mesosphere.com>
  • Loading branch information
ANeumann82 committed Mar 3, 2020
1 parent 6e9e762 commit 6871b2f
Show file tree
Hide file tree
Showing 14 changed files with 155 additions and 1 deletion.
24 changes: 24 additions & 0 deletions pkg/engine/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,30 @@ import (
kudov1beta1 "github.com/kudobuilder/kudo/pkg/apis/kudo/v1beta1"
)

// IsTerminallyFailed returns whether an object is in a terminal failed state and has no chance to reach healthy
func IsTerminallyFailed(obj runtime.Object) (bool, string) {
if obj == nil {
return false, ""
}

switch obj := obj.(type) {
case *batchv1.Job:
return isJobTerminallyFailed(obj)
default:
return false, ""
}
}

func isJobTerminallyFailed(job *batchv1.Job) (bool, string) {
for _, c := range job.Status.Conditions {
if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue {
log.Printf("HealthUtil: Job \"%v\" has failed: %s", job.Name, c.Message)
return true, c.Message
}
}
return false, ""
}

// IsHealthy returns whether an object is healthy. Must be implemented for each type.
func IsHealthy(obj runtime.Object) error {
if obj == nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/engine/task/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ var (
dummyTaskError = "DummyTaskError"
resourceUnmarshalError = "ResourceUnmarshalError"
resourceValidationError = "ResourceValidationError"
failedTerminalState = "FailedTerminalStateError"
)

// Build factory method takes an v1beta1.Task and returns a corresponding Tasker object
Expand Down
14 changes: 13 additions & 1 deletion pkg/engine/task/task_apply.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ func (at ApplyTask) Run(ctx Context) (bool, error) {
// 4. - Check health for all resources -
err = isHealthy(applied)
if err != nil {
// so far we do not distinguish between unhealthy resources and other errors that might occur during a health check
if fatal := isTerminallyFailed(applied); fatal != nil {
return false, fatalExecutionError(fatal, failedTerminalState, ctx.Meta)
}
// an error during a health check is not treated task execution error
log.Printf("TaskExecution: %v", err)
return false, nil
Expand Down Expand Up @@ -237,6 +239,16 @@ func isHealthy(ro []runtime.Object) error {
return nil
}

func isTerminallyFailed(ro []runtime.Object) error {
for _, r := range ro {
if failed, msg := health.IsTerminallyFailed(r); failed {
key, _ := client.ObjectKeyFromObject(r)
return fmt.Errorf("object %s/%s has failed: %s", key.Namespace, key.Name, msg)
}
}
return nil
}

// copy from k8s.io/kubectl@v0.16.6/pkg/util/apply.go, with adjustments
// GetOriginalConfiguration retrieves the original configuration of the object
// from the annotation.
Expand Down
3 changes: 3 additions & 0 deletions pkg/engine/task/task_pipe.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ func (pt PipeTask) Run(ctx Context) (bool, error) {
// once the pod is Ready, it means that its initContainer finished successfully and we can copy
// out the generated files. An error during a health check is not treated as task execution error
if err != nil {
if fatal := isTerminallyFailed(podObj); fatal != nil {
return false, fatalExecutionError(fatal, failedTerminalState, ctx.Meta)
}
return false, nil
}

Expand Down
11 changes: 11 additions & 0 deletions test/e2e/terminal-failed-job/00-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
apiVersion: kudo.dev/v1beta1
kind: Instance
metadata:
labels:
kudo.dev/operator: failjob-operator
spec:
operatorVersion:
name: failjob-operator-0.1.0
status:
aggregatedStatus:
status: FATAL_ERROR
4 changes: 4 additions & 0 deletions test/e2e/terminal-failed-job/00-install.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: kudo.dev/v1beta1
kind: TestStep
kubectl:
- kudo install --instance failjob-operator-instance ./failjob-operator
11 changes: 11 additions & 0 deletions test/e2e/terminal-failed-job/01-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
apiVersion: kudo.dev/v1beta1
kind: Instance
metadata:
labels:
kudo.dev/operator: job-timeout-operator
spec:
operatorVersion:
name: job-timeout-operator-0.1.0
status:
aggregatedStatus:
status: FATAL_ERROR
5 changes: 5 additions & 0 deletions test/e2e/terminal-failed-job/01-install-timeout.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
apiVersion: kudo.dev/v1beta1
kind: TestStep
kubectl:
- kudo uninstall --instance failjob-operator-instance
- kudo install --instance job-timout-operator-instance ./job-timeout-operator
25 changes: 25 additions & 0 deletions test/e2e/terminal-failed-job/failjob-operator/operator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
apiVersion: kudo.dev/v1beta1
name: "failjob-operator"
operatorVersion: "0.1.0"
appVersion: "1.7.9"
kubernetesVersion: 1.13.0
maintainers:
- name: Your name
email: <your@email.com>
url: https://kudo.dev
tasks:
- name: app
kind: Apply
spec:
resources:
- failjob.yaml
plans:
deploy:
strategy: serial
phases:
- name: main
strategy: parallel
steps:
- name: everything
tasks:
- app
2 changes: 2 additions & 0 deletions test/e2e/terminal-failed-job/failjob-operator/params.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
apiVersion: kudo.dev/v1beta1
parameters:
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: batch/v1
kind: Job
metadata:
name: failing-job
namespace: {{ .Namespace }}
spec:
template:
spec:
containers:
- name: main
image: alpine
command: ["false"]
restartPolicy: Never
backoffLimit: 0
25 changes: 25 additions & 0 deletions test/e2e/terminal-failed-job/job-timeout-operator/operator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
apiVersion: kudo.dev/v1beta1
name: "job-timeout-operator"
operatorVersion: "0.1.0"
appVersion: "1.7.9"
kubernetesVersion: 1.13.0
maintainers:
- name: Your name
email: <your@email.com>
url: https://kudo.dev
tasks:
- name: app
kind: Apply
spec:
resources:
- timeoutjob.yaml
plans:
deploy:
strategy: serial
phases:
- name: main
strategy: parallel
steps:
- name: everything
tasks:
- app
2 changes: 2 additions & 0 deletions test/e2e/terminal-failed-job/job-timeout-operator/params.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
apiVersion: kudo.dev/v1beta1
parameters:
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
apiVersion: batch/v1
kind: Job
metadata:
name: timeout-job
namespace: {{ .Namespace }}
spec:
template:
spec:
containers:
- name: main
image: alpine
command: ["sleep", "1000"]
restartPolicy: Never
backoffLimit: 3
activeDeadlineSeconds: 60

0 comments on commit 6871b2f

Please sign in to comment.