Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fatal error on terminal job #1370

Merged
merged 3 commits into from
Mar 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions pkg/engine/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,30 @@ import (
kudov1beta1 "github.com/kudobuilder/kudo/pkg/apis/kudo/v1beta1"
)

// IsTerminallyFailed returns whether an object is in a terminal failed state and has no chance to reach healthy
func IsTerminallyFailed(obj runtime.Object) (bool, string) {
if obj == nil {
return false, ""
}

switch obj := obj.(type) {
case *batchv1.Job:
return isJobTerminallyFailed(obj)
default:
return false, ""
}
}

func isJobTerminallyFailed(job *batchv1.Job) (bool, string) {
for _, c := range job.Status.Conditions {
if c.Type == batchv1.JobFailed && c.Status == corev1.ConditionTrue {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it make sense to check for c.Reason == "DeadlineExceeded" as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had that in first, with special cases for DeadlineExceeded and BackoffLimitExceeded, but I don't think it makes sense to distinguish here: If the condition is Failed, the job won't complete, so we abort. The reason is included in the error message and should show up in the FatalError of the task.

log.Printf("HealthUtil: Job \"%v\" has failed: %s", job.Name, c.Message)
return true, c.Message
}
}
return false, ""
}

// IsHealthy returns whether an object is healthy. Must be implemented for each type.
func IsHealthy(obj runtime.Object) error {
if obj == nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/engine/task/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ var (
dummyTaskError = "DummyTaskError"
resourceUnmarshalError = "ResourceUnmarshalError"
resourceValidationError = "ResourceValidationError"
failedTerminalState = "FailedTerminalStateError"
)

// Build factory method takes an v1beta1.Task and returns a corresponding Tasker object
Expand Down
14 changes: 13 additions & 1 deletion pkg/engine/task/task_apply.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ func (at ApplyTask) Run(ctx Context) (bool, error) {
// 4. - Check health for all resources -
err = isHealthy(applied)
if err != nil {
// so far we do not distinguish between unhealthy resources and other errors that might occur during a health check
if fatal := isTerminallyFailed(applied); fatal != nil {
return false, fatalExecutionError(fatal, failedTerminalState, ctx.Meta)
}
// an error during a health check is not treated task execution error
log.Printf("TaskExecution: %v", err)
return false, nil
Expand Down Expand Up @@ -237,6 +239,16 @@ func isHealthy(ro []runtime.Object) error {
return nil
}

func isTerminallyFailed(ro []runtime.Object) error {
for _, r := range ro {
if failed, msg := health.IsTerminallyFailed(r); failed {
key, _ := client.ObjectKeyFromObject(r)
return fmt.Errorf("object %s/%s has failed: %s", key.Namespace, key.Name, msg)
}
}
return nil
}

// copy from k8s.io/kubectl@v0.16.6/pkg/util/apply.go, with adjustments
// GetOriginalConfiguration retrieves the original configuration of the object
// from the annotation.
Expand Down
3 changes: 3 additions & 0 deletions pkg/engine/task/task_pipe.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ func (pt PipeTask) Run(ctx Context) (bool, error) {
// once the pod is Ready, it means that its initContainer finished successfully and we can copy
// out the generated files. An error during a health check is not treated as task execution error
if err != nil {
if fatal := isTerminallyFailed(podObj); fatal != nil {
return false, fatalExecutionError(fatal, failedTerminalState, ctx.Meta)
}
return false, nil
}

Expand Down
11 changes: 11 additions & 0 deletions test/e2e/terminal-failed-job/00-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
apiVersion: kudo.dev/v1beta1
kind: Instance
metadata:
labels:
kudo.dev/operator: failjob-operator
spec:
operatorVersion:
name: failjob-operator-0.1.0
status:
aggregatedStatus:
status: FATAL_ERROR
4 changes: 4 additions & 0 deletions test/e2e/terminal-failed-job/00-install.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: kudo.dev/v1beta1
kind: TestStep
kubectl:
- kudo install --instance failjob-operator-instance ./failjob-operator
11 changes: 11 additions & 0 deletions test/e2e/terminal-failed-job/01-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
apiVersion: kudo.dev/v1beta1
kind: Instance
metadata:
labels:
kudo.dev/operator: job-timeout-operator
spec:
operatorVersion:
name: job-timeout-operator-0.1.0
status:
aggregatedStatus:
status: FATAL_ERROR
5 changes: 5 additions & 0 deletions test/e2e/terminal-failed-job/01-install-timeout.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
apiVersion: kudo.dev/v1beta1
kind: TestStep
kubectl:
- kudo uninstall --instance failjob-operator-instance
- kudo install --instance job-timout-operator-instance ./job-timeout-operator
25 changes: 25 additions & 0 deletions test/e2e/terminal-failed-job/failjob-operator/operator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
apiVersion: kudo.dev/v1beta1
name: "failjob-operator"
operatorVersion: "0.1.0"
appVersion: "1.7.9"
kubernetesVersion: 1.13.0
maintainers:
- name: Your name
email: <your@email.com>
url: https://kudo.dev
tasks:
- name: app
kind: Apply
spec:
resources:
- failjob.yaml
plans:
deploy:
strategy: serial
phases:
- name: main
strategy: parallel
steps:
- name: everything
tasks:
- app
2 changes: 2 additions & 0 deletions test/e2e/terminal-failed-job/failjob-operator/params.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
apiVersion: kudo.dev/v1beta1
parameters:
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
apiVersion: batch/v1
kind: Job
metadata:
name: failing-job
namespace: {{ .Namespace }}
spec:
template:
spec:
containers:
- name: main
image: alpine
command: ["false"]
restartPolicy: Never
backoffLimit: 0
25 changes: 25 additions & 0 deletions test/e2e/terminal-failed-job/job-timeout-operator/operator.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
apiVersion: kudo.dev/v1beta1
name: "job-timeout-operator"
operatorVersion: "0.1.0"
appVersion: "1.7.9"
kubernetesVersion: 1.13.0
maintainers:
- name: Your name
email: <your@email.com>
url: https://kudo.dev
tasks:
- name: app
kind: Apply
spec:
resources:
- timeoutjob.yaml
plans:
deploy:
strategy: serial
phases:
- name: main
strategy: parallel
steps:
- name: everything
tasks:
- app
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
apiVersion: kudo.dev/v1beta1
parameters:
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
apiVersion: batch/v1
kind: Job
metadata:
name: timeout-job
namespace: {{ .Namespace }}
spec:
template:
spec:
containers:
- name: main
image: alpine
command: ["sleep", "1000"]
restartPolicy: Never
backoffLimit: 3
activeDeadlineSeconds: 60