Never clean backoff in job controller #63650

Merged 3 commits, Jun 6, 2018
29 changes: 24 additions & 5 deletions pkg/controller/job/job_controller.go
@@ -48,6 +48,8 @@ import (
 	"github.com/golang/glog"
 )
 
+const statusUpdateRetries = 3
+
 // controllerKind contains the schema.GroupVersionKind for this controller type.
 var controllerKind = batch.SchemeGroupVersion.WithKind("Job")

@@ -356,10 +358,10 @@ func (jm *JobController) enqueueController(obj interface{}, immediate bool) {
 		return
 	}
 
-	if immediate {
-		jm.queue.Forget(key)
+	backoff := time.Duration(0)
+	if !immediate {
+		backoff = getBackoff(jm.queue, key)
 	}
-	backoff := getBackoff(jm.queue, key)
 
 	// TODO: Handle overlapping controllers better. Either disallow them at admission time or
 	// deterministically avoid syncing controllers that fight over pods. Currently, we only
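For context on why dropping the Forget call matters: the queue's rate limiter keeps a per-key failure count that getBackoff (elsewhere in this file) turns into the sync delay, and Forget zeroes that count. The following is a minimal standalone sketch using client-go's workqueue rate limiter, not code from this PR; the key string is hypothetical.

package main

import (
	"fmt"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	// The job controller's queue is built on a rate limiter like this one.
	limiter := workqueue.DefaultControllerRateLimiter()
	key := "default/my-job" // hypothetical queue key

	// Every When() call records one failure and grows the delay exponentially.
	fmt.Println(limiter.When(key))        // 5ms with the default limiter
	fmt.Println(limiter.When(key))        // 10ms
	fmt.Println(limiter.NumRequeues(key)) // 2

	// Forget resets the failure count. This is what the old code did on every
	// immediate sync, wiping out the accumulated backoff for the key; the new
	// code instead enqueues with a zero delay and leaves the count intact.
	limiter.Forget(key)
	fmt.Println(limiter.NumRequeues(key)) // 0 again
}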
@@ -495,7 +497,11 @@ func (jm *JobController) syncJob(key string) (bool, error) {
 	var failureMessage string
 
 	jobHaveNewFailure := failed > job.Status.Failed
-	exceedsBackoffLimit := jobHaveNewFailure && (int32(previousRetry)+1 > *job.Spec.BackoffLimit)
+	// new failures happen when status does not reflect the failures and active
+	// is different than parallelism, otherwise the previous controller loop
+	// failed updating status so even if we pick up failure it is not a new one
+	exceedsBackoffLimit := jobHaveNewFailure && (active != *job.Spec.Parallelism) &&
+		(int32(previousRetry)+1 > *job.Spec.BackoffLimit)
 
 	if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
 		// check if the number of pod restart exceeds backoff (for restart OnFailure only)
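A hedged reading of the new guard, with hypothetical numbers rather than anything from the PR: when the previous sync observed a failure but could not persist status, the controller re-observes the same failed pod on the next sync; in that case active still equals parallelism, so the stale failure is not counted against the backoff limit a second time.

package main

import "fmt"

// exceedsBackoffLimit restates the predicate above in isolation, purely for
// illustration; the real controller derives these values from the Job object
// and its pods.
func exceedsBackoffLimit(failed, statusFailed, active, parallelism, previousRetry, backoffLimit int32) bool {
	jobHaveNewFailure := failed > statusFailed
	return jobHaveNewFailure && (active != parallelism) &&
		(previousRetry+1 > backoffLimit)
}

func main() {
	// Genuinely new failure: status lags behind and a pod slot is empty,
	// so active (0) differs from parallelism (1). Counts toward the limit.
	fmt.Println(exceedsBackoffLimit(2, 1, 0, 1, 1, 1)) // true

	// The previous loop failed to update status: the failure is stale and
	// active still matches parallelism, so it is not counted again.
	fmt.Println(exceedsBackoffLimit(2, 1, 1, 1, 1, 1)) // false
}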
@@ -813,7 +819,20 @@ func (jm *JobController) manageJob(activePods []*v1.Pod, succeeded int32, job *b
 }
 
 func (jm *JobController) updateJobStatus(job *batch.Job) error {
-	_, err := jm.kubeClient.BatchV1().Jobs(job.Namespace).UpdateStatus(job)
+	jobClient := jm.kubeClient.BatchV1().Jobs(job.Namespace)
+	var err error
+	for i := 0; i <= statusUpdateRetries; i = i + 1 {
+		var newJob *batch.Job
+		newJob, err = jobClient.Get(job.Name, metav1.GetOptions{})
+		if err != nil {
+			break
+		}
+		newJob.Status = job.Status
+		if _, err = jobClient.UpdateStatus(newJob); err == nil {
+			break
+		}
+	}
+
 	return err
 }
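Note that statusUpdateRetries bounds this loop to four attempts (i runs from 0 through 3). For comparison, here is a hedged sketch of the stock client-go idiom for the same get-mutate-update problem, using the context-free Get/UpdateStatus signatures of this PR's era; unlike the loop above, which retries on any UpdateStatus error, RetryOnConflict retries only on 409 Conflict responses.

package jobutil

import (
	batch "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	batchv1client "k8s.io/client-go/kubernetes/typed/batch/v1"
	"k8s.io/client-go/util/retry"
)

// updateJobStatusOnConflict is a sketch, not the controller's code: it copies
// the desired status onto a freshly fetched Job and retries only on conflicts.
func updateJobStatusOnConflict(jobClient batchv1client.JobInterface, job *batch.Job) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		newJob, err := jobClient.Get(job.Name, metav1.GetOptions{})
		if err != nil {
			return err
		}
		newJob.Status = job.Status
		_, err = jobClient.UpdateStatus(newJob)
		return err
	})
}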

4 changes: 2 additions & 2 deletions pkg/controller/job/job_controller_test.go
@@ -412,8 +412,8 @@ func TestSyncJobPastDeadline(t *testing.T) {
 		},
 		"activeDeadlineSeconds with backofflimit reach": {
 			1, 1, 1, 10, 0,
-			1, 0, 2,
-			true, 1, 0, 0, 3, "BackoffLimitExceeded",
+			0, 0, 1,
+			true, 0, 0, 0, 1, "BackoffLimitExceeded",
 		},
 	}

18 changes: 13 additions & 5 deletions test/e2e/apps/job.go
@@ -174,19 +174,27 @@ var _ = SIGDescribe("Job", func() {
 
 	It("should exceed backoffLimit", func() {
 		By("Creating a job")
-		job := framework.NewTestJob("fail", "backofflimit", v1.RestartPolicyNever, 1, 1, nil, 0)
+		backoff := 1
+		job := framework.NewTestJob("fail", "backofflimit", v1.RestartPolicyNever, 1, 1, nil, int32(backoff))
 		job, err := framework.CreateJob(f.ClientSet, f.Namespace.Name, job)
 		Expect(err).NotTo(HaveOccurred())
 		By("Ensuring job exceed backofflimit")
 
 		err = framework.WaitForJobFailure(f.ClientSet, f.Namespace.Name, job.Name, framework.JobTimeout, "BackoffLimitExceeded")
 		Expect(err).NotTo(HaveOccurred())
 
-		By("Checking that only one pod created and status is failed")
+		By(fmt.Sprintf("Checking that %d pod created and status is failed", backoff+1))
 		pods, err := framework.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name)
 		Expect(err).NotTo(HaveOccurred())
-		Expect(pods.Items).To(HaveLen(1))
-		pod := pods.Items[0]
-		Expect(pod.Status.Phase).To(Equal(v1.PodFailed))
+		// Expect(pods.Items).To(HaveLen(backoff + 1))
+		// due to NumRequeues not being stable enough, especially with failed status
+		// updates we need to allow more than backoff+1
+		// TODO revert this back to above when https://github.com/kubernetes/kubernetes/issues/64787 gets fixed
+		if len(pods.Items) < backoff+1 {
+			framework.Failf("Not enough pod created expected at least %d, got %#v", backoff+1, pods.Items)
+		}
+		for _, pod := range pods.Items {
+			Expect(pod.Status.Phase).To(Equal(v1.PodFailed))
+		}
 	})
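A small aside, not part of the merged change: since the test can only assert a lower bound until the linked flakiness issue is fixed, the at-least check could also be written with Gomega's numeric matcher, keeping the assertion style consistent with the surrounding Expect calls. A hypothetical in-test fragment (Gomega is dot-imported in this e2e suite):

// Hypothetical alternative to the framework.Failf check above:
Expect(len(pods.Items)).To(BeNumerically(">=", backoff+1))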
})