Fix job backoff limit for restart policy Never #93779

Merged
5 changes: 1 addition & 4 deletions pkg/controller/job/job_controller.go
@@ -467,9 +467,6 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 		return true, nil
 	}
 
-	// retrieve the previous number of retry
-	previousRetry := jm.queue.NumRequeues(key)
-
 	// Check the expectations of the job before counting active pods, otherwise a new pod can sneak in
 	// and update the expectations after we've retrieved active pods from the store. If a new pod enters
 	// the store after we've checked the expectation, the job sync is just deferred till the next relist.
@@ -506,7 +503,7 @@ func (jm *Controller) syncJob(key string) (bool, error) {
 	// is different than parallelism, otherwise the previous controller loop
 	// failed updating status so even if we pick up failure it is not a new one
 	exceedsBackoffLimit := jobHaveNewFailure && (active != *job.Spec.Parallelism) &&
-		(int32(previousRetry)+1 > *job.Spec.BackoffLimit)
+		(failed > *job.Spec.BackoffLimit)
 
 	if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) {
 		// check if the number of pod restart exceeds backoff (for restart OnFailure only)
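The substance of the fix: with restartPolicy: Never a container is never restarted in place, so every retry materializes as a brand-new failed pod, and the job's failed-pod count is itself the retry count. The old check derived retries from jm.queue.NumRequeues(key), which also grows on unrelated requeues (for example after a failed status update) and therefore drifted from the real number of failures. A minimal sketch of the new predicate as a standalone function (parameter names are illustrative, not the controller's actual locals):

	// exceedsBackoffLimitSketch reports whether a Job with restartPolicy: Never
	// has used up its retry budget. Under Never, failed pods map one-to-one
	// onto retries, so comparing `failed` against backoffLimit is exact.
	// Illustrative sketch only; the real check lives inline in syncJob.
	func exceedsBackoffLimitSketch(jobHaveNewFailure bool, active, parallelism, failed, backoffLimit int32) bool {
		// Only treat a failure as new when active != parallelism; otherwise the
		// previous sync loop failed to update status, so the failure was already
		// counted on an earlier pass.
		return jobHaveNewFailure && active != parallelism && failed > backoffLimit
	}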
104 changes: 104 additions & 0 deletions pkg/controller/job/job_controller_test.go
@@ -1521,3 +1521,107 @@ func TestJobBackoffForOnFailure(t *testing.T) {
		})
	}
}

func TestJobBackoffOnRestartPolicyNever(t *testing.T) {
	jobConditionFailed := batch.JobFailed

	testCases := map[string]struct {
		// job setup
		parallelism  int32
		completions  int32
		backoffLimit int32

		// pod setup
		activePodsPhase v1.PodPhase
		activePods      int32
		failedPods      int32

		// expectations
		isExpectingAnError      bool
		jobKeyForget            bool
		expectedActive          int32
		expectedSucceeded       int32
		expectedFailed          int32
		expectedCondition       *batch.JobConditionType
		expectedConditionReason string
	}{
		"not enough failures with backoffLimit 0 - single pod": {
			1, 1, 0,
			v1.PodRunning, 1, 0,
			false, true, 1, 0, 0, nil, "",
		},
		"not enough failures with backoffLimit 1 - single pod": {
			1, 1, 1,
			"", 0, 1,
			true, false, 1, 0, 1, nil, "",
		},
		"too many failures with backoffLimit 1 - single pod": {
			1, 1, 1,
			"", 0, 2,
			false, true, 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded",
		},
		"not enough failures with backoffLimit 6 - multiple pods": {
			2, 2, 6,
			v1.PodRunning, 1, 6,
			true, false, 2, 0, 6, nil, "",
		},
		"too many failures with backoffLimit 6 - multiple pods": {
			2, 2, 6,
			"", 0, 7,
			false, true, 0, 0, 7, &jobConditionFailed, "BackoffLimitExceeded",
		},
	}

	for name, tc := range testCases {
		t.Run(name, func(t *testing.T) {
			// job manager setup
			clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
			manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
			fakePodControl := controller.FakePodControl{}
			manager.podControl = &fakePodControl
			manager.podStoreSynced = alwaysReady
			manager.jobStoreSynced = alwaysReady
			var actual *batch.Job
			manager.updateHandler = func(job *batch.Job) error {
				actual = job
				return nil
			}

			// job & pods setup
			job := newJob(tc.parallelism, tc.completions, tc.backoffLimit)
			job.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyNever
			sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
			podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer()
			for _, pod := range newPodList(tc.failedPods, v1.PodFailed, job) {
				podIndexer.Add(&pod)
			}
			for _, pod := range newPodList(tc.activePods, tc.activePodsPhase, job) {
				podIndexer.Add(&pod)
			}

			// run
			forget, err := manager.syncJob(testutil.GetKey(job, t))

			if (err != nil) != tc.isExpectingAnError {
				t.Errorf("unexpected error syncing job. Got %#v, isExpectingAnError: %v\n", err, tc.isExpectingAnError)
			}
			if forget != tc.jobKeyForget {
				t.Errorf("unexpected forget value. Expected %v, saw %v\n", tc.jobKeyForget, forget)
			}
			// validate status
			if actual.Status.Active != tc.expectedActive {
				t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active)
			}
			if actual.Status.Succeeded != tc.expectedSucceeded {
				t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded)
			}
			if actual.Status.Failed != tc.expectedFailed {
				t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed)
			}
			// validate conditions
			if tc.expectedCondition != nil && !getCondition(actual, *tc.expectedCondition, tc.expectedConditionReason) {
				t.Errorf("expected completion condition. Got %#v", actual.Status.Conditions)
			}
		})
	}
}
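Reading the table: failed > backoffLimit is the new failure predicate, so "backoffLimit 1, 2 failed pods" flips the Job to Failed with reason BackoffLimitExceeded, while "backoffLimit 1, 1 failed pod" only requeues, which is why that row expects a sync error and forget == false. To exercise just this test locally, the standard invocation should work (the package path assumes the usual kubernetes repo layout):

	go test ./pkg/controller/job/... -run TestJobBackoffOnRestartPolicyNever -v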
8 changes: 1 addition & 7 deletions test/e2e/apps/job.go
@@ -246,13 +246,7 @@ var _ = SIGDescribe("Job", func() {
 		ginkgo.By(fmt.Sprintf("Checking that %d pod created and status is failed", backoff+1))
 		pods, err := e2ejob.GetJobPods(f.ClientSet, f.Namespace.Name, job.Name)
 		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
-		// gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
-		// due to NumRequeus not being stable enough, especially with failed status
-		// updates we need to allow more than backoff+1
-		// TODO revert this back to above when https://github.com/kubernetes/kubernetes/issues/64787 gets fixed
-		if len(pods.Items) < backoff+1 {
-			framework.Failf("Not enough pod created expected at least %d, got %#v", backoff+1, pods.Items)
-		}
+		gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
 		for _, pod := range pods.Items {
 			framework.ExpectEqual(pod.Status.Phase, v1.PodFailed)
 		}
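With the flaky lower bound replaced by an exact assertion, the e2e contract is sharp again: a Job whose pod template uses restartPolicy: Never and backoffLimit: N ends with the BackoffLimitExceeded condition after exactly N+1 failed pods. A hedged client-go sketch of such a Job (the package, function name, image, and command are placeholders, not what the e2e framework actually creates):

	package demo

	import (
		batchv1 "k8s.io/api/batch/v1"
		corev1 "k8s.io/api/core/v1"
		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	)

	// alwaysFailingJob builds a Job whose pods always fail, so the controller
	// retries it until backoffLimit is exhausted: with restartPolicy Never that
	// means exactly backoffLimit+1 failed pods before BackoffLimitExceeded.
	func alwaysFailingJob(backoffLimit int32) *batchv1.Job {
		return &batchv1.Job{
			ObjectMeta: metav1.ObjectMeta{Name: "backoff-demo"},
			Spec: batchv1.JobSpec{
				BackoffLimit: &backoffLimit,
				Template: corev1.PodTemplateSpec{
					Spec: corev1.PodSpec{
						RestartPolicy: corev1.RestartPolicyNever,
						Containers: []corev1.Container{{
							Name:    "fail",
							Image:   "busybox",
							Command: []string{"sh", "-c", "exit 1"}, // guaranteed failure
						}},
					},
				},
			},
		}
	}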