From 63e217d562cc4b10088534d698bbb0c23ca6f7c4 Mon Sep 17 00:00:00 2001 From: Aldo Culquicondor Date: Fri, 7 Jul 2023 14:08:19 -0400 Subject: [PATCH] Only declare job as finished after removing all finalizers Change-Id: Id4b01b0e6fabe24134e57e687356e0fc613cead4 --- pkg/controller/job/job_controller.go | 11 +++++------ test/integration/job/job_test.go | 4 +++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go index e1f90dfd237f1..9771867fa3092 100644 --- a/pkg/controller/job/job_controller.go +++ b/pkg/controller/job/job_controller.go @@ -750,11 +750,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rEr var finishedCondition *batch.JobCondition jobHasNewFailure := failed > job.Status.Failed - // new failures happen when status does not reflect the failures and active - // is different than parallelism, otherwise the previous controller loop - // failed updating status so even if we pick up failure it is not a new one - exceedsBackoffLimit := jobHasNewFailure && (active != *job.Spec.Parallelism) && - (failed > *job.Spec.BackoffLimit) + exceedsBackoffLimit := job.Spec.BackoffLimit != nil && failed > *job.Spec.BackoffLimit if exceedsBackoffLimit || pastBackoffLimitOnFailure(&job, pods) { // check if the number of pod restart exceeds backoff (for restart OnFailure only) @@ -1019,6 +1015,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job if cleanUncountedPodsWithoutFinalizers(&job.Status, uidsWithFinalizer) { needsFlush = true } + reachedMaxUncountedPods := false for _, pod := range pods { if !hasJobTrackingFinalizer(pod) || expectedRmFinalizers.Has(string(pod.UID)) { continue @@ -1061,6 +1058,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job // // The job will be synced again because the Job status and Pod updates // will put the Job back to the work queue. + reachedMaxUncountedPods = true break } } @@ -1077,7 +1075,8 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, needsFlush); err != nil { return err } - if jm.enactJobFinished(job, finishedCond) { + jobFinished := !reachedMaxUncountedPods && jm.enactJobFinished(job, finishedCond) + if jobFinished { needsFlush = true } if needsFlush { diff --git a/test/integration/job/job_test.go b/test/integration/job/job_test.go index a6b9ddc6cc14c..687300aaab027 100644 --- a/test/integration/job/job_test.go +++ b/test/integration/job/job_test.go @@ -583,7 +583,9 @@ func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) { func TestFinalizersClearedWhenBackoffLimitExceeded(t *testing.T) { defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, true)() - + // Set a maximum number of uncounted pods below parallelism, to ensure it + // doesn't affect the termination of pods. + t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 50)) closeFn, restConfig, clientSet, ns := setup(t, "simple") defer closeFn() ctx, cancel := startJobControllerAndWaitForCaches(restConfig)