From 92a0f58e2bf94de05938a70d94dabe19c50e21c7 Mon Sep 17 00:00:00 2001 From: Aldo Culquicondor Date: Fri, 7 Jul 2023 14:08:19 -0400 Subject: [PATCH] Only declare job as finished after removing all finalizers Change-Id: Id4b01b0e6fabe24134e57e687356e0fc613cead4 --- pkg/controller/job/job_controller.go | 10 ++++------ test/integration/job/job_test.go | 3 +++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pkg/controller/job/job_controller.go b/pkg/controller/job/job_controller.go index 505ecac1197c..a4df1e28c58c 100644 --- a/pkg/controller/job/job_controller.go +++ b/pkg/controller/job/job_controller.go @@ -768,11 +768,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) { var finishedCondition *batch.JobCondition jobHasNewFailure := failed > job.Status.Failed - // new failures happen when status does not reflect the failures and active - // is different than parallelism, otherwise the previous controller loop - // failed updating status so even if we pick up failure it is not a new one - exceedsBackoffLimit := jobHasNewFailure && (active != *job.Spec.Parallelism) && - (failed > *job.Spec.BackoffLimit) + exceedsBackoffLimit := failed > *job.Spec.BackoffLimit if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) { if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil { @@ -984,6 +980,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job needsFlush = true } podFailureCountByPolicyAction := map[string]int{} + reachedMaxUncountedPods := false for _, pod := range pods { if !hasJobTrackingFinalizer(pod) || expectedRmFinalizers.Has(string(pod.UID)) { // This pod was processed in a previous sync. @@ -1048,6 +1045,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job // // The job will be synced again because the Job status and Pod updates // will put the Job back to the work queue. + reachedMaxUncountedPods = true break } } @@ -1076,7 +1074,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush, newBackoffRecord); err != nil { return err } - jobFinished := jm.enactJobFinished(job, finishedCond) + jobFinished := !reachedMaxUncountedPods && jm.enactJobFinished(job, finishedCond) if jobFinished { needsFlush = true } diff --git a/test/integration/job/job_test.go b/test/integration/job/job_test.go index 1b71ced1674b..d0cb4109920a 100644 --- a/test/integration/job/job_test.go +++ b/test/integration/job/job_test.go @@ -1340,6 +1340,9 @@ func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) { } func TestFinalizersClearedWhenBackoffLimitExceeded(t *testing.T) { + // Set a maximum number of uncounted pods below parallelism, to ensure it + // doesn't affect the termination of pods. + t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 50)) closeFn, restConfig, clientSet, ns := setup(t, "simple") defer closeFn() ctx, cancel := startJobControllerAndWaitForCaches(restConfig)