Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable the "Retriable and non-retriable pod failures for jobs" feature into beta #113360

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 1 addition & 2 deletions pkg/apis/batch/types.go
Expand Up @@ -440,8 +440,7 @@ const (
// JobFailed means the job has failed its execution.
JobFailed JobConditionType = "Failed"
// FailureTarget means the job is about to fail its execution.
// The constant is to be renamed once the name is accepted within the KEP-3329.
AlphaNoCompatGuaranteeJobFailureTarget JobConditionType = "FailureTarget"
JobFailureTarget JobConditionType = "FailureTarget"
)

// JobCondition describes current state of a job.
Expand Down
8 changes: 4 additions & 4 deletions pkg/apis/batch/v1/defaults_test.go
Expand Up @@ -52,7 +52,7 @@ func TestSetDefaultJob(t *testing.T) {
Action: batchv1.PodFailurePolicyActionFailJob,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
{
Expand All @@ -75,7 +75,7 @@ func TestSetDefaultJob(t *testing.T) {
Action: batchv1.PodFailurePolicyActionFailJob,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
},
},
},
Expand All @@ -96,7 +96,7 @@ func TestSetDefaultJob(t *testing.T) {
Action: batchv1.PodFailurePolicyActionFailJob,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
{
Expand All @@ -120,7 +120,7 @@ func TestSetDefaultJob(t *testing.T) {
Action: batchv1.PodFailurePolicyActionFailJob,
OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
Expand Down
16 changes: 8 additions & 8 deletions pkg/apis/batch/validation/validation_test.go
Expand Up @@ -118,7 +118,7 @@ func TestValidateJob(t *testing.T) {
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Type: api.DisruptionTarget,
Status: api.ConditionTrue,
},
},
Expand Down Expand Up @@ -456,7 +456,7 @@ func TestValidateJob(t *testing.T) {
},
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Type: api.DisruptionTarget,
Status: api.ConditionTrue,
},
},
Expand Down Expand Up @@ -558,7 +558,7 @@ func TestValidateJob(t *testing.T) {
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Type: api.DisruptionTarget,
},
},
},
Expand All @@ -577,7 +577,7 @@ func TestValidateJob(t *testing.T) {
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Type: api.DisruptionTarget,
Status: "UnknownStatus",
},
},
Expand Down Expand Up @@ -968,7 +968,7 @@ func TestValidateJobUpdate(t *testing.T) {
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Type: api.DisruptionTarget,
Status: api.ConditionTrue,
},
},
Expand All @@ -993,7 +993,7 @@ func TestValidateJobUpdate(t *testing.T) {
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Type: api.DisruptionTarget,
Status: api.ConditionTrue,
},
},
Expand All @@ -1007,7 +1007,7 @@ func TestValidateJobUpdate(t *testing.T) {
Action: batch.PodFailurePolicyActionCount,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Type: api.DisruptionTarget,
Status: api.ConditionTrue,
},
},
Expand All @@ -1030,7 +1030,7 @@ func TestValidateJobUpdate(t *testing.T) {
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: api.AlphaNoCompatGuaranteeDisruptionTarget,
Type: api.DisruptionTarget,
Status: api.ConditionTrue,
},
},
Expand Down
5 changes: 2 additions & 3 deletions pkg/apis/core/types.go
Expand Up @@ -2433,10 +2433,9 @@ const (
PodReasonSchedulingGated = "SchedulingGated"
// ContainersReady indicates whether all containers in the pod are ready.
ContainersReady PodConditionType = "ContainersReady"
// AlphaNoCompatGuaranteeDisruptionTarget indicates the pod is about to be terminated due to a
// DisruptionTarget indicates the pod is about to be terminated due to a
// disruption (such as preemption, eviction API or garbage-collection).
// The constant is to be renamed once the name is accepted within the KEP-3329.
AlphaNoCompatGuaranteeDisruptionTarget PodConditionType = "DisruptionTarget"
DisruptionTarget PodConditionType = "DisruptionTarget"
)

// PodCondition represents pod's condition
Expand Down
6 changes: 3 additions & 3 deletions pkg/controller/disruption/disruption.go
Expand Up @@ -755,7 +755,7 @@ func (dc *DisruptionController) syncStalePodDisruption(ctx context.Context, key
WithStatus(corev1apply.PodStatus()).
WithResourceVersion(pod.ResourceVersion)
podApply.Status.WithConditions(corev1apply.PodCondition().
WithType(v1.AlphaNoCompatGuaranteeDisruptionTarget).
WithType(v1.DisruptionTarget).
WithStatus(v1.ConditionFalse).
WithLastTransitionTime(metav1.Now()),
)
Expand Down Expand Up @@ -998,11 +998,11 @@ func (dc *DisruptionController) nonTerminatingPodHasStaleDisruptionCondition(pod
if pod.DeletionTimestamp != nil {
return false, 0
}
_, cond := apipod.GetPodCondition(&pod.Status, v1.AlphaNoCompatGuaranteeDisruptionTarget)
_, cond := apipod.GetPodCondition(&pod.Status, v1.DisruptionTarget)
// Pod disruption conditions added by kubelet are never considered stale because the condition might take
// arbitrarily long before the pod is terminating (has deletion timestamp). Also, pod conditions present
// on pods in terminal phase are not stale to avoid unnecessary status updates.
if cond == nil || cond.Status != v1.ConditionTrue || cond.Reason == v1.AlphaNoCompatGuaranteePodReasonTerminationByKubelet || apipod.IsPodPhaseTerminal(pod.Status.Phase) {
if cond == nil || cond.Status != v1.ConditionTrue || cond.Reason == v1.PodReasonTerminationByKubelet || apipod.IsPodPhaseTerminal(pod.Status.Phase) {
return false, 0
}
waitFor := dc.stalePodDisruptionTimeout - dc.clock.Since(cond.LastTransitionTime.Time)
Expand Down
16 changes: 8 additions & 8 deletions pkg/controller/disruption/disruption_test.go
Expand Up @@ -1403,7 +1403,7 @@ func TestStalePodDisruption(t *testing.T) {
Status: v1.PodStatus{
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
LastTransitionTime: metav1.Time{Time: now},
},
Expand All @@ -1413,7 +1413,7 @@ func TestStalePodDisruption(t *testing.T) {
timePassed: 2*time.Minute + time.Second,
wantConditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionFalse,
},
},
Expand All @@ -1427,7 +1427,7 @@ func TestStalePodDisruption(t *testing.T) {
Status: v1.PodStatus{
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
LastTransitionTime: metav1.Time{Time: now},
},
Expand All @@ -1437,7 +1437,7 @@ func TestStalePodDisruption(t *testing.T) {
timePassed: 2*time.Minute - time.Second,
wantConditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
Expand All @@ -1452,7 +1452,7 @@ func TestStalePodDisruption(t *testing.T) {
Status: v1.PodStatus{
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
LastTransitionTime: metav1.Time{Time: now},
},
Expand All @@ -1462,7 +1462,7 @@ func TestStalePodDisruption(t *testing.T) {
timePassed: 2*time.Minute + time.Second,
wantConditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
Expand All @@ -1487,7 +1487,7 @@ func TestStalePodDisruption(t *testing.T) {
Status: v1.PodStatus{
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionFalse,
},
},
Expand All @@ -1496,7 +1496,7 @@ func TestStalePodDisruption(t *testing.T) {
timePassed: 2*time.Minute + time.Second,
wantConditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionFalse,
},
},
Expand Down
6 changes: 3 additions & 3 deletions pkg/controller/job/job_controller.go
Expand Up @@ -758,12 +758,12 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (forget bool, rEr
(failed > *job.Spec.BackoffLimit)

if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.AlphaNoCompatGuaranteeJobFailureTarget); failureTargetCondition != nil {
if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil {
finishedCondition = newFailedConditionForFailureTarget(failureTargetCondition)
} else if failJobMessage := getFailJobMessage(&job, pods, uncounted.Failed()); failJobMessage != nil {
if uncounted != nil {
// Prepare the interim FailureTarget condition to record the failure message before the finalizers (allowing removal of the pods) are removed.
finishedCondition = newCondition(batch.AlphaNoCompatGuaranteeJobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
finishedCondition = newCondition(batch.JobFailureTarget, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
} else {
// Prepare the Failed job condition for the legacy path without finalizers (don't use the interim FailureTarget condition).
finishedCondition = newCondition(batch.JobFailed, v1.ConditionTrue, jobConditionReasonPodFailurePolicy, *failJobMessage)
Expand Down Expand Up @@ -1090,7 +1090,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
job.Status.CompletedIndexes = succeededIndexes.String()
}
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
if finishedCond != nil && finishedCond.Type == batch.AlphaNoCompatGuaranteeJobFailureTarget {
if finishedCond != nil && finishedCond.Type == batch.JobFailureTarget {

// Append the interim FailureTarget condition to update the job status with before finalizers are removed.
job.Status.Conditions = append(job.Status.Conditions, *finishedCond)
Expand Down
16 changes: 8 additions & 8 deletions pkg/controller/job/job_controller_test.go
Expand Up @@ -2192,7 +2192,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
Status: batch.JobStatus{
Conditions: []batch.JobCondition{
{
Type: batch.AlphaNoCompatGuaranteeJobFailureTarget,
Type: batch.JobFailureTarget,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1",
Expand Down Expand Up @@ -2245,7 +2245,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
Status: batch.JobStatus{
Conditions: []batch.JobCondition{
{
Type: batch.AlphaNoCompatGuaranteeJobFailureTarget,
Type: batch.JobFailureTarget,
Status: v1.ConditionTrue,
Reason: "PodFailurePolicy",
Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1",
Expand Down Expand Up @@ -2751,7 +2751,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
Expand All @@ -2769,7 +2769,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
Status: v1.ConditionTrue,
},
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
Expand Down Expand Up @@ -2797,7 +2797,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
Expand All @@ -2811,7 +2811,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
Expand Down Expand Up @@ -2839,7 +2839,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
Action: batch.PodFailurePolicyActionFailJob,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
Expand All @@ -2853,7 +2853,7 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.AlphaNoCompatGuaranteeDisruptionTarget,
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
Expand Down