Allow deletion of unhealthy pods if enough healthy
Currently, if you have a PDB with 0 disruptions
available and you attempt to evict an unhealthy
pod, the eviction request will always fail. This
is because the eviction API does not currently
take into account that the pod being removed is
the unhealthy one.

This commit allows evicting an unhealthy pod as
long as there are enough healthy pods to satisfy
the PDB's requirements. To protect against race
conditions, a ResourceVersion constraint is
enforced. This ensures the eviction fails if the
target pod becomes healthy in the meantime, which
could otherwise disrupt too many pods at once.

This commit also eliminates a superfluous call to
DeepCopy for the deleteOptions struct.
michaelgugino authored and tnozicka committed Jul 28, 2021
1 parent 9f3d747 commit d9d68b7
Showing 3 changed files with 165 additions and 18 deletions.
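Before the diffs, a minimal client-side sketch of the behavior described in the commit message: how a caller might request an eviction and retry when the API server answers 429 TooManyRequests, which this change can now return if the unhealthy pod's precondition-guarded delete loses a race. The helper name evictWithRetry, the retry count, and the backoff are illustrative assumptions, not part of this commit; the Evict call is client-go's policy/v1beta1 eviction API.

// Hypothetical client-side sketch; names and retry policy are assumptions.
// Only the Evict call and the 429 semantics come from this commit's change.
package evictexample

import (
	"context"
	"fmt"
	"time"

	policyv1beta1 "k8s.io/api/policy/v1beta1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

func evictWithRetry(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
	eviction := &policyv1beta1.Eviction{
		ObjectMeta:    metav1.ObjectMeta{Name: name, Namespace: namespace},
		DeleteOptions: metav1.NewDeleteOptions(0),
	}
	for attempt := 0; attempt < 5; attempt++ {
		err := client.CoreV1().Pods(namespace).Evict(ctx, eviction)
		if err == nil {
			return nil
		}
		// A 429 means the PDB is still being processed, or the unhealthy pod we
		// targeted changed (e.g. became ready) before the precondition-guarded
		// delete ran; back off and try again.
		if apierrors.IsTooManyRequests(err) {
			time.Sleep(2 * time.Second)
			continue
		}
		return err
	}
	return fmt.Errorf("pod %s/%s was not evicted after retries", namespace, name)
}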
1 change: 1 addition & 0 deletions pkg/registry/core/pod/storage/BUILD
@@ -14,6 +14,7 @@ go_test(
],
embed = [":go_default_library"],
deps = [
"//pkg/api/pod:go_default_library",
"//pkg/apis/core:go_default_library",
"//pkg/apis/policy:go_default_library",
"//pkg/registry/registrytest:go_default_library",
71 changes: 55 additions & 16 deletions pkg/registry/core/pod/storage/eviction.go
@@ -33,6 +33,7 @@ import (
"k8s.io/apiserver/pkg/util/dryrun"
policyclient "k8s.io/client-go/kubernetes/typed/policy/v1beta1"
"k8s.io/client-go/util/retry"
podutil "k8s.io/kubernetes/pkg/api/pod"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/policy"
)
@@ -145,19 +146,18 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
}

// the PDB can be ignored, so delete the pod
deletionOptions := originalDeleteOptions.DeepCopy()
deleteOptions := originalDeleteOptions

// We should check if resourceVersion is already set by the requestor
// as it might be older than the pod we just fetched and should be
// honored.
if shouldEnforceResourceVersion(pod) && resourceVersionIsUnset(originalDeleteOptions) {
// Set deletionOptions.Preconditions.ResourceVersion to ensure we're not
// Set deleteOptions.Preconditions.ResourceVersion to ensure we're not
// racing with another PDB-impacting process elsewhere.
if deletionOptions.Preconditions == nil {
deletionOptions.Preconditions = &metav1.Preconditions{}
}
deletionOptions.Preconditions.ResourceVersion = &pod.ResourceVersion
deleteOptions = deleteOptions.DeepCopy()
setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
}
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deletionOptions)
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
if err != nil {
return err
}
@@ -181,6 +181,8 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje

var rtStatus *metav1.Status
var pdbName string
updateDeletionOptions := false

err = func() error {
pdbs, err := r.getPodDisruptionBudgets(ctx, pod)
if err != nil {
@@ -201,6 +203,13 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje

pdb := &pdbs[0]
pdbName = pdb.Name

// If the pod is not ready, it doesn't count towards healthy and we should not decrement
if !podutil.IsPodReady(pod) && pdb.Status.CurrentHealthy >= pdb.Status.DesiredHealthy && pdb.Status.DesiredHealthy > 0 {
updateDeletionOptions = true
return nil
}

refresh := false
err = retry.RetryOnConflict(EvictionsRetry, func() error {
if refresh {
@@ -232,18 +241,43 @@ func (r *EvictionREST) Create(ctx context.Context, name string, obj runtime.Obje
return rtStatus, nil
}

// At this point there was either no PDB or we succeeded in decrementing
// At this point there was either no PDB or we succeeded in decrementing or
// the pod was unready and we have enough healthy replicas

deleteOptions := originalDeleteOptions

// Set deleteOptions.Preconditions.ResourceVersion to ensure
// the pod hasn't become ready since we calculated that it was safe to evict
if updateDeletionOptions {
// Take a copy so we can compare to client-provided Options later.
deleteOptions = deleteOptions.DeepCopy()
setPreconditionsResourceVersion(deleteOptions, &pod.ResourceVersion)
}

// Try the delete
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, originalDeleteOptions.DeepCopy())
_, _, err = r.store.Delete(ctx, eviction.Name, rest.ValidateAllObjectFunc, deleteOptions)
if err != nil {
if errors.IsConflict(err) && updateDeletionOptions &&
(originalDeleteOptions.Preconditions == nil || originalDeleteOptions.Preconditions.ResourceVersion == nil) {
// If we hit a resource conflict error after adding a ResourceVersion precondition that the
// original deletion options did not specify, send back TooManyRequests so clients will retry.
return nil, createTooManyRequestsError(pdbName)
}
return nil, err
}

// Success!
return &metav1.Status{Status: metav1.StatusSuccess}, nil
}

func setPreconditionsResourceVersion(deleteOptions *metav1.DeleteOptions, resourceVersion *string) {
if deleteOptions.Preconditions == nil {
deleteOptions.Preconditions = &metav1.Preconditions{}
}
deleteOptions.Preconditions.ResourceVersion = resourceVersion
}

// canIgnorePDB returns true for pod conditions that allow the pod to be deleted
// without checking PDBs.
func canIgnorePDB(pod *api.Pod) bool {
@@ -268,16 +302,21 @@ func resourceVersionIsUnset(options *metav1.DeleteOptions) bool {
return options.Preconditions == nil || options.Preconditions.ResourceVersion == nil
}

func createTooManyRequestsError(name string) error {
// TODO(mml): Add a Retry-After header. Once there are time-based
// budgets, we can sometimes compute a sensible suggested value. But
// even without that, we can give a suggestion (10 minutes?) that
// prevents well-behaved clients from hammering us.
err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", name)})
return err
}

// checkAndDecrement checks if the provided PodDisruptionBudget allows any disruption.
func (r *EvictionREST) checkAndDecrement(namespace string, podName string, pdb policyv1beta1.PodDisruptionBudget, dryRun bool) error {
if pdb.Status.ObservedGeneration < pdb.Generation {
// TODO(mml): Add a Retry-After header. Once there are time-based
// budgets, we can sometimes compute a sensible suggested value. But
// even without that, we can give a suggestion (10 minutes?) that
// prevents well-behaved clients from hammering us.
err := errors.NewTooManyRequests("Cannot evict pod as it would violate the pod's disruption budget.", 0)
err.ErrStatus.Details.Causes = append(err.ErrStatus.Details.Causes, metav1.StatusCause{Type: "DisruptionBudget", Message: fmt.Sprintf("The disruption budget %s is still being processed by the server.", pdb.Name)})
return err

return createTooManyRequestsError(pdb.Name)
}
if pdb.Status.DisruptionsAllowed < 0 {
return errors.NewForbidden(policy.Resource("poddisruptionbudget"), pdb.Name, fmt.Errorf("pdb disruptions allowed is negative"))
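Read together, the new server-side path above reduces to a small predicate plus a precondition: an unready pod may bypass the DisruptionsAllowed check when the PDB is already satisfied by other pods, and the delete then carries the pod's ResourceVersion so a pod that turned ready in the meantime is not disrupted. A paraphrase of that committed decision (the helper name is illustrative, not code from the tree):

// Paraphrase of the check added in EvictionREST.Create, not literal tree code.
func mayBypassDisruptionBudget(podReady bool, currentHealthy, desiredHealthy int32) bool {
	return !podReady && desiredHealthy > 0 && currentHealthy >= desiredHealthy
}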
111 changes: 109 additions & 2 deletions pkg/registry/core/pod/storage/eviction_test.go
@@ -31,6 +31,7 @@ import (
genericapirequest "k8s.io/apiserver/pkg/endpoints/request"
"k8s.io/apiserver/pkg/registry/rest"
"k8s.io/client-go/kubernetes/fake"
podapi "k8s.io/kubernetes/pkg/api/pod"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/policy"
)
@@ -219,6 +220,7 @@ func TestEvictionIngorePDB(t *testing.T) {
podName string
expectedDeleteCount int
podTerminating bool
prc *api.PodCondition
}{
{
name: "pdbs No disruptions allowed, pod pending, first delete conflict, pod still pending, pod deleted successfully",
Expand Down Expand Up @@ -301,6 +303,100 @@ func TestEvictionIngorePDB(t *testing.T) {
expectedDeleteCount: 1,
podTerminating: true,
},
{
name: "matching pdbs with no disruptions allowed, pod running, pod healthy, unhealthy pod not ours",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod healthy, unhealthy pod is not ours.
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t7", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: true,
podName: "t7",
expectedDeleteCount: 0,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionTrue,
},
},
{
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod unhealthy
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t8", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: false,
podName: "t8",
expectedDeleteCount: 1,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionFalse,
},
},
{
// This case should return the 429 TooManyRequests retry error.
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, resource version conflict",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod unhealthy
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t9", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: true,
podName: "t9",
expectedDeleteCount: 1,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionFalse,
},
},
{
// This case should return the underlying error from the delete call.
name: "matching pdbs with no disruptions allowed, pod running, pod unhealthy, unhealthy pod ours, other error on delete",
pdbs: []runtime.Object{&policyv1beta1.PodDisruptionBudget{
ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"},
Spec: policyv1beta1.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"a": "true"}}},
Status: policyv1beta1.PodDisruptionBudgetStatus{
// This simulates 3 pods desired, our pod unhealthy
DisruptionsAllowed: 0,
CurrentHealthy: 2,
DesiredHealthy: 2,
},
}},
eviction: &policy.Eviction{ObjectMeta: metav1.ObjectMeta{Name: "t10", Namespace: "default"}, DeleteOptions: metav1.NewDeleteOptions(0)},
expectError: true,
podName: "t10",
expectedDeleteCount: 1,
podTerminating: false,
podPhase: api.PodRunning,
prc: &api.PodCondition{
Type: api.PodReady,
Status: api.ConditionFalse,
},
},
}

for _, tc := range testcases {
@@ -323,6 +419,13 @@ func TestEvictionIngorePDB(t *testing.T) {
pod.ObjectMeta.DeletionTimestamp = &currentTime
}

// Setup pod condition
if tc.prc != nil {
if !podapi.UpdatePodCondition(&pod.Status, tc.prc) {
t.Fatalf("Unable to update pod ready condition")
}
}

client := fake.NewSimpleClientset(tc.pdbs...)
evictionRest := newEvictionStorage(ms, client.PolicyV1beta1())

@@ -416,10 +519,14 @@ func (ms *mockStore) mutatorDeleteFunc(count int, options *metav1.DeleteOptions)
// Always return error for this pod
return nil, false, apierrors.NewConflict(resource("tests"), "2", errors.New("message"))
}
if ms.pod.Name == "t6" {
// This pod has a deletionTimestamp and should not raise conflict on delete
if ms.pod.Name == "t6" || ms.pod.Name == "t8" {
// t6: This pod has a deletionTimestamp and should not raise conflict on delete
// t8: This pod should not have a resource conflict.
return nil, true, nil
}
if ms.pod.Name == "t10" {
return nil, false, apierrors.NewBadRequest("test designed to error")
}
if count == 1 {
// This is a hack to ensure that some test pods don't change phase
// but do change resource version
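The new table entries above can be exercised on their own with the standard Go test runner; the command below is illustrative and assumes a kubernetes checkout at the repository root:

go test ./pkg/registry/core/pod/storage/ -run TestEvictionIngorePDB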
