Skip to content

Commit

Permalink
[release-1.12] Surface cpu and mem requests forbidden errors (and oth…
Browse files Browse the repository at this point in the history
…er ones too) in KSVC creation (#14618)

* reconciling the revision so the deployment status is propagated when there is something wrong (like low resources request and limits), since it was just beign propagated when the revision status was nil and the deployment existed

trying to surface replicaset creation errors

* added revision condition active to revision live condition set so is not nil after the revision is created

* removing RevisionConditionActive from Revision ConditionSet since we can have Ready Inactive Revisions (scale to zero) * added docs and tests for this and the replicaset failure propagation

* fixing lint

* adjusted e2e and unit tests for the replica failure erros propagation, improved propagation logic + left todos regarding revision conditionset

* removed todo from revision lifecycle since the discussion has settled

* added test case for revision fast failures when it comes to replicaset failing to create

* fixed resource quota error, now it never waits for progress deadline and it fails fast, so removing the bits where it can go one way or another in the e2e resource_quota_error_test

* finishing the replicaset deployment status failure bubbling up to the revision table test

* removed unused test methods from revision testing package

* adding condition to wait for container creation in the resource quota service creation test

* with some istio cases this could fail with progress deadline exceeded error so adding that case too

* Update resource_quota_error_test.go

* formated the test file

---------

Co-authored-by: Gabriel Freites <gfreites@vmware.com>
Co-authored-by: Gabriel Freites <gabohd@gmail.com>
  • Loading branch information
3 people committed Nov 14, 2023
1 parent 2659cc3 commit d638f49
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 15 deletions.
7 changes: 7 additions & 0 deletions pkg/apis/serving/v1/revision_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package v1
import (
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
net "knative.dev/networking/pkg/apis/networking"
"knative.dev/pkg/kmeta"
Expand Down Expand Up @@ -143,3 +144,9 @@ func (rs *RevisionStatus) IsActivationRequired() bool {
c := revisionCondSet.Manage(rs).GetCondition(RevisionConditionActive)
return c != nil && c.Status != corev1.ConditionTrue
}

// IsReplicaSetFailure returns true if the deployment replicaset failed to create
func (rs *RevisionStatus) IsReplicaSetFailure(deploymentStatus *appsv1.DeploymentStatus) bool {
ds := serving.TransformDeploymentStatus(deploymentStatus)
return ds != nil && ds.GetCondition(serving.DeploymentConditionReplicaSetReady).IsFalse()
}
43 changes: 43 additions & 0 deletions pkg/apis/serving/v1/revision_helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"testing"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"

Expand Down Expand Up @@ -267,3 +268,45 @@ func TestSetRoutingState(t *testing.T) {
t.Error("Expected default value for unparsable annotationm but got:", got)
}
}

func TestIsReplicaSetFailure(t *testing.T) {
revisionStatus := RevisionStatus{}
cases := []struct {
name string
status appsv1.DeploymentStatus
IsReplicaSetFailure bool
}{{
name: "empty deployment status should not be a failure",
status: appsv1.DeploymentStatus{},
}, {
name: "Ready deployment status should not be a failure",
status: appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentAvailable, Status: corev1.ConditionTrue,
}},
},
}, {
name: "ReplicasetFailure true should be a failure",
status: appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionTrue,
}},
},
IsReplicaSetFailure: true,
}, {
name: "ReplicasetFailure false should not be a failure",
status: appsv1.DeploymentStatus{
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionFalse,
}},
},
}}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got, want := revisionStatus.IsReplicaSetFailure(&tc.status), tc.IsReplicaSetFailure; got != want {
t.Errorf("IsReplicaSetFailure = %v, want: %v", got, want)
}
})
}
}
2 changes: 1 addition & 1 deletion pkg/apis/serving/v1/revision_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ const (
ReasonProgressDeadlineExceeded = "ProgressDeadlineExceeded"
)

// RevisionConditionActive is not part of the RevisionConditionSet because we can have Inactive Ready Revisions (scale to zero)
var revisionCondSet = apis.NewLivingConditionSet(
RevisionConditionResourcesAvailable,
RevisionConditionContainerHealthy,
Expand Down Expand Up @@ -171,7 +172,6 @@ func (rs *RevisionStatus) PropagateDeploymentStatus(original *appsv1.DeploymentS
func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodAutoscalerStatus) {
// Reflect the PA status in our own.
cond := ps.GetCondition(autoscalingv1alpha1.PodAutoscalerConditionReady)

rs.ActualReplicas = nil
if ps.ActualScale != nil && *ps.ActualScale >= 0 {
rs.ActualReplicas = ps.ActualScale
Expand Down
6 changes: 6 additions & 0 deletions pkg/reconciler/revision/reconcile_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision)
}
}

// If the replicaset is failing we assume its an error we have to surface
if rev.Status.IsReplicaSetFailure(&deployment.Status) {
rev.Status.PropagateDeploymentStatus(&deployment.Status)
return nil
}

// If a container keeps crashing (no active pods in the deployment although we want some)
if *deployment.Spec.Replicas > 0 && deployment.Status.AvailableReplicas == 0 {
pods, err := c.kubeclient.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{LabelSelector: metav1.FormatLabelSelector(deployment.Spec.Selector)})
Expand Down
28 changes: 28 additions & 0 deletions pkg/reconciler/revision/table_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,34 @@ func TestReconcile(t *testing.T) {
Object: pa("foo", "deploy-timeout", WithReachabilityUnreachable),
}},
Key: "foo/deploy-timeout",
}, {
Name: "revision failure because replicaset and deployment failed",
// This test defines a Revision InProgress with status Deploying but with a
// Deployment with a ReplicaSet failure, so the wanted status update is for
// the Deployment FailedCreate error to bubble up to the Revision
Objects: []runtime.Object{
Revision("foo", "deploy-replicaset-failure",
WithLogURL, MarkActivating("Deploying", ""),
WithRoutingState(v1.RoutingStateActive, fc),
withDefaultContainerStatuses(),
WithRevisionObservedGeneration(1),
MarkContainerHealthyUnknown("Deploying"),
),
pa("foo", "deploy-replicaset-failure", WithReachabilityUnreachable),
replicaFailureDeploy(deploy(t, "foo", "deploy-replicaset-failure"), "I ReplicaSet failed!"),
image("foo", "deploy-replicaset-failure"),
},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: Revision("foo", "deploy-replicaset-failure",
WithLogURL, MarkResourcesUnavailable("FailedCreate", "I ReplicaSet failed!"),
withDefaultContainerStatuses(),
WithRoutingState(v1.RoutingStateActive, fc),
MarkContainerHealthyUnknown("Deploying"),
WithRevisionObservedGeneration(1),
MarkActivating("Deploying", ""),
),
}},
Key: "foo/deploy-replicaset-failure",
}, {
Name: "surface replica failure",
// Test the propagation of FailedCreate from Deployment.
Expand Down
6 changes: 6 additions & 0 deletions pkg/testing/v1/revision.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,12 @@ func MarkDeploying(reason string) RevisionOption {
}
}

func MarkContainerHealthyUnknown(reason string) RevisionOption {
return func(r *v1.Revision) {
r.Status.MarkContainerHealthyUnknown(reason, "")
}
}

// MarkProgressDeadlineExceeded calls the method of the same name on the Revision
// with the message we expect the Revision Reconciler to pass.
func MarkProgressDeadlineExceeded(message string) RevisionOption {
Expand Down
29 changes: 15 additions & 14 deletions test/e2e/resource_quota_error_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,11 @@ func TestResourceQuotaError(t *testing.T) {

clients := test.Setup(t, test.Options{Namespace: "rq-test"})
const (
errorReason = "RevisionFailed"
errorMsgScale = "Initial scale was never achieved"
errorMsgQuota = "forbidden: exceeded quota"
revisionReason = "ProgressDeadlineExceeded"
errorReason = "RevisionFailed"
progressDeadlineReason = "ProgressDeadlineExceeded"
waitReason = "ContainerCreating"
errorMsgQuota = "forbidden: exceeded quota"
revisionReason = "RevisionFailed"
)
names := test.ResourceNames{
Service: test.ObjectNameForTest(t),
Expand Down Expand Up @@ -80,16 +81,12 @@ func TestResourceQuotaError(t *testing.T) {
err = v1test.WaitForServiceState(clients.ServingClient, names.Service, func(r *v1.Service) (bool, error) {
cond = r.Status.GetCondition(v1.ServiceConditionConfigurationsReady)
if cond != nil && !cond.IsUnknown() {
// Can fail with either a progress deadline exceeded error or an exceeded resource quota error
if strings.Contains(cond.Message, errorMsgScale) && cond.IsFalse() {
return true, nil
}
if strings.Contains(cond.Message, errorMsgQuota) && cond.IsFalse() {
if cond.Reason == errorReason && cond.IsFalse() {
return true, nil
}
t.Logf("Reason: %s ; Message: %s ; Status: %s", cond.Reason, cond.Message, cond.Status)
return true, fmt.Errorf("the service %s was not marked with expected error condition (Reason=%q, Message=%q, Status=%q), but with (Reason=%q, Message=%q, Status=%q)",
names.Config, errorReason, errorMsgScale, "False", cond.Reason, cond.Message, cond.Status)
names.Config, errorReason, "", "False", cond.Reason, cond.Message, cond.Status)
}
return false, nil
}, "ContainerUnscheduleable")
Expand All @@ -113,15 +110,19 @@ func TestResourceQuotaError(t *testing.T) {
err = v1test.CheckRevisionState(clients.ServingClient, revisionName, func(r *v1.Revision) (bool, error) {
cond := r.Status.GetCondition(v1.RevisionConditionReady)
if cond != nil {
// Can fail with either a progress deadline exceeded error or an exceeded resource quota error
if cond.Reason == revisionReason && strings.Contains(cond.Message, errorMsgScale) {
if strings.Contains(cond.Message, errorMsgQuota) && cond.IsFalse() {
return true, nil
}
if strings.Contains(cond.Message, errorMsgQuota) && cond.IsFalse() {
// Can fail with either a progress deadline exceeded error
if cond.Reason == progressDeadlineReason {
return true, nil
}
// wait for the container creation
if cond.Reason == waitReason {
return false, nil
}
return true, fmt.Errorf("the revision %s was not marked with expected error condition (Reason=%q, Message=%q), but with (Reason=%q, Message=%q)",
revisionName, revisionReason, errorMsgScale, cond.Reason, cond.Message)
revisionName, revisionReason, errorMsgQuota, cond.Reason, cond.Message)
}
return false, nil
})
Expand Down

0 comments on commit d638f49

Please sign in to comment.