Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pkg/apis/serving/v1/configuration_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ func (cs *ConfigurationStatus) MarkRevisionCreationFailed(message string) {
"Revision creation failed with message: %s.", message)
}

// MarkRevisionCreationRetrying marks the ConfigurationConditionReady condition
// as Unknown to indicate a transient error occurred during Revision creation
// and the reconciler will retry. Using Unknown (not False) prevents clients
// from treating retryable infrastructure blips as permanent failures.
func (cs *ConfigurationStatus) MarkRevisionCreationRetrying(message string) {
configCondSet.Manage(cs).MarkUnknown(
ConfigurationConditionReady,
"RevisionCreationRetrying",
"Revision creation failed transiently and will be retried: %s.", message)
}

// MarkLatestReadyDeleted marks the ConfigurationConditionReady condition to
// indicate that the Revision was deleted.
func (cs *ConfigurationStatus) MarkLatestReadyDeleted() {
Expand Down
40 changes: 31 additions & 9 deletions pkg/reconciler/configuration/configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
apierrs "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
Expand Down Expand Up @@ -65,9 +65,9 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, config *v1.Configuration

// First, fetch the revision that should exist for the current generation.
lcr, isBYOName, err := c.latestCreatedRevision(ctx, config)
if errors.IsNotFound(err) {
if apierrs.IsNotFound(err) {
lcr, err = c.createRevision(ctx, config)
if errors.IsAlreadyExists(err) {
if apierrs.IsAlreadyExists(err) {
// Newer revisions with a consistent naming scheme can theoretically hit this
// path during normal operation so we don't actually report any failures to
// the user.
Expand All @@ -76,11 +76,14 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, config *v1.Configuration
return fmt.Errorf("failed to create Revision: %w", err)
} else if err != nil {
recorder.Eventf(config, corev1.EventTypeWarning, "CreationFailed", "Failed to create Revision: %v", err)
config.Status.MarkRevisionCreationFailed(err.Error())

if isTransientCreateError(err) {
config.Status.MarkRevisionCreationRetrying(err.Error())
} else {
config.Status.MarkRevisionCreationFailed(err.Error())
}
return fmt.Errorf("failed to create Revision: %w", err)
}
} else if errors.IsAlreadyExists(err) {
} else if apierrs.IsAlreadyExists(err) {
// If we get an already-exists error from latestCreatedRevision it means
// that the Revision name already exists for another Configuration or at
// the wrong generation of this configuration.
Expand Down Expand Up @@ -239,10 +242,10 @@ func CheckNameAvailability(ctx context.Context, config *v1.Configuration, lister
if name == "" {
return nil, nil
}
errConflict := errors.NewAlreadyExists(v1.Resource("revisions"), name)
errConflict := apierrs.NewAlreadyExists(v1.Resource("revisions"), name)

rev, err := lister.Revisions(config.Namespace).Get(name)
if errors.IsNotFound(err) {
if apierrs.IsNotFound(err) {
// Does not exist, we must be good!
// note: for the name to change the generation must change.
return nil, err
Expand Down Expand Up @@ -293,7 +296,7 @@ func (c *Reconciler) latestCreatedRevision(ctx context.Context, config *v1.Confi
return list[0], false, nil
}

return nil, false, errors.NewNotFound(v1.Resource("revisions"), "revision for "+config.Name)
return nil, false, apierrs.NewNotFound(v1.Resource("revisions"), "revision for "+config.Name)
}

func (c *Reconciler) createRevision(ctx context.Context, config *v1.Configuration) (*v1.Revision, error) {
Expand All @@ -309,3 +312,22 @@ func (c *Reconciler) createRevision(ctx context.Context, config *v1.Configuratio

return created, nil
}

// isTransientCreateError returns true for errors that are likely transient
// (webhook timeouts, API server overload, rate limiting) and should result in
// a retry rather than a permanent RevisionFailed status. Only HTTP status codes
// whose semantics are definitively transient are included here.
//
// NOTE: IsInternalError (HTTP 500) is included because webhook infrastructure
// failures surface as InternalError (e.g. "failed calling webhook: context
// deadline exceeded"). A webhook that permanently rejects a spec typically
// returns Forbidden or Invalid, not InternalError. However, a webhook service
// that is permanently unreachable (deleted, wrong DNS) also surfaces as
// InternalError, so there is no perfect distinction at this layer.
func isTransientCreateError(err error) bool {
return apierrs.IsInternalError(err) ||
apierrs.IsServiceUnavailable(err) ||
apierrs.IsServerTimeout(err) ||
apierrs.IsTimeout(err) ||
apierrs.IsTooManyRequests(err)
}
Comment on lines +327 to +333
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isTransientCreateError treats any *url.Error as transient. url.Error is also used for non-transient failures (e.g., TLS/x509 validation errors, DNS misconfiguration, unsupported scheme), so this change can leave a Configuration stuck Unknown indefinitely instead of surfacing a persistent failure. Consider narrowing this to known-retriable network errors (e.g., underlying net.Error timeout/temporary, io.EOF/io.ErrUnexpectedEOF/connection reset) and letting other url.Error values continue to mark RevisionFailed (or adding an allowlist/denylist based on the wrapped error).

Copilot uses AI. Check for mistakes.
31 changes: 31 additions & 0 deletions pkg/reconciler/configuration/configuration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,37 @@ func TestReconcile(t *testing.T) {
Eventf(corev1.EventTypeWarning, "InternalError", "failed to create Revision: inducing failure for create revisions"),
},
Key: "foo/create-revision-failure",
}, {
Name: "transient failure creating revision (webhook timeout) - status set to Unknown not False",
Ctx: config.ToContext(context.Background(), config.FromContext(testCtx)),
// Webhook timeouts surface as InternalError; the reconciler should set
// Ready=Unknown/RevisionCreationRetrying so clients keep waiting rather
// than treating the blip as a permanent failure.
WantErr: true,
WithReactors: []clientgotesting.ReactionFunc{
func(action clientgotesting.Action) (bool, runtime.Object, error) {
if action.Matches("create", "revisions") {
return true, nil, apierrs.NewInternalError(errors.New("failed calling webhook: context deadline exceeded"))
}
return false, nil, nil
},
},
Objects: []runtime.Object{
cfg("transient-create-failure-internal", "foo", 99998),
},
WantCreates: []runtime.Object{
rev("transient-create-failure-internal", "foo", 99998),
},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: cfg("transient-create-failure-internal", "foo", 99998,
MarkRevisionCreationRetrying("Internal error occurred: failed calling webhook: context deadline exceeded"),
WithConfigObservedGen),
}},
WantEvents: []string{
Eventf(corev1.EventTypeWarning, "CreationFailed", "Failed to create Revision: Internal error occurred: failed calling webhook: context deadline exceeded"),
Eventf(corev1.EventTypeWarning, "InternalError", "failed to create Revision: Internal error occurred: failed calling webhook: context deadline exceeded"),
},
Key: "foo/transient-create-failure-internal",
}, {
Name: "failure updating configuration status",
Ctx: config.ToContext(context.Background(), config.FromContext(testCtx)),
Expand Down
7 changes: 7 additions & 0 deletions pkg/testing/v1/configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ func WithConfigObservedGen(cfg *v1.Configuration) {
cfg.Status.ObservedGeneration = cfg.Generation
}

// MarkRevisionCreationRetrying calls .Status.MarkRevisionCreationRetrying.
func MarkRevisionCreationRetrying(msg string) ConfigOption {
return func(cfg *v1.Configuration) {
cfg.Status.MarkRevisionCreationRetrying(msg)
}
}

// WithLatestCreated initializes the .status.latestCreatedRevisionName to be the name
// of the latest revision that the Configuration would have created.
func WithLatestCreated(name string) ConfigOption {
Expand Down
Loading