From 6ace089c0a48f003edd6e5daeab4fe1697ae2520 Mon Sep 17 00:00:00 2001 From: Ben Ash Date: Tue, 14 May 2024 15:40:28 +0000 Subject: [PATCH 1/4] WIP: Use exponential backoffs on secret source errors. Previously, the back off duration was based on a fixed duration + some jitter. This PR introduces exponential back offs for all secret syncing controllers. The back off will be calculated and honored whenever an error is encountered while fetching from a secret source e.g: Vault, HCPVS. --- chart/templates/_helpers.tpl | 24 ++++ chart/templates/deployment.yaml | 3 + chart/values.yaml | 16 +++ controllers/hcpvaultsecretsapp_controller.go | 14 ++- controllers/registry.go | 68 +++++++++++ controllers/registry_test.go | 115 +++++++++++++++++++ controllers/vaultdynamicsecret_controller.go | 18 ++- controllers/vaultpkisecret_controller.go | 12 +- controllers/vaultstaticsecret_controller.go | 14 ++- internal/options/env.go | 14 +++ internal/options/env_test.go | 17 +-- main.go | 49 +++++++- test/unit/deployment.bats | 62 ++++++++-- 13 files changed, 399 insertions(+), 27 deletions(-) diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl index ab4b3303..23a32267 100644 --- a/chart/templates/_helpers.tpl +++ b/chart/templates/_helpers.tpl @@ -175,3 +175,27 @@ globalTransformationOptions configures the manager's --global-transformation-opt {{- $opts | join "," -}} {{- end -}} {{- end -}} + + +{{/* +backOffOnSecretSourceError provides the back-off options for the manager when a +secret source error occurs. +*/}} +{{- define "vso.backOffOnSecretSourceError" -}} +{{- $opts := list -}} +{{- with .Values.controller.manager.backoffOnSecretSourceError -}} +{{- with .initialInterval -}} +{{- $opts = mustAppend $opts (printf "--back-off-initial-interval=%s" .) -}} +{{- end -}} +{{- with .maxInterval -}} +{{- $opts = mustAppend $opts (printf "--back-off-max-interval=%s" .) -}} +{{- end -}} +{{- with .multiplier -}} +{{- $opts = mustAppend $opts (printf "--back-off-multiplier=%.2f" (. | float64)) -}} +{{- end -}} +{{- with .randomizationFactor -}} +{{- $opts = mustAppend $opts (printf "--back-off-randomization-factor=%.2f" (. | float64)) -}} +{{- end -}} +{{- $opts | toYaml | nindent 8 -}} +{{- end -}} +{{- end -}} diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml index 1da02d9d..d7f3929d 100644 --- a/chart/templates/deployment.yaml +++ b/chart/templates/deployment.yaml @@ -78,6 +78,9 @@ spec: {{- if $opts }} - --global-transformation-options={{ $opts }} {{- end }} + {{- with include "vso.backOffOnSecretSourceError" . }} + {{- . -}} + {{- end }} {{- if .Values.controller.manager.extraArgs }} {{- toYaml .Values.controller.manager.extraArgs | nindent 8 }} {{- end }} diff --git a/chart/values.yaml b/chart/values.yaml index 5659007d..ec54cce2 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -113,6 +113,22 @@ controller: # in the destination K8s Secret. excludeRaw: false + # Backoff settings for the controller manager. These settings control the backoff behavior + # when the controller encounters an error while fetching secrets from the SecretSource. + backoffOnSecretSourceError: + # Initial interval between retries. + # @type: duration + initialInterval: "5s" + # Maximum interval between retries. + # @type: duration + maxInterval: "60s" + # Randomization factor to add jitter to the interval between retries. + # @type: float + randomizationFactor: 0.5 + # Sets the multiplier for increasing the interval between retries. + # @type: float + multiplier: 1.5 + # Configures the client cache which is used by the controller to cache (and potentially persist) vault tokens that # are the result of using the VaultAuthMethod. This enables re-use of Vault Tokens # throughout their TTLs as well as the ability to renew. diff --git a/controllers/hcpvaultsecretsapp_controller.go b/controllers/hcpvaultsecretsapp_controller.go index 20a4d94a..58798ce5 100644 --- a/controllers/hcpvaultsecretsapp_controller.go +++ b/controllers/hcpvaultsecretsapp_controller.go @@ -54,6 +54,7 @@ type HCPVaultSecretsAppReconciler struct { MinRefreshAfter time.Duration referenceCache ResourceReferenceCache GlobalTransformationOption *helpers.GlobalTransformationOption + BackOffRegistry *BackOffRegistry } //+kubebuilder:rbac:groups=secrets.hashicorp.com,resources=hcpvaultsecretsapps,verbs=get;list;watch;create;update;patch;delete @@ -126,9 +127,12 @@ func (r *HCPVaultSecretsAppReconciler) Reconcile(ctx context.Context, req ctrl.R resp, err := c.OpenAppSecrets(params, nil) if err != nil { logger.Error(err, "Get App Secret", "appName", o.Spec.AppName) + entry, _ := r.BackOffRegistry.Get(req.NamespacedName) return ctrl.Result{ - RequeueAfter: computeHorizonWithJitter(requeueDurationOnError), + RequeueAfter: entry.NextBackOff(), }, nil + } else { + r.BackOffRegistry.Delete(req.NamespacedName) } r.referenceCache.Set(SecretTransformation, req.NamespacedName, @@ -211,6 +215,10 @@ func (r *HCPVaultSecretsAppReconciler) updateStatus(ctx context.Context, o *secr // SetupWithManager sets up the controller with the Manager. func (r *HCPVaultSecretsAppReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Options) error { r.referenceCache = newResourceReferenceCache() + if r.BackOffRegistry == nil { + r.BackOffRegistry = NewBackOffRegistry() + } + return ctrl.NewControllerManagedBy(mgr). For(&secretsv1beta1.HCPVaultSecretsApp{}). WithEventFilter(syncableSecretPredicate(nil)). @@ -273,7 +281,9 @@ func (r *HCPVaultSecretsAppReconciler) hvsClient(ctx context.Context, o *secrets func (r *HCPVaultSecretsAppReconciler) handleDeletion(ctx context.Context, o client.Object) error { logger := log.FromContext(ctx) - r.referenceCache.Remove(SecretTransformation, client.ObjectKeyFromObject(o)) + objKey := client.ObjectKeyFromObject(o) + r.referenceCache.Remove(SecretTransformation, objKey) + r.BackOffRegistry.Delete(objKey) if controllerutil.ContainsFinalizer(o, hcpVaultSecretsAppFinalizer) { logger.Info("Removing finalizer") if controllerutil.RemoveFinalizer(o, hcpVaultSecretsAppFinalizer) { diff --git a/controllers/registry.go b/controllers/registry.go index f4c92e52..bb52fc5d 100644 --- a/controllers/registry.go +++ b/controllers/registry.go @@ -5,7 +5,9 @@ package controllers import ( "sync" + "time" + "github.com/cenkalti/backoff/v4" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -227,3 +229,69 @@ func (r *SyncRegistry) ObjectKeys() []client.ObjectKey { return result } + +// BackOffRegistry is a registry that stores sync backoff for a client.Object. +type BackOffRegistry struct { + m map[client.ObjectKey]*BackOff + mu sync.RWMutex + opts []backoff.ExponentialBackOffOpts +} + +// Delete objKey to the set of registered objects. +func (r *BackOffRegistry) Delete(objKey client.ObjectKey) bool { + r.mu.Lock() + defer r.mu.Unlock() + + _, ok := r.m[objKey] + delete(r.m, objKey) + return ok +} + +// Get is a getter/setter that returns the BackOff for objKey. +// If objKey is not in the set of registered objects, it will be added. Return +// true if the sync backoff entry was created. +func (r *BackOffRegistry) Get(objKey client.ObjectKey) (*BackOff, bool) { + r.mu.RLock() + defer r.mu.RUnlock() + + entry, ok := r.m[objKey] + if !ok { + entry = &BackOff{ + bo: backoff.NewExponentialBackOff(r.opts...), + } + r.m[objKey] = entry + } + + return entry, !ok +} + +// BackOff is a wrapper around backoff.BackOff that does not implement +// BackOff.Reset, since elements in BackOffRegistry are meant to be ephemeral. +type BackOff struct { + bo backoff.BackOff +} + +// NextBackOff returns the next backoff duration. +func (s *BackOff) NextBackOff() time.Duration { + return s.bo.NextBackOff() +} + +// DefaultExponentialBackOffOpts returns the default exponential options for the +func DefaultExponentialBackOffOpts() []backoff.ExponentialBackOffOpts { + return []backoff.ExponentialBackOffOpts{ + backoff.WithInitialInterval(requeueDurationOnError), + backoff.WithMaxInterval(time.Second * 60), + } +} + +// NewBackOffRegistry returns a BackOffRegistry. +func NewBackOffRegistry(opts ...backoff.ExponentialBackOffOpts) *BackOffRegistry { + if len(opts) == 0 { + opts = DefaultExponentialBackOffOpts() + } + + return &BackOffRegistry{ + m: map[client.ObjectKey]*BackOff{}, + opts: opts, + } +} diff --git a/controllers/registry_test.go b/controllers/registry_test.go index f79c3f59..454b38ed 100644 --- a/controllers/registry_test.go +++ b/controllers/registry_test.go @@ -6,7 +6,9 @@ package controllers import ( "sync" "testing" + "time" + "github.com/cenkalti/backoff/v4" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "sigs.k8s.io/controller-runtime/pkg/client" @@ -503,3 +505,116 @@ func TestSyncRegistry(t *testing.T) { }) } } + +func TestBackOffRegistry_Get(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + m map[client.ObjectKey]*BackOff + opts []backoff.ExponentialBackOffOpts + objKey client.ObjectKey + want *BackOff + want1 bool + }{ + { + name: "new", + m: map[client.ObjectKey]*BackOff{}, + objKey: client.ObjectKey{ + Namespace: "foo", + Name: "bar", + }, + want: &BackOff{ + bo: backoff.NewExponentialBackOff( + DefaultExponentialBackOffOpts()..., + ), + }, + want1: true, + }, + { + name: "previous", + m: map[client.ObjectKey]*BackOff{ + { + Namespace: "foo", + Name: "bar", + }: { + bo: backoff.NewExponentialBackOff( + DefaultExponentialBackOffOpts()..., + ), + }, + }, + objKey: client.ObjectKey{ + Namespace: "foo", + Name: "bar", + }, + want: &BackOff{ + bo: backoff.NewExponentialBackOff( + DefaultExponentialBackOffOpts()..., + ), + }, + want1: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := &BackOffRegistry{ + m: tt.m, + opts: tt.opts, + } + got, got1 := r.Get(tt.objKey) + assert.NotNilf(t, got, "Get(%v)", tt.objKey) + assert.Equalf(t, tt.want1, got1, "Get(%v)", tt.objKey) + assert.Greaterf(t, got.bo.NextBackOff(), time.Duration(0), "Get(%v)", tt.objKey) + }) + } +} + +func TestBackOffRegistry_Delete(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + m map[client.ObjectKey]*BackOff + opts []backoff.ExponentialBackOffOpts + objKey client.ObjectKey + want bool + }{ + { + name: "not-found", + m: map[client.ObjectKey]*BackOff{}, + objKey: client.ObjectKey{ + Namespace: "foo", + Name: "bar", + }, + want: false, + }, + { + name: "deleted", + m: map[client.ObjectKey]*BackOff{ + { + Namespace: "foo", + Name: "bar", + }: { + bo: backoff.NewExponentialBackOff( + DefaultExponentialBackOffOpts()..., + ), + }, + }, + objKey: client.ObjectKey{ + Namespace: "foo", + Name: "bar", + }, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := &BackOffRegistry{ + m: tt.m, + opts: tt.opts, + } + got := r.Delete(tt.objKey) + assert.Equalf(t, tt.want, got, "Delete(%v)", tt.objKey) + }) + } +} diff --git a/controllers/vaultdynamicsecret_controller.go b/controllers/vaultdynamicsecret_controller.go index 635ea528..24fae66c 100644 --- a/controllers/vaultdynamicsecret_controller.go +++ b/controllers/vaultdynamicsecret_controller.go @@ -59,6 +59,7 @@ type VaultDynamicSecretReconciler struct { ClientFactory vault.ClientFactory HMACValidator helpers.HMACValidator SyncRegistry *SyncRegistry + BackOffRegistry *BackOffRegistry referenceCache ResourceReferenceCache GlobalTransformationOption *helpers.GlobalTransformationOption // sourceCh is used to trigger a requeue of resource instances from an @@ -158,7 +159,10 @@ func (r *VaultDynamicSecretReconciler) Reconcile(ctx context.Context, req ctrl.R // indicates that the resource has been added to the SyncRegistry // and must be synced. case r.SyncRegistry.Has(req.NamespacedName): + // indicates that the resource has been added to the SyncRegistry + // and must be synced. syncReason = "force sync" + syncReason = "last sync error" // indicates that the resource has been updated since the last sync. case o.GetGeneration() != o.Status.LastGeneration: syncReason = "resource updated" @@ -269,15 +273,16 @@ func (r *VaultDynamicSecretReconciler) Reconcile(ctx context.Context, req ctrl.R // sync the secret secretLease, staticCredsUpdated, err := r.syncSecret(ctx, vClient, o, transOption) if err != nil { - r.Recorder.Eventf(o, corev1.EventTypeWarning, consts.ReasonSecretSyncError, - "Failed to sync secret: %s", err) - _, jitter := computeMaxJitterWithPercent(requeueDurationOnError, 0.5) - horizon := requeueDurationOnError + time.Duration(jitter) + r.SyncRegistry.Add(req.NamespacedName) + entry, _ := r.BackOffRegistry.Get(req.NamespacedName) + horizon := entry.NextBackOff() r.Recorder.Eventf(o, corev1.EventTypeWarning, consts.ReasonSecretSyncError, "Failed to sync the secret, horizon=%s, err=%s", horizon, err) return ctrl.Result{ RequeueAfter: horizon, }, nil + } else { + r.BackOffRegistry.Delete(req.NamespacedName) } doRolloutRestart := (doSync && o.Status.LastGeneration > 1) || staticCredsUpdated @@ -492,6 +497,10 @@ func (r *VaultDynamicSecretReconciler) renewLease( // SetupWithManager sets up the controller with the Manager. func (r *VaultDynamicSecretReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Options) error { r.referenceCache = newResourceReferenceCache() + if r.BackOffRegistry == nil { + r.BackOffRegistry = NewBackOffRegistry() + } + r.ClientFactory.RegisterClientCallbackHandler( vault.ClientCallbackHandler{ On: vault.ClientCallbackOnLifetimeWatcherDone, @@ -552,6 +561,7 @@ func (r *VaultDynamicSecretReconciler) handleDeletion(ctx context.Context, o *se objKey := client.ObjectKeyFromObject(o) r.SyncRegistry.Delete(objKey) + r.BackOffRegistry.Delete(objKey) r.referenceCache.Remove(SecretTransformation, objKey) if controllerutil.ContainsFinalizer(o, vaultDynamicSecretFinalizer) { logger.Info("Removing finalizer") diff --git a/controllers/vaultpkisecret_controller.go b/controllers/vaultpkisecret_controller.go index d64f818f..d79cddca 100644 --- a/controllers/vaultpkisecret_controller.go +++ b/controllers/vaultpkisecret_controller.go @@ -42,6 +42,7 @@ type VaultPKISecretReconciler struct { HMACValidator helpers.HMACValidator Recorder record.EventRecorder SyncRegistry *SyncRegistry + BackOffRegistry *BackOffRegistry referenceCache ResourceReferenceCache GlobalTransformationOption *helpers.GlobalTransformationOption } @@ -183,9 +184,14 @@ func (r *VaultPKISecretReconciler) Reconcile(ctx context.Context, req ctrl.Reque if err := r.updateStatus(ctx, o); err != nil { return ctrl.Result{}, err } + + r.SyncRegistry.Add(req.NamespacedName) + entry, _ := r.BackOffRegistry.Get(req.NamespacedName) return ctrl.Result{ - RequeueAfter: computeHorizonWithJitter(requeueDurationOnError), + RequeueAfter: entry.NextBackOff(), }, nil + } else { + r.BackOffRegistry.Delete(req.NamespacedName) } certResp, err := vault.UnmarshalPKIIssueResponse(resp.Secret()) @@ -314,6 +320,7 @@ func (r *VaultPKISecretReconciler) Reconcile(ctx context.Context, req ctrl.Reque func (r *VaultPKISecretReconciler) handleDeletion(ctx context.Context, o *secretsv1beta1.VaultPKISecret) error { objKey := client.ObjectKeyFromObject(o) r.SyncRegistry.Delete(objKey) + r.BackOffRegistry.Delete(objKey) r.referenceCache.Remove(SecretTransformation, objKey) finalizerSet := controllerutil.ContainsFinalizer(o, vaultPKIFinalizer) @@ -341,6 +348,9 @@ func (r *VaultPKISecretReconciler) handleDeletion(ctx context.Context, o *secret func (r *VaultPKISecretReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Options) error { r.referenceCache = newResourceReferenceCache() + if r.BackOffRegistry == nil { + r.BackOffRegistry = NewBackOffRegistry() + } return ctrl.NewControllerManagedBy(mgr). For(&secretsv1beta1.VaultPKISecret{}). WithEventFilter(syncableSecretPredicate(r.SyncRegistry)). diff --git a/controllers/vaultstaticsecret_controller.go b/controllers/vaultstaticsecret_controller.go index 4cb6a711..a9c4f294 100644 --- a/controllers/vaultstaticsecret_controller.go +++ b/controllers/vaultstaticsecret_controller.go @@ -38,6 +38,7 @@ type VaultStaticSecretReconciler struct { HMACValidator helpers.HMACValidator referenceCache ResourceReferenceCache GlobalTransformationOption *helpers.GlobalTransformationOption + BackOffRegistry *BackOffRegistry } //+kubebuilder:rbac:groups=secrets.hashicorp.com,resources=vaultstaticsecrets,verbs=get;list;watch;create;update;patch;delete @@ -109,9 +110,12 @@ func (r *VaultStaticSecretReconciler) Reconcile(ctx context.Context, req ctrl.Re resp, err := c.Read(ctx, kvReq) if err != nil { + entry, _ := r.BackOffRegistry.Get(req.NamespacedName) r.Recorder.Eventf(o, corev1.EventTypeWarning, consts.ReasonVaultClientError, "Failed to read Vault secret: %s", err) - return ctrl.Result{RequeueAfter: computeHorizonWithJitter(requeueDurationOnError)}, nil + return ctrl.Result{RequeueAfter: entry.NextBackOff()}, nil + } else { + r.BackOffRegistry.Delete(req.NamespacedName) } data, err := r.SecretDataBuilder.WithVaultData(resp.Data(), resp.Secret().Data, transOption) @@ -194,7 +198,9 @@ func (r *VaultStaticSecretReconciler) updateStatus(ctx context.Context, o *secre func (r *VaultStaticSecretReconciler) handleDeletion(ctx context.Context, o client.Object) error { logger := log.FromContext(ctx) - r.referenceCache.Remove(SecretTransformation, client.ObjectKeyFromObject(o)) + objKey := client.ObjectKeyFromObject(o) + r.referenceCache.Remove(SecretTransformation, objKey) + r.BackOffRegistry.Delete(objKey) if controllerutil.ContainsFinalizer(o, vaultStaticSecretFinalizer) { logger.Info("Removing finalizer") if controllerutil.RemoveFinalizer(o, vaultStaticSecretFinalizer) { @@ -210,6 +216,10 @@ func (r *VaultStaticSecretReconciler) handleDeletion(ctx context.Context, o clie func (r *VaultStaticSecretReconciler) SetupWithManager(mgr ctrl.Manager, opts controller.Options) error { r.referenceCache = newResourceReferenceCache() + if r.BackOffRegistry == nil { + r.BackOffRegistry = NewBackOffRegistry() + } + return ctrl.NewControllerManagedBy(mgr). For(&secretsv1beta1.VaultStaticSecret{}). WithEventFilter(syncableSecretPredicate(nil)). diff --git a/internal/options/env.go b/internal/options/env.go index 33a0cb5d..74005e9d 100644 --- a/internal/options/env.go +++ b/internal/options/env.go @@ -4,6 +4,8 @@ package options import ( + "time" + "github.com/kelseyhightower/envconfig" ) @@ -26,6 +28,18 @@ type VSOEnvOptions struct { // GlobalTransformationOptions is VSO_GLOBAL_TRANSFORMATION_OPTIONS environment variable option GlobalTransformationOptions string `split_words:"true"` + + // BackOffInitialInterval is VSO_BACK_OFF_INITIAL_INTERVAL environment variable option + BackOffInitialInterval time.Duration `split_words:"true"` + + // BackOffMaxInterval is VSO_BACK_OFF_MAX_INTERVAL environment variable option + BackOffMaxInterval time.Duration `split_words:"true"` + + // BackOffRandomizationFactor is VSO_BACK_OFF_RANDOMIZATION_FACTOR environment variable option + BackOffRandomizationFactor float64 `split_words:"true"` + + // BackOffMultiplier is VSO_BACK_OFF_MULTIPLIER environment variable option + BackOffMultiplier float64 `split_words:"true"` } // Parse environment variable options, prefixed with "VSO_" diff --git a/internal/options/env_test.go b/internal/options/env_test.go index 19805ff5..f822fcff 100644 --- a/internal/options/env_test.go +++ b/internal/options/env_test.go @@ -4,8 +4,8 @@ package options import ( - "os" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -26,24 +26,27 @@ func TestParse(t *testing.T) { "VSO_CLIENT_CACHE_SIZE": "100", "VSO_CLIENT_CACHE_PERSISTENCE_MODEL": "memory", "VSO_MAX_CONCURRENT_RECONCILES": "10", + "VSO_BACK_OFF_INITIAL_INTERVAL": "1s", + "VSO_BACK_OFF_MAX_INTERVAL": "60s", + "VSO_BACK_OFF_RANDOMIZATION_FACTOR": "0.5", + "VSO_BACK_OFF_MULTIPLIER": "2.5", }, wantOptions: VSOEnvOptions{ OutputFormat: "json", ClientCacheSize: makeInt(t, 100), ClientCachePersistenceModel: "memory", MaxConcurrentReconciles: makeInt(t, 10), + BackOffInitialInterval: time.Second * 1, + BackOffMaxInterval: time.Second * 60, + BackOffRandomizationFactor: 0.5, + BackOffMultiplier: 2.5, }, }, } for name, tt := range tests { t.Run(name, func(t *testing.T) { - defer func() { - for env := range tt.envs { - require.NoError(t, os.Unsetenv(env)) - } - }() for env, val := range tt.envs { - require.NoError(t, os.Setenv(env, val)) + t.Setenv(env, val) } gotOptions := VSOEnvOptions{} diff --git a/main.go b/main.go index 786f5249..326d49cd 100644 --- a/main.go +++ b/main.go @@ -14,6 +14,7 @@ import ( "time" argorolloutsv1alpha1 "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + "github.com/cenkalti/backoff/v4" "gopkg.in/yaml.v3" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -82,6 +83,10 @@ func main() { var uninstall bool var preDeleteHookTimeoutSeconds int var minRefreshAfterHVSA time.Duration + var backOffInitialInterval time.Duration + var backOffMaxInterval time.Duration + var backOffRandomizationFactor float64 + var backOffMultiplier float64 var globalTransformationOpts string // command-line args and flags @@ -114,8 +119,26 @@ func main() { "Minimum duration between HCPVaultSecretsApp resource reconciliation.") flag.StringVar(&globalTransformationOpts, "global-transformation-options", "", fmt.Sprintf("Set global secret transformation options as a comma delimited string. "+ - "Also set from environment variable VSO_GLOBAL_TRANSFORMATION_OPTIONS."+ + "Also set from environment variable VSO_GLOBAL_TRANSFORMATION_OPTIONS. "+ "Valid values are: %v", []string{"exclude-raw"})) + flag.DurationVar(&backOffInitialInterval, "back-off-initial-interval", time.Second*5, + "Initial interval between retries on secret source errors. "+ + "All errors are tried using an exponential backoff strategy. "+ + "Also set from environment variable VSO_BACK_OFF_INITIAL_INTERVAL.") + flag.DurationVar(&backOffMaxInterval, "back-off-max-interval", time.Second*60, + "Maximum interval between retries on secret source errors. "+ + "All errors are tried using an exponential backoff strategy. "+ + "Also set from environment variable VSO_BACK_OFF_MAX_INTERVAL.") + flag.Float64Var(&backOffRandomizationFactor, "back-off-randomization-factor", + backoff.DefaultRandomizationFactor, + "Sets the randomization factor to add jitter to the interval between retries on secret "+ + "source errors. All errors are tried using an exponential backoff strategy. "+ + "Also set from environment variable VSO_BACK_OFF_RANDOMIZATION_FACTOR.") + flag.Float64Var(&backOffMultiplier, "back-off-multiplier", + backoff.DefaultMultiplier, + "Sets the multiplier for increasing the interval between retries on secret source errors. "+ + "All errors are tried using an exponential backoff strategy. "+ + "Also set from environment variable VSO_BACK_OFF_MULTIPLIER.") opts := zap.Options{ Development: true, } @@ -144,7 +167,18 @@ func main() { if vsoEnvOptions.GlobalTransformationOptions != "" { globalTransformationOpts = vsoEnvOptions.GlobalTransformationOptions } - + if vsoEnvOptions.BackOffInitialInterval != 0 { + backOffInitialInterval = vsoEnvOptions.BackOffInitialInterval + } + if vsoEnvOptions.BackOffMaxInterval != 0 { + backOffMaxInterval = vsoEnvOptions.BackOffMaxInterval + } + if vsoEnvOptions.BackOffRandomizationFactor != 0 { + backOffRandomizationFactor = vsoEnvOptions.BackOffRandomizationFactor + } + if vsoEnvOptions.BackOffMultiplier != 0 { + backOffMultiplier = vsoEnvOptions.BackOffMultiplier + } // versionInfo is used when setting up the buildInfo metric below versionInfo := version.Version() if printVersion { @@ -177,6 +211,13 @@ func main() { } ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + backOffOpts := []backoff.ExponentialBackOffOpts{ + backoff.WithInitialInterval(backOffInitialInterval), + backoff.WithMaxInterval(backOffMaxInterval), + backoff.WithRandomizationFactor(backOffRandomizationFactor), + backoff.WithMultiplier(backOffMultiplier), + } + globalTransOpt := &helpers.GlobalTransformationOption{} if globalTransformationOpts != "" { for _, v := range strings.Split(globalTransformationOpts, ",") { @@ -289,6 +330,7 @@ func main() { SecretDataBuilder: secretDataBuilder, HMACValidator: hmacValidator, ClientFactory: clientFactory, + BackOffRegistry: controllers.NewBackOffRegistry(backOffOpts...), GlobalTransformationOption: globalTransOpt, }).SetupWithManager(mgr, controllerOptions); err != nil { setupLog.Error(err, "Unable to create controller", "controller", "VaultStaticSecret") @@ -301,6 +343,7 @@ func main() { HMACValidator: hmacValidator, SyncRegistry: controllers.NewSyncRegistry(), Recorder: mgr.GetEventRecorderFor("VaultPKISecret"), + BackOffRegistry: controllers.NewBackOffRegistry(backOffOpts...), GlobalTransformationOption: globalTransOpt, }).SetupWithManager(mgr, controllerOptions); err != nil { setupLog.Error(err, "Unable to create controller", "controller", "VaultPKISecret") @@ -343,6 +386,7 @@ func main() { ClientFactory: clientFactory, HMACValidator: hmacValidator, SyncRegistry: controllers.NewSyncRegistry(), + BackOffRegistry: controllers.NewBackOffRegistry(backOffOpts...), GlobalTransformationOption: globalTransOpt, } if err = vdsReconciler.SetupWithManager(mgr, vdsOverrideOpts); err != nil { @@ -369,6 +413,7 @@ func main() { SecretDataBuilder: secretDataBuilder, HMACValidator: hmacValidator, MinRefreshAfter: minRefreshAfterHVSA, + BackOffRegistry: controllers.NewBackOffRegistry(backOffOpts...), GlobalTransformationOption: globalTransOpt, }).SetupWithManager(mgr, controllerOptions); err != nil { setupLog.Error(err, "unable to create controller", "controller", "HCPVaultSecretsApp") diff --git a/test/unit/deployment.bats b/test/unit/deployment.bats index 78baa874..4844031c 100755 --- a/test/unit/deployment.bats +++ b/test/unit/deployment.bats @@ -670,7 +670,7 @@ load _helpers yq 'select(.kind == "Deployment" and .metadata.labels."control-plane" == "controller-manager") | .spec.template.spec.containers[] | select(.name == "manager") | .args' | tee /dev/stderr) local actual=$(echo "$object" | yq '. | length' | tee /dev/stderr) - [ "${actual}" = "3" ] + [ "${actual}" = "7" ] } # @@ -684,11 +684,11 @@ load _helpers yq 'select(.kind == "Deployment" and .metadata.labels."control-plane" == "controller-manager") | .spec.template.spec.containers[] | select(.name == "manager") | .args' | tee /dev/stderr) local actual=$(echo "$object" | yq '. | length' | tee /dev/stderr) - [ "${actual}" = "5" ] + [ "${actual}" = "9" ] - local actual=$(echo "$object" | yq '.[3]' | tee /dev/stderr) + local actual=$(echo "$object" | yq '.[7]' | tee /dev/stderr) [ "${actual}" = "--foo=baz" ] - local actual=$(echo "$object" | yq '.[4]' | tee /dev/stderr) + local actual=$(echo "$object" | yq '.[8]' | tee /dev/stderr) [ "${actual}" = "--bar=qux" ] } @@ -750,7 +750,7 @@ load _helpers yq 'select(.kind == "Deployment" and .metadata.labels."control-plane" == "controller-manager") | .spec.template.spec.containers[] | select(.name == "manager") | .args' | tee /dev/stderr) local actual=$(echo "$object" | yq '. | length' | tee /dev/stderr) - [ "${actual}" = "3" ] + [ "${actual}" = "7" ] } @test "controller/Deployment: with globalTransformationOptions.excludeRaw" { @@ -762,7 +762,7 @@ load _helpers yq 'select(.kind == "Deployment" and .metadata.labels."control-plane" == "controller-manager") | .spec.template.spec.containers[] | select(.name == "manager") | .args' | tee /dev/stderr) local actual=$(echo "$object" | yq '. | length' | tee /dev/stderr) - [ "${actual}" = "4" ] + [ "${actual}" = "8" ] local actual=$(echo "$object" | yq '.[3]' | tee /dev/stderr) [ "${actual}" = "--global-transformation-options=exclude-raw" ] @@ -778,16 +778,60 @@ load _helpers yq 'select(.kind == "Deployment" and .metadata.labels."control-plane" == "controller-manager") | .spec.template.spec.containers[] | select(.name == "manager") | .args' | tee /dev/stderr) local actual=$(echo "$object" | yq '. | length' | tee /dev/stderr) - [ "${actual}" = "6" ] + [ "${actual}" = "10" ] local actual=$(echo "$object" | yq '.[3]' | tee /dev/stderr) [ "${actual}" = "--global-transformation-options=exclude-raw" ] - local actual=$(echo "$object" | yq '.[4]' | tee /dev/stderr) + local actual=$(echo "$object" | yq '.[8]' | tee /dev/stderr) [ "${actual}" = "--foo=baz" ] - local actual=$(echo "$object" | yq '.[5]' | tee /dev/stderr) + local actual=$(echo "$object" | yq '.[9]' | tee /dev/stderr) [ "${actual}" = "--bar=qux" ] } +@test "controller/Deployment: with backOffOnSecretSourceError defaults" { + cd `chart_dir` + local object=$(helm template \ + -s templates/deployment.yaml \ + . | tee /dev/stderr | + yq 'select(.kind == "Deployment" and .metadata.labels."control-plane" == "controller-manager") | .spec.template.spec.containers[] | select(.name == "manager") | .args' | tee /dev/stderr) + + local actual=$(echo "$object" | yq '. | length' | tee /dev/stderr) + [ "${actual}" = "7" ] + + local actual=$(echo "$object" | yq '.[3]' | tee /dev/stderr) + [ "${actual}" = "--back-off-initial-interval=5s" ] + local actual=$(echo "$object" | yq '.[4]' | tee /dev/stderr) + [ "${actual}" = "--back-off-max-interval=60s" ] + local actual=$(echo "$object" | yq '.[5]' | tee /dev/stderr) + [ "${actual}" = "--back-off-multiplier=1.50" ] + local actual=$(echo "$object" | yq '.[6]' | tee /dev/stderr) + [ "${actual}" = "--back-off-randomization-factor=0.50" ] +} + +@test "controller/Deployment: with backOffOnSecretSourceError set" { + cd `chart_dir` + local object=$(helm template \ + -s templates/deployment.yaml \ + --set 'controller.manager.backoffOnSecretSourceError.initialInterval=30s' \ + --set 'controller.manager.backoffOnSecretSourceError.maxInterval=300s' \ + --set 'controller.manager.backoffOnSecretSourceError.multiplier=2.5' \ + --set 'controller.manager.backoffOnSecretSourceError.randomizationFactor=3.7361' \ + . | tee /dev/stderr | + yq 'select(.kind == "Deployment" and .metadata.labels."control-plane" == "controller-manager") | .spec.template.spec.containers[] | select(.name == "manager") | .args' | tee /dev/stderr) + + local actual=$(echo "$object" | yq '. | length' | tee /dev/stderr) + [ "${actual}" = "7" ] + + local actual=$(echo "$object" | yq '.[3]' | tee /dev/stderr) + [ "${actual}" = "--back-off-initial-interval=30s" ] + local actual=$(echo "$object" | yq '.[4]' | tee /dev/stderr) + [ "${actual}" = "--back-off-max-interval=300s" ] + local actual=$(echo "$object" | yq '.[5]' | tee /dev/stderr) + [ "${actual}" = "--back-off-multiplier=2.50" ] + local actual=$(echo "$object" | yq '.[6]' | tee /dev/stderr) + [ "${actual}" = "--back-off-randomization-factor=3.74" ] +} + #-------------------------------------------------------------------- # image.pullPolicy @test "controller/Deployment: imagePullPolicy default" { From 6d99abfad47c25efd4842612006251becd803ac1 Mon Sep 17 00:00:00 2001 From: Ben Ash Date: Wed, 15 May 2024 15:29:43 -0400 Subject: [PATCH 2/4] Post review updates --- controllers/vaultdynamicsecret_controller.go | 1 - 1 file changed, 1 deletion(-) diff --git a/controllers/vaultdynamicsecret_controller.go b/controllers/vaultdynamicsecret_controller.go index 24fae66c..d7bebcde 100644 --- a/controllers/vaultdynamicsecret_controller.go +++ b/controllers/vaultdynamicsecret_controller.go @@ -162,7 +162,6 @@ func (r *VaultDynamicSecretReconciler) Reconcile(ctx context.Context, req ctrl.R // indicates that the resource has been added to the SyncRegistry // and must be synced. syncReason = "force sync" - syncReason = "last sync error" // indicates that the resource has been updated since the last sync. case o.GetGeneration() != o.Status.LastGeneration: syncReason = "resource updated" From 15e14c91bacf2ebdcfc5ad3888975587fe0287c3 Mon Sep 17 00:00:00 2001 From: Ben Ash Date: Wed, 15 May 2024 20:53:27 -0400 Subject: [PATCH 3/4] Post review updates - add new metric runtime configuration gauge --- chart/templates/_helpers.tpl | 2 +- chart/values.yaml | 2 +- controllers/registry_test.go | 4 +++- main.go | 27 +++++++++++++++++++++++++++ test/unit/deployment.bats | 8 ++++---- 5 files changed, 36 insertions(+), 7 deletions(-) diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl index 23a32267..d066e58a 100644 --- a/chart/templates/_helpers.tpl +++ b/chart/templates/_helpers.tpl @@ -183,7 +183,7 @@ secret source error occurs. */}} {{- define "vso.backOffOnSecretSourceError" -}} {{- $opts := list -}} -{{- with .Values.controller.manager.backoffOnSecretSourceError -}} +{{- with .Values.controller.manager.backOffOnSecretSourceError -}} {{- with .initialInterval -}} {{- $opts = mustAppend $opts (printf "--back-off-initial-interval=%s" .) -}} {{- end -}} diff --git a/chart/values.yaml b/chart/values.yaml index ec54cce2..48ea3822 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -115,7 +115,7 @@ controller: # Backoff settings for the controller manager. These settings control the backoff behavior # when the controller encounters an error while fetching secrets from the SecretSource. - backoffOnSecretSourceError: + backOffOnSecretSourceError: # Initial interval between retries. # @type: duration initialInterval: "5s" diff --git a/controllers/registry_test.go b/controllers/registry_test.go index 454b38ed..7b6ebfa1 100644 --- a/controllers/registry_test.go +++ b/controllers/registry_test.go @@ -564,7 +564,9 @@ func TestBackOffRegistry_Get(t *testing.T) { got, got1 := r.Get(tt.objKey) assert.NotNilf(t, got, "Get(%v)", tt.objKey) assert.Equalf(t, tt.want1, got1, "Get(%v)", tt.objKey) - assert.Greaterf(t, got.bo.NextBackOff(), time.Duration(0), "Get(%v)", tt.objKey) + last := got.bo.NextBackOff() + assert.Greaterf(t, last, time.Duration(0), "Get(%v)", tt.objKey) + assert.Greaterf(t, got.bo.NextBackOff(), last, "Get(%v)", tt.objKey) }) } } diff --git a/main.go b/main.go index 052cca2f..8e8b2434 100644 --- a/main.go +++ b/main.go @@ -10,11 +10,13 @@ import ( "flag" "fmt" "os" + "strconv" "strings" "time" argorolloutsv1alpha1 "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/cenkalti/backoff/v4" + "github.com/prometheus/client_golang/prometheus" "gopkg.in/yaml.v3" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -268,6 +270,27 @@ func main() { metrics.NewBuildInfoGauge(versionInfo), ) vclient.MustRegisterClientMetrics(cfc.MetricsRegistry) + + metric := prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: "runtime", + Name: "config", + Help: "Vault Secrets Operator runtime config.", + ConstLabels: map[string]string{ + "backOffInitialInterval": backOffInitialInterval.String(), + "backOffMaxInterval": backOffMaxInterval.String(), + "backOffMultiplier": fmt.Sprintf("%.2f", backOffMultiplier), + "backOffRandomizationFactor": fmt.Sprintf("%.2f", backOffRandomizationFactor), + "clientCachePersistenceModel": clientCachePersistenceModel, + "clientCacheSize": strconv.Itoa(cfc.ClientCacheSize), + "globalTransformationOptions": globalTransformationOpts, + "maxConcurrentReconciles": strconv.Itoa(controllerOptions.MaxConcurrentReconciles), + }, + }, + ) + metric.Set(1) + cfc.MetricsRegistry.MustRegister(metric) } mgr, err := ctrl.NewManager(config, ctrl.Options{ @@ -442,6 +465,10 @@ func main() { setupLog.Info("Starting manager", "clientCachePersistenceModel", clientCachePersistenceModel, "clientCacheSize", cfc.ClientCacheSize, + "backOffMultiplier", backOffMultiplier, + "backOffMaxInterval", backOffMaxInterval, + "backOffInitialInterval", backOffInitialInterval, + "backOffRandomizationFactor", backOffRandomizationFactor, ) mgr.GetCache() diff --git a/test/unit/deployment.bats b/test/unit/deployment.bats index 4844031c..c510bb25 100755 --- a/test/unit/deployment.bats +++ b/test/unit/deployment.bats @@ -812,10 +812,10 @@ load _helpers cd `chart_dir` local object=$(helm template \ -s templates/deployment.yaml \ - --set 'controller.manager.backoffOnSecretSourceError.initialInterval=30s' \ - --set 'controller.manager.backoffOnSecretSourceError.maxInterval=300s' \ - --set 'controller.manager.backoffOnSecretSourceError.multiplier=2.5' \ - --set 'controller.manager.backoffOnSecretSourceError.randomizationFactor=3.7361' \ + --set 'controller.manager.backOffOnSecretSourceError.initialInterval=30s' \ + --set 'controller.manager.backOffOnSecretSourceError.maxInterval=300s' \ + --set 'controller.manager.backOffOnSecretSourceError.multiplier=2.5' \ + --set 'controller.manager.backOffOnSecretSourceError.randomizationFactor=3.7361' \ . | tee /dev/stderr | yq 'select(.kind == "Deployment" and .metadata.labels."control-plane" == "controller-manager") | .spec.template.spec.containers[] | select(.name == "manager") | .args' | tee /dev/stderr) From d10ffde439745a2ad2b55d1170e7cce99c10440a Mon Sep 17 00:00:00 2001 From: Ben Ash Date: Wed, 15 May 2024 21:11:18 -0400 Subject: [PATCH 4/4] Unify sync reasons between VDS and VPS controllers --- controllers/vaultdynamicsecret_controller.go | 14 +++++++------- controllers/vaultpkisecret_controller.go | 2 +- internal/consts/reasons.go | 4 +++- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/controllers/vaultdynamicsecret_controller.go b/controllers/vaultdynamicsecret_controller.go index d7bebcde..9dbadcf5 100644 --- a/controllers/vaultdynamicsecret_controller.go +++ b/controllers/vaultdynamicsecret_controller.go @@ -155,27 +155,27 @@ func (r *VaultDynamicSecretReconciler) Reconcile(ctx context.Context, req ctrl.R switch { // indicates that the resource has not been synced yet. case o.Status.LastGeneration == 0: - syncReason = "initial sync" + syncReason = consts.ReasonInitialSync // indicates that the resource has been added to the SyncRegistry // and must be synced. case r.SyncRegistry.Has(req.NamespacedName): // indicates that the resource has been added to the SyncRegistry // and must be synced. - syncReason = "force sync" + syncReason = consts.ReasonForceSync // indicates that the resource has been updated since the last sync. case o.GetGeneration() != o.Status.LastGeneration: - syncReason = "resource updated" + syncReason = consts.ReasonResourceUpdated // indicates that the destination secret does not exist and the resource is configured to create it. case o.Spec.Destination.Create && !destExists: - syncReason = "destination secret does not exist and create=true" + syncReason = consts.ReasonInexistentDestination // indicates that the cache key has changed since the last sync. This can happen // when the VaultAuth or VaultConnection objects are updated since the last sync. case lastClientCacheKey != "" && lastClientCacheKey != o.Status.VaultClientMeta.CacheKey: - syncReason = "new vault client due to config change" + syncReason = consts.ReasonVaultClientConfigChanged // indicates that the Vault client ID has changed since the last sync. This can // happen when the client has re-authenticated to Vault since the last sync. case lastClientID != "" && lastClientID != o.Status.VaultClientMeta.ID: - syncReason = "vault token rotated" + syncReason = consts.ReasonVaultTokenRotated } doSync := syncReason != "" @@ -253,7 +253,7 @@ func (r *VaultDynamicSecretReconciler) Reconcile(ctx context.Context, req ctrl.R r.Recorder.Eventf(o, corev1.EventTypeWarning, consts.ReasonSecretLeaseRenewalError, "Could not renew lease, lease_id=%s, err=%s", leaseID, err) } - syncReason = "lease renewal failed" + syncReason = consts.ReasonSecretLeaseRenewalError } } diff --git a/controllers/vaultpkisecret_controller.go b/controllers/vaultpkisecret_controller.go index d79cddca..71766a3b 100644 --- a/controllers/vaultpkisecret_controller.go +++ b/controllers/vaultpkisecret_controller.go @@ -120,7 +120,7 @@ func (r *VaultPKISecretReconciler) Reconcile(ctx context.Context, req ctrl.Reque case o.Status.SerialNumber == "": syncReason = consts.ReasonInitialSync case r.SyncRegistry.Has(req.NamespacedName): - syncReason = consts.ReasonSyncOnRefUpdate + syncReason = consts.ReasonForceSync case schemaEpoch > 0 && o.GetGeneration() != o.Status.LastGeneration: syncReason = consts.ReasonResourceUpdated case o.Spec.Destination.Create && !destinationExists: diff --git a/internal/consts/reasons.go b/internal/consts/reasons.go index 8153b984..e0977b99 100644 --- a/internal/consts/reasons.go +++ b/internal/consts/reasons.go @@ -32,5 +32,7 @@ const ( ReasonCertificateRevocationError = "CertificateRevocationError" ReasonTransformationError = "TransformationError" ReasonSecretDataBuilderError = "SecretDataBuilderError" - ReasonSyncOnRefUpdate = "SyncOnRefUpdate" + ReasonForceSync = "ForceSync" + ReasonVaultTokenRotated = "VaultTokenRotated" + ReasonVaultClientConfigChanged = "VaultClientConfigChanged" )