From 8ee2dcb5bbd09c90632f2778cc6bf99680e32ba5 Mon Sep 17 00:00:00 2001 From: Oliver Liu Date: Sat, 25 Jan 2020 12:52:40 -0800 Subject: [PATCH 1/2] Update nodeagent SDS backoff config and algorithm. --- pkg/istio-agent/sds-agent.go | 8 ++-- security/cmd/node_agent_k8s/main.go | 20 ++++----- security/cmd/node_agent_k8s/main_test.go | 6 +-- security/pkg/nodeagent/cache/secretcache.go | 27 ++++++------ .../pkg/nodeagent/cache/secretcache_test.go | 42 +++++++++---------- 5 files changed, 51 insertions(+), 52 deletions(-) diff --git a/pkg/istio-agent/sds-agent.go b/pkg/istio-agent/sds-agent.go index b80a0c9b5ba8..d4c515d9671d 100644 --- a/pkg/istio-agent/sds-agent.go +++ b/pkg/istio-agent/sds-agent.go @@ -67,10 +67,10 @@ var ( trustDomainEnv = env.RegisterStringVar(trustDomain, "", "").Get() secretTTLEnv = env.RegisterDurationVar(secretTTL, 24*time.Hour, "").Get() - secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 1*time.Hour, "").Get() + secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 12*time.Hour, "").Get() secretRotationIntervalEnv = env.RegisterDurationVar(SecretRotationInterval, 10*time.Minute, "").Get() staledConnectionRecycleIntervalEnv = env.RegisterDurationVar(staledConnectionRecycleInterval, 5*time.Minute, "").Get() - initialBackoffEnv = env.RegisterIntVar(InitialBackoff, 10, "").Get() + initialBackoffInMilliSecEnv = env.RegisterIntVar(InitialBackoffInMilliSec, 2000, "").Get() pkcs8KeysEnv = env.RegisterBoolVar(pkcs8Key, false, "Whether to generate PKCS#8 private keys").Get() // Location of K8S CA root. @@ -118,7 +118,7 @@ const ( // The environmental variable name for the initial backoff in milliseconds. // example value format like "10" - InitialBackoff = "INITIAL_BACKOFF_MSEC" + InitialBackoffInMilliSec = "INITIAL_BACKOFF_MSEC" pkcs8Key = "PKCS8_KEY" ) @@ -473,5 +473,5 @@ func applyEnvVars() { serverOptions.RecycleInterval = staledConnectionRecycleIntervalEnv - workloadSdsCacheOptions.InitialBackoff = int64(initialBackoffEnv) + workloadSdsCacheOptions.InitialBackoffInMilliSec = int64(initialBackoffInMilliSecEnv) } diff --git a/security/cmd/node_agent_k8s/main.go b/security/cmd/node_agent_k8s/main.go index a7301d905c09..e08cbe92d583 100644 --- a/security/cmd/node_agent_k8s/main.go +++ b/security/cmd/node_agent_k8s/main.go @@ -116,9 +116,9 @@ const ( staledConnectionRecycleInterval = "STALED_CONNECTION_RECYCLE_RUN_INTERVAL" // The environmental variable name for the initial backoff in milliseconds. - // example value format like "10" - InitialBackoff = "INITIAL_BACKOFF_MSEC" - InitialBackoffFlag = "initialBackoff" + // example value format like "1000" + InitialBackoffInMilliSec = "INITIAL_BACKOFF_MSEC" + InitialBackoffInMilliSecFlag = "initialBackoff" MonitoringPort = "MONITORING_PORT" EnableProfiling = "ENABLE_PROFILING" @@ -247,10 +247,10 @@ var ( vaultSignCsrPathEnv = env.RegisterStringVar(vaultSignCsrPath, "", "").Get() vaultTLSRootCertEnv = env.RegisterStringVar(vaultTLSRootCert, "", "").Get() secretTTLEnv = env.RegisterDurationVar(secretTTL, 24*time.Hour, "").Get() - secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 1*time.Hour, "").Get() + secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 12*time.Hour, "").Get() secretRotationIntervalEnv = env.RegisterDurationVar(SecretRotationInterval, 10*time.Minute, "").Get() staledConnectionRecycleIntervalEnv = env.RegisterDurationVar(staledConnectionRecycleInterval, 5*time.Minute, "").Get() - initialBackoffEnv = env.RegisterIntVar(InitialBackoff, 10, "").Get() + initialBackoffInMilliSecEnv = env.RegisterIntVar(InitialBackoffInMilliSec, 2000, "").Get() monitoringPortEnv = env.RegisterIntVar(MonitoringPort, 15014, "The port number for monitoring Citadel agent").Get() debugPortEnv = env.RegisterIntVar(DebugPort, 8080, @@ -327,8 +327,8 @@ func applyEnvVars(cmd *cobra.Command) { serverOptions.RecycleInterval = staledConnectionRecycleIntervalEnv - if !cmd.Flag(InitialBackoffFlag).Changed { - workloadSdsCacheOptions.InitialBackoff = int64(initialBackoffEnv) + if !cmd.Flag(InitialBackoffInMilliSecFlag).Changed { + workloadSdsCacheOptions.InitialBackoffInMilliSec = int64(initialBackoffInMilliSecEnv) } serverOptions.DebugPort = debugPortEnv @@ -338,7 +338,7 @@ func applyEnvVars(cmd *cobra.Command) { func validateOptions() error { // The initial backoff time (in millisec) is a random number between 0 and initBackoff. // Default to 10, a valid range is [10, 120000]. - initBackoff := workloadSdsCacheOptions.InitialBackoff + initBackoff := workloadSdsCacheOptions.InitialBackoffInMilliSec if initBackoff < 10 || initBackoff > 120000 { return fmt.Errorf("initial backoff should be within range 10 to 120000, found: %d", initBackoff) } @@ -390,8 +390,8 @@ func main() { rootCmd.PersistentFlags().DurationVar(&workloadSdsCacheOptions.RotationInterval, secretRotationIntervalFlag, 10*time.Minute, "Secret rotation job running interval") - rootCmd.PersistentFlags().Int64Var(&workloadSdsCacheOptions.InitialBackoff, InitialBackoffFlag, 10, - "The initial backoff interval in milliseconds, must be within the range [10, 120000]") + rootCmd.PersistentFlags().Int64Var(&workloadSdsCacheOptions.InitialBackoffInMilliSec, InitialBackoffInMilliSecFlag, 2000, + "The initial backoff interval in milliseconds, default value is 2000, must be within the range [10, 120000]") rootCmd.PersistentFlags().DurationVar(&workloadSdsCacheOptions.EvictionDuration, "secretEvictionDuration", 24*time.Hour, "Secret eviction time duration") diff --git a/security/cmd/node_agent_k8s/main_test.go b/security/cmd/node_agent_k8s/main_test.go index 98704acfaf74..8a1fd41c2038 100644 --- a/security/cmd/node_agent_k8s/main_test.go +++ b/security/cmd/node_agent_k8s/main_test.go @@ -34,14 +34,14 @@ func TestValidateOptions(t *testing.T) { { name: "initial backoff too small", setExtraOptions: func() { - workloadSdsCacheOptions.InitialBackoff = 9 + workloadSdsCacheOptions.InitialBackoffInMilliSec = 9 }, errorMsg: "initial backoff should be within range 10 to 120000", }, { name: "initial backoff too large", setExtraOptions: func() { - workloadSdsCacheOptions.InitialBackoff = 120001 + workloadSdsCacheOptions.InitialBackoffInMilliSec = 120001 }, errorMsg: "initial backoff should be within range 10 to 120000", }, @@ -73,7 +73,7 @@ func TestValidateOptions(t *testing.T) { // Set the valid options as the base for the testing. workloadSdsCacheOptions = cache.Options{ - InitialBackoff: 10, + InitialBackoffInMilliSec: 2000, } serverOptions = sds.Options{ EnableIngressGatewaySDS: true, diff --git a/security/pkg/nodeagent/cache/secretcache.go b/security/pkg/nodeagent/cache/secretcache.go index f8f4324ddf1e..f5f461d6dae5 100644 --- a/security/pkg/nodeagent/cache/secretcache.go +++ b/security/pkg/nodeagent/cache/secretcache.go @@ -61,11 +61,11 @@ const ( // identityTemplate is the format template of identity in the CSR request. identityTemplate = "spiffe://%s/ns/%s/sa/%s" - // For REST APIs between envoy->nodeagent, default value of 1s is used. - envoyDefaultTimeoutInMilliSec = 1000 + // The total timeout for any credential retrieval process, default value of 10s is used. + totalTimeout = time.Second * 10 - // initialBackOffIntervalInMilliSec is the initial backoff time interval when hitting non-retryable error in CSR request. - initialBackOffIntervalInMilliSec = 50 + // firstRetryBackOffInMilliSec is the initial backoff time interval when hitting non-retryable error in CSR request. + firstRetryBackOffInMilliSec = 50 // Timeout the K8s update/delete notification threads. This is to make sure to unblock the // secret watch main thread in case those child threads got stuck due to any reason. @@ -91,7 +91,7 @@ type Options struct { SecretTTL time.Duration // The initial backoff time in millisecond to avoid the thundering herd problem. - InitialBackoff int64 + InitialBackoffInMilliSec int64 // secret should be refreshed before it expired, SecretRefreshGraceDuration is the grace period; // secret should be refreshed if time.Now.After(secret.CreateTime + SecretTTL - SecretRefreshGraceDuration) @@ -816,15 +816,15 @@ func (sc *SecretCache) isTokenExpired() bool { func (sc *SecretCache) sendRetriableRequest(ctx context.Context, csrPEM []byte, providedExchangedToken string, connKey ConnKey, isCSR bool) ([]string, error) { sc.randMutex.Lock() - backOffInMilliSec := sc.rand.Int63n(sc.configOptions.InitialBackoff) + randomizedInitialBackOffInMS := sc.rand.Int63n(sc.configOptions.InitialBackoffInMilliSec) sc.randMutex.Unlock() - cacheLog.Debugf("Wait for %d millisec", backOffInMilliSec) + cacheLog.Debugf("Wait for %d millisec for jitter", randomizedInitialBackOffInMS) // Add a jitter to initial CSR to avoid thundering herd problem. - time.Sleep(time.Duration(backOffInMilliSec) * time.Millisecond) + time.Sleep(time.Duration(randomizedInitialBackOffInMS) * time.Millisecond) + retryBackoffInMS := int64(firstRetryBackOffInMilliSec) conIDresourceNamePrefix := cacheLogPrefix(connKey.ConnectionID, connKey.ResourceName) startTime := time.Now() - var retry int64 var certChainPEM []string exchangedToken := providedExchangedToken var requestErrorString string @@ -854,14 +854,13 @@ func (sc *SecretCache) sendRetriableRequest(ctx context.Context, csrPEM []byte, } // If reach envoy timeout, fail the request by returning err - if startTime.Add(time.Millisecond * envoyDefaultTimeoutInMilliSec).Before(time.Now()) { + if startTime.Add(totalTimeout).Before(time.Now()) { cacheLog.Errorf("%s retrial timed out: %v", requestErrorString, err) return nil, err } - retry++ - backOffInMilliSec = rand.Int63n(retry * initialBackOffIntervalInMilliSec) - time.Sleep(time.Duration(backOffInMilliSec) * time.Millisecond) - cacheLog.Warnf("%s failed with error: %v, retry in %d millisec", requestErrorString, err, backOffInMilliSec) + time.Sleep(time.Duration(retryBackoffInMS) * time.Millisecond) + cacheLog.Warnf("%s failed with error: %v, retry in %d millisec", requestErrorString, err, retryBackoffInMS) + retryBackoffInMS *= 2 // Exponentially increase the retry backoff time. // Record retry metrics. if isCSR { diff --git a/security/pkg/nodeagent/cache/secretcache_test.go b/security/pkg/nodeagent/cache/secretcache_test.go index c2d8e2d5fed1..bd3bcec8a24e 100644 --- a/security/pkg/nodeagent/cache/secretcache_test.go +++ b/security/pkg/nodeagent/cache/secretcache_test.go @@ -220,11 +220,11 @@ func TestWorkloadAgentGenerateSecretWithPluginProvider(t *testing.T) { func testWorkloadAgentGenerateSecret(t *testing.T, isUsingPluginProvider bool) { fakeCACli := mock.NewMockCAClient(mockCertChain1st, mockCertChainRemain, 0.1) opt := Options{ - SecretTTL: time.Minute, - RotationInterval: 300 * time.Microsecond, - EvictionDuration: 60 * time.Second, - InitialBackoff: 10, - SkipValidateCert: true, + SecretTTL: time.Minute, + RotationInterval: 300 * time.Microsecond, + EvictionDuration: 60 * time.Second, + InitialBackoffInMilliSec: 10, + SkipValidateCert: true, } if isUsingPluginProvider { @@ -322,11 +322,11 @@ func testWorkloadAgentGenerateSecret(t *testing.T, isUsingPluginProvider bool) { func TestWorkloadAgentRefreshSecret(t *testing.T) { fakeCACli := mock.NewMockCAClient(mockCertChain1st, mockCertChainRemain, 0) opt := Options{ - SecretTTL: 200 * time.Microsecond, - RotationInterval: 200 * time.Microsecond, - EvictionDuration: 10 * time.Second, - InitialBackoff: 10, - SkipValidateCert: true, + SecretTTL: 200 * time.Microsecond, + RotationInterval: 200 * time.Microsecond, + EvictionDuration: 10 * time.Second, + InitialBackoffInMilliSec: 10, + SkipValidateCert: true, } fetcher := &secretfetcher.SecretFetcher{ UseCaClient: true, @@ -710,11 +710,11 @@ func createSecretCache() *SecretCache { ch := make(chan struct{}) fetcher.Run(ch) opt := Options{ - SecretTTL: time.Minute, - RotationInterval: 300 * time.Microsecond, - EvictionDuration: 2 * time.Second, - InitialBackoff: 10, - SkipValidateCert: true, + SecretTTL: time.Minute, + RotationInterval: 300 * time.Microsecond, + EvictionDuration: 2 * time.Second, + InitialBackoffInMilliSec: 10, + SkipValidateCert: true, } return NewSecretCache(fetcher, notifyCb, opt) } @@ -862,12 +862,12 @@ func checkBool(t *testing.T, name string, got bool, want bool) { func TestSetAlwaysValidTokenFlag(t *testing.T) { fakeCACli := mock.NewMockCAClient(mockCertChain1st, mockCertChainRemain, 0) opt := Options{ - SecretTTL: 200 * time.Microsecond, - RotationInterval: 200 * time.Microsecond, - EvictionDuration: 10 * time.Second, - InitialBackoff: 10, - AlwaysValidTokenFlag: true, - SkipValidateCert: true, + SecretTTL: 200 * time.Microsecond, + RotationInterval: 200 * time.Microsecond, + EvictionDuration: 10 * time.Second, + InitialBackoffInMilliSec: 10, + AlwaysValidTokenFlag: true, + SkipValidateCert: true, } fetcher := &secretfetcher.SecretFetcher{ UseCaClient: true, From bff3189a84b9fbb4df36f2feb7c7738183650f7c Mon Sep 17 00:00:00 2001 From: Oliver Liu Date: Sun, 26 Jan 2020 15:08:25 -0800 Subject: [PATCH 2/2] Small fix. --- security/pkg/nodeagent/sds/server_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/pkg/nodeagent/sds/server_test.go b/security/pkg/nodeagent/sds/server_test.go index b9d98c1c170c..24372162ee94 100644 --- a/security/pkg/nodeagent/sds/server_test.go +++ b/security/pkg/nodeagent/sds/server_test.go @@ -176,7 +176,7 @@ func createRealSDSServer(t *testing.T, socket string) *Server { workloadSdsCacheOptions.Pkcs8Keys = false workloadSdsCacheOptions.Plugins = NewPlugins([]string{"GoogleTokenExchange"}) workloadSdsCacheOptions.RotationInterval = 10 * time.Minute - workloadSdsCacheOptions.InitialBackoff = 10 + workloadSdsCacheOptions.InitialBackoffInMilliSec = 10 workloadSecretCache := cache.NewSecretCache(wSecretFetcher, NotifyProxy, *workloadSdsCacheOptions) server, err := NewServer(arg, workloadSecretCache, nil)