Skip to content

Commit

Permalink
Update nodeagent SDS backoff config and algorithm.
Browse files Browse the repository at this point in the history
  • Loading branch information
myidpt committed Jan 25, 2020
1 parent 0b0526e commit 8ee2dcb
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 52 deletions.
8 changes: 4 additions & 4 deletions pkg/istio-agent/sds-agent.go
Expand Up @@ -67,10 +67,10 @@ var (

trustDomainEnv = env.RegisterStringVar(trustDomain, "", "").Get()
secretTTLEnv = env.RegisterDurationVar(secretTTL, 24*time.Hour, "").Get()
secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 1*time.Hour, "").Get()
secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 12*time.Hour, "").Get()
secretRotationIntervalEnv = env.RegisterDurationVar(SecretRotationInterval, 10*time.Minute, "").Get()
staledConnectionRecycleIntervalEnv = env.RegisterDurationVar(staledConnectionRecycleInterval, 5*time.Minute, "").Get()
initialBackoffEnv = env.RegisterIntVar(InitialBackoff, 10, "").Get()
initialBackoffInMilliSecEnv = env.RegisterIntVar(InitialBackoffInMilliSec, 2000, "").Get()
pkcs8KeysEnv = env.RegisterBoolVar(pkcs8Key, false, "Whether to generate PKCS#8 private keys").Get()

// Location of K8S CA root.
Expand Down Expand Up @@ -118,7 +118,7 @@ const (

// The environmental variable name for the initial backoff in milliseconds.
// example value format like "10"
InitialBackoff = "INITIAL_BACKOFF_MSEC"
InitialBackoffInMilliSec = "INITIAL_BACKOFF_MSEC"

pkcs8Key = "PKCS8_KEY"
)
Expand Down Expand Up @@ -473,5 +473,5 @@ func applyEnvVars() {

serverOptions.RecycleInterval = staledConnectionRecycleIntervalEnv

workloadSdsCacheOptions.InitialBackoff = int64(initialBackoffEnv)
workloadSdsCacheOptions.InitialBackoffInMilliSec = int64(initialBackoffInMilliSecEnv)
}
20 changes: 10 additions & 10 deletions security/cmd/node_agent_k8s/main.go
Expand Up @@ -116,9 +116,9 @@ const (
staledConnectionRecycleInterval = "STALED_CONNECTION_RECYCLE_RUN_INTERVAL"

// The environmental variable name for the initial backoff in milliseconds.
// example value format like "10"
InitialBackoff = "INITIAL_BACKOFF_MSEC"
InitialBackoffFlag = "initialBackoff"
// example value format like "1000"
InitialBackoffInMilliSec = "INITIAL_BACKOFF_MSEC"
InitialBackoffInMilliSecFlag = "initialBackoff"

MonitoringPort = "MONITORING_PORT"
EnableProfiling = "ENABLE_PROFILING"
Expand Down Expand Up @@ -247,10 +247,10 @@ var (
vaultSignCsrPathEnv = env.RegisterStringVar(vaultSignCsrPath, "", "").Get()
vaultTLSRootCertEnv = env.RegisterStringVar(vaultTLSRootCert, "", "").Get()
secretTTLEnv = env.RegisterDurationVar(secretTTL, 24*time.Hour, "").Get()
secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 1*time.Hour, "").Get()
secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 12*time.Hour, "").Get()
secretRotationIntervalEnv = env.RegisterDurationVar(SecretRotationInterval, 10*time.Minute, "").Get()
staledConnectionRecycleIntervalEnv = env.RegisterDurationVar(staledConnectionRecycleInterval, 5*time.Minute, "").Get()
initialBackoffEnv = env.RegisterIntVar(InitialBackoff, 10, "").Get()
initialBackoffInMilliSecEnv = env.RegisterIntVar(InitialBackoffInMilliSec, 2000, "").Get()
monitoringPortEnv = env.RegisterIntVar(MonitoringPort, 15014,
"The port number for monitoring Citadel agent").Get()
debugPortEnv = env.RegisterIntVar(DebugPort, 8080,
Expand Down Expand Up @@ -327,8 +327,8 @@ func applyEnvVars(cmd *cobra.Command) {

serverOptions.RecycleInterval = staledConnectionRecycleIntervalEnv

if !cmd.Flag(InitialBackoffFlag).Changed {
workloadSdsCacheOptions.InitialBackoff = int64(initialBackoffEnv)
if !cmd.Flag(InitialBackoffInMilliSecFlag).Changed {
workloadSdsCacheOptions.InitialBackoffInMilliSec = int64(initialBackoffInMilliSecEnv)
}

serverOptions.DebugPort = debugPortEnv
Expand All @@ -338,7 +338,7 @@ func applyEnvVars(cmd *cobra.Command) {
func validateOptions() error {
// The initial backoff time (in millisec) is a random number between 0 and initBackoff.
// Default to 10, a valid range is [10, 120000].
initBackoff := workloadSdsCacheOptions.InitialBackoff
initBackoff := workloadSdsCacheOptions.InitialBackoffInMilliSec
if initBackoff < 10 || initBackoff > 120000 {
return fmt.Errorf("initial backoff should be within range 10 to 120000, found: %d", initBackoff)
}
Expand Down Expand Up @@ -390,8 +390,8 @@ func main() {
rootCmd.PersistentFlags().DurationVar(&workloadSdsCacheOptions.RotationInterval, secretRotationIntervalFlag,
10*time.Minute, "Secret rotation job running interval")

rootCmd.PersistentFlags().Int64Var(&workloadSdsCacheOptions.InitialBackoff, InitialBackoffFlag, 10,
"The initial backoff interval in milliseconds, must be within the range [10, 120000]")
rootCmd.PersistentFlags().Int64Var(&workloadSdsCacheOptions.InitialBackoffInMilliSec, InitialBackoffInMilliSecFlag, 2000,
"The initial backoff interval in milliseconds, default value is 2000, must be within the range [10, 120000]")

rootCmd.PersistentFlags().DurationVar(&workloadSdsCacheOptions.EvictionDuration, "secretEvictionDuration",
24*time.Hour, "Secret eviction time duration")
Expand Down
6 changes: 3 additions & 3 deletions security/cmd/node_agent_k8s/main_test.go
Expand Up @@ -34,14 +34,14 @@ func TestValidateOptions(t *testing.T) {
{
name: "initial backoff too small",
setExtraOptions: func() {
workloadSdsCacheOptions.InitialBackoff = 9
workloadSdsCacheOptions.InitialBackoffInMilliSec = 9
},
errorMsg: "initial backoff should be within range 10 to 120000",
},
{
name: "initial backoff too large",
setExtraOptions: func() {
workloadSdsCacheOptions.InitialBackoff = 120001
workloadSdsCacheOptions.InitialBackoffInMilliSec = 120001
},
errorMsg: "initial backoff should be within range 10 to 120000",
},
Expand Down Expand Up @@ -73,7 +73,7 @@ func TestValidateOptions(t *testing.T) {

// Set the valid options as the base for the testing.
workloadSdsCacheOptions = cache.Options{
InitialBackoff: 10,
InitialBackoffInMilliSec: 2000,
}
serverOptions = sds.Options{
EnableIngressGatewaySDS: true,
Expand Down
27 changes: 13 additions & 14 deletions security/pkg/nodeagent/cache/secretcache.go
Expand Up @@ -61,11 +61,11 @@ const (
// identityTemplate is the format template of identity in the CSR request.
identityTemplate = "spiffe://%s/ns/%s/sa/%s"

// For REST APIs between envoy->nodeagent, default value of 1s is used.
envoyDefaultTimeoutInMilliSec = 1000
// The total timeout for any credential retrieval process, default value of 10s is used.
totalTimeout = time.Second * 10

// initialBackOffIntervalInMilliSec is the initial backoff time interval when hitting non-retryable error in CSR request.
initialBackOffIntervalInMilliSec = 50
// firstRetryBackOffInMilliSec is the initial backoff time interval when hitting non-retryable error in CSR request.
firstRetryBackOffInMilliSec = 50

// Timeout the K8s update/delete notification threads. This is to make sure to unblock the
// secret watch main thread in case those child threads got stuck due to any reason.
Expand All @@ -91,7 +91,7 @@ type Options struct {
SecretTTL time.Duration

// The initial backoff time in millisecond to avoid the thundering herd problem.
InitialBackoff int64
InitialBackoffInMilliSec int64

// secret should be refreshed before it expired, SecretRefreshGraceDuration is the grace period;
// secret should be refreshed if time.Now.After(secret.CreateTime + SecretTTL - SecretRefreshGraceDuration)
Expand Down Expand Up @@ -816,15 +816,15 @@ func (sc *SecretCache) isTokenExpired() bool {
func (sc *SecretCache) sendRetriableRequest(ctx context.Context, csrPEM []byte,
providedExchangedToken string, connKey ConnKey, isCSR bool) ([]string, error) {
sc.randMutex.Lock()
backOffInMilliSec := sc.rand.Int63n(sc.configOptions.InitialBackoff)
randomizedInitialBackOffInMS := sc.rand.Int63n(sc.configOptions.InitialBackoffInMilliSec)
sc.randMutex.Unlock()
cacheLog.Debugf("Wait for %d millisec", backOffInMilliSec)
cacheLog.Debugf("Wait for %d millisec for jitter", randomizedInitialBackOffInMS)
// Add a jitter to initial CSR to avoid thundering herd problem.
time.Sleep(time.Duration(backOffInMilliSec) * time.Millisecond)
time.Sleep(time.Duration(randomizedInitialBackOffInMS) * time.Millisecond)
retryBackoffInMS := int64(firstRetryBackOffInMilliSec)

conIDresourceNamePrefix := cacheLogPrefix(connKey.ConnectionID, connKey.ResourceName)
startTime := time.Now()
var retry int64
var certChainPEM []string
exchangedToken := providedExchangedToken
var requestErrorString string
Expand Down Expand Up @@ -854,14 +854,13 @@ func (sc *SecretCache) sendRetriableRequest(ctx context.Context, csrPEM []byte,
}

// If reach envoy timeout, fail the request by returning err
if startTime.Add(time.Millisecond * envoyDefaultTimeoutInMilliSec).Before(time.Now()) {
if startTime.Add(totalTimeout).Before(time.Now()) {
cacheLog.Errorf("%s retrial timed out: %v", requestErrorString, err)
return nil, err
}
retry++
backOffInMilliSec = rand.Int63n(retry * initialBackOffIntervalInMilliSec)
time.Sleep(time.Duration(backOffInMilliSec) * time.Millisecond)
cacheLog.Warnf("%s failed with error: %v, retry in %d millisec", requestErrorString, err, backOffInMilliSec)
time.Sleep(time.Duration(retryBackoffInMS) * time.Millisecond)
cacheLog.Warnf("%s failed with error: %v, retry in %d millisec", requestErrorString, err, retryBackoffInMS)
retryBackoffInMS *= 2 // Exponentially increase the retry backoff time.

// Record retry metrics.
if isCSR {
Expand Down
42 changes: 21 additions & 21 deletions security/pkg/nodeagent/cache/secretcache_test.go
Expand Up @@ -220,11 +220,11 @@ func TestWorkloadAgentGenerateSecretWithPluginProvider(t *testing.T) {
func testWorkloadAgentGenerateSecret(t *testing.T, isUsingPluginProvider bool) {
fakeCACli := mock.NewMockCAClient(mockCertChain1st, mockCertChainRemain, 0.1)
opt := Options{
SecretTTL: time.Minute,
RotationInterval: 300 * time.Microsecond,
EvictionDuration: 60 * time.Second,
InitialBackoff: 10,
SkipValidateCert: true,
SecretTTL: time.Minute,
RotationInterval: 300 * time.Microsecond,
EvictionDuration: 60 * time.Second,
InitialBackoffInMilliSec: 10,
SkipValidateCert: true,
}

if isUsingPluginProvider {
Expand Down Expand Up @@ -322,11 +322,11 @@ func testWorkloadAgentGenerateSecret(t *testing.T, isUsingPluginProvider bool) {
func TestWorkloadAgentRefreshSecret(t *testing.T) {
fakeCACli := mock.NewMockCAClient(mockCertChain1st, mockCertChainRemain, 0)
opt := Options{
SecretTTL: 200 * time.Microsecond,
RotationInterval: 200 * time.Microsecond,
EvictionDuration: 10 * time.Second,
InitialBackoff: 10,
SkipValidateCert: true,
SecretTTL: 200 * time.Microsecond,
RotationInterval: 200 * time.Microsecond,
EvictionDuration: 10 * time.Second,
InitialBackoffInMilliSec: 10,
SkipValidateCert: true,
}
fetcher := &secretfetcher.SecretFetcher{
UseCaClient: true,
Expand Down Expand Up @@ -710,11 +710,11 @@ func createSecretCache() *SecretCache {
ch := make(chan struct{})
fetcher.Run(ch)
opt := Options{
SecretTTL: time.Minute,
RotationInterval: 300 * time.Microsecond,
EvictionDuration: 2 * time.Second,
InitialBackoff: 10,
SkipValidateCert: true,
SecretTTL: time.Minute,
RotationInterval: 300 * time.Microsecond,
EvictionDuration: 2 * time.Second,
InitialBackoffInMilliSec: 10,
SkipValidateCert: true,
}
return NewSecretCache(fetcher, notifyCb, opt)
}
Expand Down Expand Up @@ -862,12 +862,12 @@ func checkBool(t *testing.T, name string, got bool, want bool) {
func TestSetAlwaysValidTokenFlag(t *testing.T) {
fakeCACli := mock.NewMockCAClient(mockCertChain1st, mockCertChainRemain, 0)
opt := Options{
SecretTTL: 200 * time.Microsecond,
RotationInterval: 200 * time.Microsecond,
EvictionDuration: 10 * time.Second,
InitialBackoff: 10,
AlwaysValidTokenFlag: true,
SkipValidateCert: true,
SecretTTL: 200 * time.Microsecond,
RotationInterval: 200 * time.Microsecond,
EvictionDuration: 10 * time.Second,
InitialBackoffInMilliSec: 10,
AlwaysValidTokenFlag: true,
SkipValidateCert: true,
}
fetcher := &secretfetcher.SecretFetcher{
UseCaClient: true,
Expand Down

0 comments on commit 8ee2dcb

Please sign in to comment.