Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update nodeagent/sdsagent CSR backoff config and algorithm. #20530

Merged
merged 3 commits into from Jan 27, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions pkg/istio-agent/sds-agent.go
Expand Up @@ -67,10 +67,10 @@ var (

trustDomainEnv = env.RegisterStringVar(trustDomain, "", "").Get()
secretTTLEnv = env.RegisterDurationVar(secretTTL, 24*time.Hour, "").Get()
secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 1*time.Hour, "").Get()
secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 12*time.Hour, "").Get()
secretRotationIntervalEnv = env.RegisterDurationVar(SecretRotationInterval, 10*time.Minute, "").Get()
staledConnectionRecycleIntervalEnv = env.RegisterDurationVar(staledConnectionRecycleInterval, 5*time.Minute, "").Get()
initialBackoffEnv = env.RegisterIntVar(InitialBackoff, 10, "").Get()
initialBackoffInMilliSecEnv = env.RegisterIntVar(InitialBackoffInMilliSec, 2000, "").Get()
pkcs8KeysEnv = env.RegisterBoolVar(pkcs8Key, false, "Whether to generate PKCS#8 private keys").Get()

// Location of K8S CA root.
Expand Down Expand Up @@ -118,7 +118,7 @@ const (

// The environmental variable name for the initial backoff in milliseconds.
// example value format like "10"
InitialBackoff = "INITIAL_BACKOFF_MSEC"
InitialBackoffInMilliSec = "INITIAL_BACKOFF_MSEC"

pkcs8Key = "PKCS8_KEY"
)
Expand Down Expand Up @@ -473,5 +473,5 @@ func applyEnvVars() {

serverOptions.RecycleInterval = staledConnectionRecycleIntervalEnv

workloadSdsCacheOptions.InitialBackoff = int64(initialBackoffEnv)
workloadSdsCacheOptions.InitialBackoffInMilliSec = int64(initialBackoffInMilliSecEnv)
}
20 changes: 10 additions & 10 deletions security/cmd/node_agent_k8s/main.go
Expand Up @@ -116,9 +116,9 @@ const (
staledConnectionRecycleInterval = "STALED_CONNECTION_RECYCLE_RUN_INTERVAL"

// The environmental variable name for the initial backoff in milliseconds.
// example value format like "10"
InitialBackoff = "INITIAL_BACKOFF_MSEC"
InitialBackoffFlag = "initialBackoff"
// example value format like "1000"
InitialBackoffInMilliSec = "INITIAL_BACKOFF_MSEC"
InitialBackoffInMilliSecFlag = "initialBackoff"

MonitoringPort = "MONITORING_PORT"
EnableProfiling = "ENABLE_PROFILING"
Expand Down Expand Up @@ -247,10 +247,10 @@ var (
vaultSignCsrPathEnv = env.RegisterStringVar(vaultSignCsrPath, "", "").Get()
vaultTLSRootCertEnv = env.RegisterStringVar(vaultTLSRootCert, "", "").Get()
secretTTLEnv = env.RegisterDurationVar(secretTTL, 24*time.Hour, "").Get()
secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 1*time.Hour, "").Get()
secretRefreshGraceDurationEnv = env.RegisterDurationVar(SecretRefreshGraceDuration, 12*time.Hour, "").Get()
secretRotationIntervalEnv = env.RegisterDurationVar(SecretRotationInterval, 10*time.Minute, "").Get()
staledConnectionRecycleIntervalEnv = env.RegisterDurationVar(staledConnectionRecycleInterval, 5*time.Minute, "").Get()
initialBackoffEnv = env.RegisterIntVar(InitialBackoff, 10, "").Get()
initialBackoffInMilliSecEnv = env.RegisterIntVar(InitialBackoffInMilliSec, 2000, "").Get()
monitoringPortEnv = env.RegisterIntVar(MonitoringPort, 15014,
"The port number for monitoring Citadel agent").Get()
debugPortEnv = env.RegisterIntVar(DebugPort, 8080,
Expand Down Expand Up @@ -327,8 +327,8 @@ func applyEnvVars(cmd *cobra.Command) {

serverOptions.RecycleInterval = staledConnectionRecycleIntervalEnv

if !cmd.Flag(InitialBackoffFlag).Changed {
workloadSdsCacheOptions.InitialBackoff = int64(initialBackoffEnv)
if !cmd.Flag(InitialBackoffInMilliSecFlag).Changed {
workloadSdsCacheOptions.InitialBackoffInMilliSec = int64(initialBackoffInMilliSecEnv)
}

serverOptions.DebugPort = debugPortEnv
Expand All @@ -338,7 +338,7 @@ func applyEnvVars(cmd *cobra.Command) {
func validateOptions() error {
// The initial backoff time (in millisec) is a random number between 0 and initBackoff.
// Default to 10, a valid range is [10, 120000].
initBackoff := workloadSdsCacheOptions.InitialBackoff
initBackoff := workloadSdsCacheOptions.InitialBackoffInMilliSec
if initBackoff < 10 || initBackoff > 120000 {
return fmt.Errorf("initial backoff should be within range 10 to 120000, found: %d", initBackoff)
}
Expand Down Expand Up @@ -390,8 +390,8 @@ func main() {
rootCmd.PersistentFlags().DurationVar(&workloadSdsCacheOptions.RotationInterval, secretRotationIntervalFlag,
10*time.Minute, "Secret rotation job running interval")

rootCmd.PersistentFlags().Int64Var(&workloadSdsCacheOptions.InitialBackoff, InitialBackoffFlag, 10,
"The initial backoff interval in milliseconds, must be within the range [10, 120000]")
rootCmd.PersistentFlags().Int64Var(&workloadSdsCacheOptions.InitialBackoffInMilliSec, InitialBackoffInMilliSecFlag, 2000,
"The initial backoff interval in milliseconds, default value is 2000, must be within the range [10, 120000]")

rootCmd.PersistentFlags().DurationVar(&workloadSdsCacheOptions.EvictionDuration, "secretEvictionDuration",
24*time.Hour, "Secret eviction time duration")
Expand Down
6 changes: 3 additions & 3 deletions security/cmd/node_agent_k8s/main_test.go
Expand Up @@ -34,14 +34,14 @@ func TestValidateOptions(t *testing.T) {
{
name: "initial backoff too small",
setExtraOptions: func() {
workloadSdsCacheOptions.InitialBackoff = 9
workloadSdsCacheOptions.InitialBackoffInMilliSec = 9
},
errorMsg: "initial backoff should be within range 10 to 120000",
},
{
name: "initial backoff too large",
setExtraOptions: func() {
workloadSdsCacheOptions.InitialBackoff = 120001
workloadSdsCacheOptions.InitialBackoffInMilliSec = 120001
},
errorMsg: "initial backoff should be within range 10 to 120000",
},
Expand Down Expand Up @@ -73,7 +73,7 @@ func TestValidateOptions(t *testing.T) {

// Set the valid options as the base for the testing.
workloadSdsCacheOptions = cache.Options{
InitialBackoff: 10,
InitialBackoffInMilliSec: 2000,
}
serverOptions = sds.Options{
EnableIngressGatewaySDS: true,
Expand Down
27 changes: 13 additions & 14 deletions security/pkg/nodeagent/cache/secretcache.go
Expand Up @@ -61,11 +61,11 @@ const (
// identityTemplate is the format template of identity in the CSR request.
identityTemplate = "spiffe://%s/ns/%s/sa/%s"

// For REST APIs between envoy->nodeagent, default value of 1s is used.
envoyDefaultTimeoutInMilliSec = 1000
// The total timeout for any credential retrieval process, default value of 10s is used.
totalTimeout = time.Second * 10

// initialBackOffIntervalInMilliSec is the initial backoff time interval when hitting non-retryable error in CSR request.
initialBackOffIntervalInMilliSec = 50
// firstRetryBackOffInMilliSec is the initial backoff time interval when hitting non-retryable error in CSR request.
firstRetryBackOffInMilliSec = 50

// Timeout the K8s update/delete notification threads. This is to make sure to unblock the
// secret watch main thread in case those child threads got stuck due to any reason.
Expand All @@ -91,7 +91,7 @@ type Options struct {
SecretTTL time.Duration

// The initial backoff time in millisecond to avoid the thundering herd problem.
InitialBackoff int64
InitialBackoffInMilliSec int64

// secret should be refreshed before it expired, SecretRefreshGraceDuration is the grace period;
// secret should be refreshed if time.Now.After(secret.CreateTime + SecretTTL - SecretRefreshGraceDuration)
Expand Down Expand Up @@ -816,15 +816,15 @@ func (sc *SecretCache) isTokenExpired() bool {
func (sc *SecretCache) sendRetriableRequest(ctx context.Context, csrPEM []byte,
providedExchangedToken string, connKey ConnKey, isCSR bool) ([]string, error) {
sc.randMutex.Lock()
backOffInMilliSec := sc.rand.Int63n(sc.configOptions.InitialBackoff)
randomizedInitialBackOffInMS := sc.rand.Int63n(sc.configOptions.InitialBackoffInMilliSec)
sc.randMutex.Unlock()
cacheLog.Debugf("Wait for %d millisec", backOffInMilliSec)
cacheLog.Debugf("Wait for %d millisec for jitter", randomizedInitialBackOffInMS)
// Add a jitter to initial CSR to avoid thundering herd problem.
time.Sleep(time.Duration(backOffInMilliSec) * time.Millisecond)
time.Sleep(time.Duration(randomizedInitialBackOffInMS) * time.Millisecond)
retryBackoffInMS := int64(firstRetryBackOffInMilliSec)

conIDresourceNamePrefix := cacheLogPrefix(connKey.ConnectionID, connKey.ResourceName)
startTime := time.Now()
var retry int64
var certChainPEM []string
exchangedToken := providedExchangedToken
var requestErrorString string
Expand Down Expand Up @@ -854,14 +854,13 @@ func (sc *SecretCache) sendRetriableRequest(ctx context.Context, csrPEM []byte,
}

// If reach envoy timeout, fail the request by returning err
if startTime.Add(time.Millisecond * envoyDefaultTimeoutInMilliSec).Before(time.Now()) {
if startTime.Add(totalTimeout).Before(time.Now()) {
cacheLog.Errorf("%s retrial timed out: %v", requestErrorString, err)
return nil, err
}
retry++
backOffInMilliSec = rand.Int63n(retry * initialBackOffIntervalInMilliSec)
time.Sleep(time.Duration(backOffInMilliSec) * time.Millisecond)
cacheLog.Warnf("%s failed with error: %v, retry in %d millisec", requestErrorString, err, backOffInMilliSec)
time.Sleep(time.Duration(retryBackoffInMS) * time.Millisecond)
cacheLog.Warnf("%s failed with error: %v, retry in %d millisec", requestErrorString, err, retryBackoffInMS)
retryBackoffInMS *= 2 // Exponentially increase the retry backoff time.

// Record retry metrics.
if isCSR {
Expand Down
42 changes: 21 additions & 21 deletions security/pkg/nodeagent/cache/secretcache_test.go
Expand Up @@ -220,11 +220,11 @@ func TestWorkloadAgentGenerateSecretWithPluginProvider(t *testing.T) {
func testWorkloadAgentGenerateSecret(t *testing.T, isUsingPluginProvider bool) {
fakeCACli := mock.NewMockCAClient(mockCertChain1st, mockCertChainRemain, 0.1)
opt := Options{
SecretTTL: time.Minute,
RotationInterval: 300 * time.Microsecond,
EvictionDuration: 60 * time.Second,
InitialBackoff: 10,
SkipValidateCert: true,
SecretTTL: time.Minute,
RotationInterval: 300 * time.Microsecond,
EvictionDuration: 60 * time.Second,
InitialBackoffInMilliSec: 10,
SkipValidateCert: true,
}

if isUsingPluginProvider {
Expand Down Expand Up @@ -322,11 +322,11 @@ func testWorkloadAgentGenerateSecret(t *testing.T, isUsingPluginProvider bool) {
func TestWorkloadAgentRefreshSecret(t *testing.T) {
fakeCACli := mock.NewMockCAClient(mockCertChain1st, mockCertChainRemain, 0)
opt := Options{
SecretTTL: 200 * time.Microsecond,
RotationInterval: 200 * time.Microsecond,
EvictionDuration: 10 * time.Second,
InitialBackoff: 10,
SkipValidateCert: true,
SecretTTL: 200 * time.Microsecond,
RotationInterval: 200 * time.Microsecond,
EvictionDuration: 10 * time.Second,
InitialBackoffInMilliSec: 10,
SkipValidateCert: true,
}
fetcher := &secretfetcher.SecretFetcher{
UseCaClient: true,
Expand Down Expand Up @@ -710,11 +710,11 @@ func createSecretCache() *SecretCache {
ch := make(chan struct{})
fetcher.Run(ch)
opt := Options{
SecretTTL: time.Minute,
RotationInterval: 300 * time.Microsecond,
EvictionDuration: 2 * time.Second,
InitialBackoff: 10,
SkipValidateCert: true,
SecretTTL: time.Minute,
RotationInterval: 300 * time.Microsecond,
EvictionDuration: 2 * time.Second,
InitialBackoffInMilliSec: 10,
SkipValidateCert: true,
}
return NewSecretCache(fetcher, notifyCb, opt)
}
Expand Down Expand Up @@ -862,12 +862,12 @@ func checkBool(t *testing.T, name string, got bool, want bool) {
func TestSetAlwaysValidTokenFlag(t *testing.T) {
fakeCACli := mock.NewMockCAClient(mockCertChain1st, mockCertChainRemain, 0)
opt := Options{
SecretTTL: 200 * time.Microsecond,
RotationInterval: 200 * time.Microsecond,
EvictionDuration: 10 * time.Second,
InitialBackoff: 10,
AlwaysValidTokenFlag: true,
SkipValidateCert: true,
SecretTTL: 200 * time.Microsecond,
RotationInterval: 200 * time.Microsecond,
EvictionDuration: 10 * time.Second,
InitialBackoffInMilliSec: 10,
AlwaysValidTokenFlag: true,
SkipValidateCert: true,
}
fetcher := &secretfetcher.SecretFetcher{
UseCaClient: true,
Expand Down