diff --git a/pilot/pkg/features/pilot.go b/pilot/pkg/features/pilot.go index 2ea1e6bf5844..3f62bd6e8bf5 100644 --- a/pilot/pkg/features/pilot.go +++ b/pilot/pkg/features/pilot.go @@ -331,4 +331,10 @@ var ( EnableTLSv2OnInboundPath = env.RegisterBoolVar("PILOT_SIDECAR_ENABLE_INBOUND_TLS_V2", false, "If true, Pilot will set the TLS version on server side as TLSv1_2 and also enforce strong cipher suites").Get() + + EnableTLSXDSDynamicTypes = env.RegisterBoolVar("PILOT_ENABLE_TLS_XDS_DYNAMIC_TYPES", true, + "This flag controls the how internal version of XDS is chosen. This applies only to Istio 1.7. "+ + "When this flag is enabled, an upgrade of the Istio control plane from 1.6 -> 1.7 should result in no downtime for 1.6 proxies. "+ + "However, for users who have already upgraded to 1.7.x without this option, to 1.7.y with this version, while still maintaining old 1.6 proxies, "+ + "this will result in downtime and should be set to false.").Get() ) diff --git a/pilot/pkg/networking/core/v1alpha3/cluster.go b/pilot/pkg/networking/core/v1alpha3/cluster.go index 6607c9020165..f4ff93f98053 100644 --- a/pilot/pkg/networking/core/v1alpha3/cluster.go +++ b/pilot/pkg/networking/core/v1alpha3/cluster.go @@ -42,6 +42,7 @@ import ( authn_model "istio.io/istio/pilot/pkg/security/model" "istio.io/istio/pilot/pkg/serviceregistry" "istio.io/istio/pilot/pkg/util/sets" + xdsv2 "istio.io/istio/pilot/pkg/xds/v2" "istio.io/istio/pkg/config/constants" "istio.io/istio/pkg/config/host" "istio.io/istio/pkg/config/protocol" @@ -907,10 +908,20 @@ func applyUpstreamTLSSettings(opts *buildClusterOpts, tls *networking.ClientTLSS return } + tc := util.MessageToAny(tlsContext) + // Moving from v2 tls <-> v3 tls seems to cause downtime: https://github.com/envoyproxy/envoy/issues/13864 + // Instead, we should tie this to the cluster version, which is fixed, so we do not switch at runtime during + // an in place upgrade of the control plane only + // However, there is an edge case: users with 1.6 proxy, connecting to 1.7.x control plane without this code, then updating + // to 1.7.y control plane with this code. This would cause this code to cause another downtime by downgrading from v3 to v2. + // To workaround this, we will have a flag to disable this behavior + if tc != nil && features.EnableTLSXDSDynamicTypes && node.RequestedTypes.CDS == xdsv2.ClusterType { + tc.TypeUrl = "type.googleapis.com/envoy.api.v2.auth.UpstreamTlsContext" + } if tlsContext != nil { c.TransportSocket = &core.TransportSocket{ Name: util.EnvoyTLSSocketName, - ConfigType: &core.TransportSocket_TypedConfig{TypedConfig: util.MessageToAny(tlsContext)}, + ConfigType: &core.TransportSocket_TypedConfig{TypedConfig: tc}, } } diff --git a/pilot/pkg/networking/core/v1alpha3/listener.go b/pilot/pkg/networking/core/v1alpha3/listener.go index 9c604fd4772d..2d9db8c476c5 100644 --- a/pilot/pkg/networking/core/v1alpha3/listener.go +++ b/pilot/pkg/networking/core/v1alpha3/listener.go @@ -43,6 +43,7 @@ import ( structpb "github.com/golang/protobuf/ptypes/struct" "github.com/golang/protobuf/ptypes/wrappers" + xdsv2 "istio.io/istio/pilot/pkg/xds/v2" "istio.io/istio/pkg/util/protomarshal" meshconfig "istio.io/api/mesh/v1alpha1" @@ -2160,7 +2161,7 @@ func buildListener(opts buildListenerOpts) *listener.Listener { } filterChains = append(filterChains, &listener.FilterChain{ FilterChainMatch: match, - TransportSocket: buildDownstreamTLSTransportSocket(chain.tlsContext), + TransportSocket: buildDownstreamTLSTransportSocket(chain.tlsContext, opts.proxy), }) } @@ -2548,11 +2549,21 @@ func appendListenerFilters(filters []*listener.ListenerFilter) []*listener.Liste } // nolint: interfacer -func buildDownstreamTLSTransportSocket(tlsContext *auth.DownstreamTlsContext) *core.TransportSocket { +func buildDownstreamTLSTransportSocket(tlsContext *auth.DownstreamTlsContext, proxy *model.Proxy) *core.TransportSocket { if tlsContext == nil { return nil } - return &core.TransportSocket{Name: util.EnvoyTLSSocketName, ConfigType: &core.TransportSocket_TypedConfig{TypedConfig: util.MessageToAny(tlsContext)}} + tc := util.MessageToAny(tlsContext) + // Moving from v2 tls <-> v3 tls seems to cause downtime: https://github.com/envoyproxy/envoy/issues/13864 + // Instead, we should tie this to the cluster version, which is fixed, so we do not switch at runtime during + // an in place upgrade of the control plane only + // However, there is an edge case: users with 1.6 proxy, connecting to 1.7.x control plane without this code, then updating + // to 1.7.y control plane with this code. This would cause this code to cause another downtime by downgrading from v3 to v2. + // To workaround this, we will have a flag to disable this behavior + if tc != nil && features.EnableTLSXDSDynamicTypes && proxy.RequestedTypes.LDS == xdsv2.ListenerType { + tc.TypeUrl = "type.googleapis.com/envoy.api.v2.auth.DownstreamTlsContext" + } + return &core.TransportSocket{Name: util.EnvoyTLSSocketName, ConfigType: &core.TransportSocket_TypedConfig{TypedConfig: tc}} } func isMatchAllFilterChain(fc *listener.FilterChain) bool { diff --git a/pilot/pkg/security/model/authentication.go b/pilot/pkg/security/model/authentication.go index 840f1f77efad..f58e33db2a6a 100644 --- a/pilot/pkg/security/model/authentication.go +++ b/pilot/pkg/security/model/authentication.go @@ -208,6 +208,10 @@ func ConstructSdsSecretConfig(name, requestedType string) *tls.SdsSecretConfig { InitialFetchTimeout: features.InitialFetchTimeout, }, } + // For these, we know they are always require and should be present, so do not timeout on fetch + if name == SDSDefaultResourceName || name == SDSRootResourceName { + cfg.SdsConfig.InitialFetchTimeout = ptypes.DurationProto(time.Second * 0) + } return cfg } diff --git a/releasenotes/notes/tls-version.yaml b/releasenotes/notes/tls-version.yaml new file mode 100644 index 000000000000..b9fd88579e82 --- /dev/null +++ b/releasenotes/notes/tls-version.yaml @@ -0,0 +1,11 @@ +apiVersion: release-notes/v2 +kind: bug-fix +area: networking +issue: +- 28120 +releaseNotes: +- | + **Fixed** an issue causing a short spike in errors during in place upgrades from Istio 1.6 to Istio 1.7. As a result of this + fix, users who already have Istio 1.7 deployed but still have proxies left on version 1.6 will see a similar spike during this + upgrade. It is highly recommended you either migrate all existing proxies to version 1.7 prior to this release. Alternatively, to + retain the previous behavior, you may set the `PILOT_ENABLE_TLS_XDS_DYNAMIC_TYPES=false` environment variable in Istiod.