Skip to content

Commit

Permalink
Do not switch TLS version on 1.6 -> 1.7 upgrade (#28498)
Browse files Browse the repository at this point in the history
* Do not switch TLS version on 1.6 -> 1.7 upgrade

See #28120
See envoyproxy/envoy#13864

This resolves a downtime event on in place upgrade from 1.6 to 1.7. This
is a couple seconds of 503s.

This is intentionally sent only to 1.7 as it is only relevant for this
branch.

Please note this feature flag is shipped by on by default. We have two
choices:

* Off by default. Anyone upgrading from 1.6 to 1.7 will continue to get
downtime unless they read the release notes and add the flag.
* On by default. Anyone with 1.7 already deployed, but that still has
1.6 proxies will encur a downtime unless they read the release notes and
remove the flag.

I have chosen on by default, as the set of people with 1.6 proxies with
1.7.x Istiod upgrading to 1.7.5 seems far smaller than the impacted set
of "off by default", and the mitigation is the same. Additionally, for
those that are impacted, the impact will be exclusively the proxies on
1.6, which is presumably not 100% of proxies, whereas in the other case
ALL proxies are 1.6 and thus impacted.

* fix nil

* Fix initial fetch
  • Loading branch information
howardjohn committed Nov 2, 2020
1 parent aa021c6 commit 0166aec
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 4 deletions.
6 changes: 6 additions & 0 deletions pilot/pkg/features/pilot.go
Expand Up @@ -331,4 +331,10 @@ var (

EnableTLSv2OnInboundPath = env.RegisterBoolVar("PILOT_SIDECAR_ENABLE_INBOUND_TLS_V2", false,
"If true, Pilot will set the TLS version on server side as TLSv1_2 and also enforce strong cipher suites").Get()

EnableTLSXDSDynamicTypes = env.RegisterBoolVar("PILOT_ENABLE_TLS_XDS_DYNAMIC_TYPES", true,
"This flag controls the how internal version of XDS is chosen. This applies only to Istio 1.7. "+
"When this flag is enabled, an upgrade of the Istio control plane from 1.6 -> 1.7 should result in no downtime for 1.6 proxies. "+
"However, for users who have already upgraded to 1.7.x without this option, to 1.7.y with this version, while still maintaining old 1.6 proxies, "+
"this will result in downtime and should be set to false.").Get()
)
13 changes: 12 additions & 1 deletion pilot/pkg/networking/core/v1alpha3/cluster.go
Expand Up @@ -42,6 +42,7 @@ import (
authn_model "istio.io/istio/pilot/pkg/security/model"
"istio.io/istio/pilot/pkg/serviceregistry"
"istio.io/istio/pilot/pkg/util/sets"
xdsv2 "istio.io/istio/pilot/pkg/xds/v2"
"istio.io/istio/pkg/config/constants"
"istio.io/istio/pkg/config/host"
"istio.io/istio/pkg/config/protocol"
Expand Down Expand Up @@ -907,10 +908,20 @@ func applyUpstreamTLSSettings(opts *buildClusterOpts, tls *networking.ClientTLSS
return
}

tc := util.MessageToAny(tlsContext)
// Moving from v2 tls <-> v3 tls seems to cause downtime: https://github.com/envoyproxy/envoy/issues/13864
// Instead, we should tie this to the cluster version, which is fixed, so we do not switch at runtime during
// an in place upgrade of the control plane only
// However, there is an edge case: users with 1.6 proxy, connecting to 1.7.x control plane without this code, then updating
// to 1.7.y control plane with this code. This would cause this code to cause another downtime by downgrading from v3 to v2.
// To workaround this, we will have a flag to disable this behavior
if tc != nil && features.EnableTLSXDSDynamicTypes && node.RequestedTypes.CDS == xdsv2.ClusterType {
tc.TypeUrl = "type.googleapis.com/envoy.api.v2.auth.UpstreamTlsContext"
}
if tlsContext != nil {
c.TransportSocket = &core.TransportSocket{
Name: util.EnvoyTLSSocketName,
ConfigType: &core.TransportSocket_TypedConfig{TypedConfig: util.MessageToAny(tlsContext)},
ConfigType: &core.TransportSocket_TypedConfig{TypedConfig: tc},
}
}

Expand Down
17 changes: 14 additions & 3 deletions pilot/pkg/networking/core/v1alpha3/listener.go
Expand Up @@ -43,6 +43,7 @@ import (
structpb "github.com/golang/protobuf/ptypes/struct"
"github.com/golang/protobuf/ptypes/wrappers"

xdsv2 "istio.io/istio/pilot/pkg/xds/v2"
"istio.io/istio/pkg/util/protomarshal"

meshconfig "istio.io/api/mesh/v1alpha1"
Expand Down Expand Up @@ -2160,7 +2161,7 @@ func buildListener(opts buildListenerOpts) *listener.Listener {
}
filterChains = append(filterChains, &listener.FilterChain{
FilterChainMatch: match,
TransportSocket: buildDownstreamTLSTransportSocket(chain.tlsContext),
TransportSocket: buildDownstreamTLSTransportSocket(chain.tlsContext, opts.proxy),
})
}

Expand Down Expand Up @@ -2548,11 +2549,21 @@ func appendListenerFilters(filters []*listener.ListenerFilter) []*listener.Liste
}

// nolint: interfacer
func buildDownstreamTLSTransportSocket(tlsContext *auth.DownstreamTlsContext) *core.TransportSocket {
func buildDownstreamTLSTransportSocket(tlsContext *auth.DownstreamTlsContext, proxy *model.Proxy) *core.TransportSocket {
if tlsContext == nil {
return nil
}
return &core.TransportSocket{Name: util.EnvoyTLSSocketName, ConfigType: &core.TransportSocket_TypedConfig{TypedConfig: util.MessageToAny(tlsContext)}}
tc := util.MessageToAny(tlsContext)
// Moving from v2 tls <-> v3 tls seems to cause downtime: https://github.com/envoyproxy/envoy/issues/13864
// Instead, we should tie this to the cluster version, which is fixed, so we do not switch at runtime during
// an in place upgrade of the control plane only
// However, there is an edge case: users with 1.6 proxy, connecting to 1.7.x control plane without this code, then updating
// to 1.7.y control plane with this code. This would cause this code to cause another downtime by downgrading from v3 to v2.
// To workaround this, we will have a flag to disable this behavior
if tc != nil && features.EnableTLSXDSDynamicTypes && proxy.RequestedTypes.LDS == xdsv2.ListenerType {
tc.TypeUrl = "type.googleapis.com/envoy.api.v2.auth.DownstreamTlsContext"
}
return &core.TransportSocket{Name: util.EnvoyTLSSocketName, ConfigType: &core.TransportSocket_TypedConfig{TypedConfig: tc}}
}

func isMatchAllFilterChain(fc *listener.FilterChain) bool {
Expand Down
4 changes: 4 additions & 0 deletions pilot/pkg/security/model/authentication.go
Expand Up @@ -208,6 +208,10 @@ func ConstructSdsSecretConfig(name, requestedType string) *tls.SdsSecretConfig {
InitialFetchTimeout: features.InitialFetchTimeout,
},
}
// For these, we know they are always require and should be present, so do not timeout on fetch
if name == SDSDefaultResourceName || name == SDSRootResourceName {
cfg.SdsConfig.InitialFetchTimeout = ptypes.DurationProto(time.Second * 0)
}

return cfg
}
Expand Down
11 changes: 11 additions & 0 deletions releasenotes/notes/tls-version.yaml
@@ -0,0 +1,11 @@
apiVersion: release-notes/v2
kind: bug-fix
area: networking
issue:
- 28120
releaseNotes:
- |
**Fixed** an issue causing a short spike in errors during in place upgrades from Istio 1.6 to Istio 1.7. As a result of this
fix, users who already have Istio 1.7 deployed but still have proxies left on version 1.6 will see a similar spike during this
upgrade. It is highly recommended you either migrate all existing proxies to version 1.7 prior to this release. Alternatively, to
retain the previous behavior, you may set the `PILOT_ENABLE_TLS_XDS_DYNAMIC_TYPES=false` environment variable in Istiod.

0 comments on commit 0166aec

Please sign in to comment.