refactor LookupCluster #44468

Merged (4 commits) on Apr 25, 2023
60 changes: 0 additions & 60 deletions pilot/pkg/extensionproviders/cluster_lookup.go

This file was deleted.

33 changes: 33 additions & 0 deletions pilot/pkg/model/monitoring.go
@@ -0,0 +1,33 @@
// Copyright Istio Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package model

import "istio.io/pkg/monitoring"

func init() {
monitoring.MustRegister(
providerLookupClusterFailures,
)
}
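Note: typeTag is not part of this diff; it is an existing monitoring label elsewhere in the model package, presumably created along the lines of the following line (an assumption, shown only so the snippet reads as self-contained):

var typeTag = monitoring.MustCreateLabel("type") // hypothetical reconstruction, not from the PR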

var providerLookupClusterFailures = monitoring.NewSum(
	"provider_lookup_cluster_failures",
	"Number of times a cluster lookup failed",
	monitoring.WithLabels(typeTag),
)

func IncLookupClusterFailures(provider string) {
	providerLookupClusterFailures.With(typeTag.Value(provider)).Increment()
}

Review thread on providerLookupClusterFailures:

Member: I think a Gauge is better; we could record which cluster lookup failed rather than always increasing the number.

zirain (Member, Author, Apr 23, 2023): When would we decrease the count?

Member: Please refer to the EnvoyFilter patching metric; the sum here is meaningless, it could grow unboundedly large.

zirain (Member, Author): For a counter, users should care about rate or increase. If you are talking about pilot_envoy_filter_status, it's disabled by default.

zirain (Member, Author): cc @ramaraochavali, do you know why it's disabled?

Contributor: It is disabled because of possible performance impact. People who want it will only enable it in dev environments.

zirain (Member, Author): After taking a look at istio.io/pkg/monitoring, it's impossible to reset all metrics to 0 when services or telemetries change. cc @hzxuzhonghu
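To make the Gauge-versus-Counter discussion above concrete, here is a minimal sketch, not part of the PR, of what the Gauge alternative could look like with istio.io/pkg/monitoring; the name recordLookupClusterFailure and the reset concern described in the comments are assumptions drawn from the thread:

// Sketch only: a hypothetical Gauge variant of the metric discussed above.
// A Gauge reports a point-in-time value, so stale per-provider entries would
// need to be reset to 0 once lookups succeed again; the author notes that
// istio.io/pkg/monitoring offers no way to reset every label combination.
var providerLookupClusterFailureGauge = monitoring.NewGauge(
	"provider_lookup_cluster_failures",
	"Clusters that failed lookup during the last push",
	monitoring.WithLabels(typeTag),
)

func recordLookupClusterFailure(provider string) {
	providerLookupClusterFailureGauge.With(typeTag.Value(provider)).RecordInt(1)
}

With the Sum that the PR keeps, consumers are expected to query the rate or increase of provider_lookup_cluster_failures rather than its absolute value, which is why an ever-growing total is acceptable.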
4 changes: 3 additions & 1 deletion pilot/pkg/model/telemetry_logging.go
@@ -155,6 +155,7 @@ func tcpGrpcAccessLogFromTelemetry(push *PushContext, prov *meshconfig.MeshConfi

hostname, cluster, err := clusterLookupFn(push, prov.Service, int(prov.Port))
if err != nil {
IncLookupClusterFailures("envoyTCPAls")
log.Errorf("could not find cluster for tcp grpc provider %q: %v", prov, err)
return nil
}
@@ -306,6 +307,7 @@ func httpGrpcAccessLogFromTelemetry(push *PushContext, prov *meshconfig.MeshConf

hostname, cluster, err := clusterLookupFn(push, prov.Service, int(prov.Port))
if err != nil {
IncLookupClusterFailures("envoyHTTPAls")
log.Errorf("could not find cluster for http grpc provider %q: %v", prov, err)
return nil
}
@@ -407,6 +409,7 @@ func openTelemetryLog(pushCtx *PushContext,
) *accesslog.AccessLog {
hostname, cluster, err := clusterLookupFn(pushCtx, provider.Service, int(provider.Port))
if err != nil {
IncLookupClusterFailures("envoyOtelAls")
log.Errorf("could not find cluster for open telemetry provider %q: %v", provider, err)
return nil
}
@@ -487,7 +490,6 @@ func ConvertStructToAttributeKeyValues(labels map[string]*structpb.Value) []*otl
return attrList
}

// FIXME: this is a copy of extensionproviders.LookupCluster to avoid import cycle
func LookupCluster(push *PushContext, service string, port int) (hostname string, cluster string, err error) {
if service == "" {
err = fmt.Errorf("service must not be empty")
8 changes: 6 additions & 2 deletions pilot/pkg/networking/core/v1alpha3/tracing.go
@@ -32,7 +32,6 @@ import (

meshconfig "istio.io/api/mesh/v1alpha1"
telemetrypb "istio.io/api/telemetry/v1alpha1"
"istio.io/istio/pilot/pkg/extensionproviders"
"istio.io/istio/pilot/pkg/features"
"istio.io/istio/pilot/pkg/model"
"istio.io/istio/pilot/pkg/networking"
@@ -56,7 +55,7 @@ const (
)

// this is used for testing. it should not be changed in regular code.
var clusterLookupFn = extensionproviders.LookupCluster
var clusterLookupFn = model.LookupCluster

type typedConfigGenFn func() (*anypb.Any, error)

@@ -153,6 +152,7 @@ func configureFromProviderConfig(pushCtx *model.PushContext, proxy *model.Proxy,
providerConfig = func() (*anypb.Any, error) {
hostname, cluster, err := clusterLookupFn(pushCtx, provider.Zipkin.GetService(), int(provider.Zipkin.GetPort()))
if err != nil {
model.IncLookupClusterFailures("zipkin")
return nil, fmt.Errorf("could not find cluster for tracing provider %q: %v", provider, err)
}
return zipkinConfig(hostname, cluster, !provider.Zipkin.GetEnable_64BitTraceId())
@@ -163,6 +163,7 @@ func configureFromProviderConfig(pushCtx *model.PushContext, proxy *model.Proxy,
providerConfig = func() (*anypb.Any, error) {
hostname, cluster, err := clusterLookupFn(pushCtx, provider.Datadog.GetService(), int(provider.Datadog.GetPort()))
if err != nil {
model.IncLookupClusterFailures("datadog")
return nil, fmt.Errorf("could not find cluster for tracing provider %q: %v", provider, err)
}
return datadogConfig(serviceCluster, hostname, cluster)
@@ -175,6 +176,7 @@ func configureFromProviderConfig(pushCtx *model.PushContext, proxy *model.Proxy,
providerConfig = func() (*anypb.Any, error) {
hostname, clusterName, err := clusterLookupFn(pushCtx, provider.Lightstep.GetService(), int(provider.Lightstep.GetPort()))
if err != nil {
model.IncLookupClusterFailures("lightstep")
return nil, fmt.Errorf("could not find cluster for tracing provider %q: %v", provider, err)
}
// TODO: read raw metadata and retrieve lightstep extensions (instead of relying on version)
@@ -197,6 +199,7 @@ func configureFromProviderConfig(pushCtx *model.PushContext, proxy *model.Proxy,
providerConfig = func() (*anypb.Any, error) {
hostname, clusterName, err := clusterLookupFn(pushCtx, provider.Skywalking.GetService(), int(provider.Skywalking.GetPort()))
if err != nil {
model.IncLookupClusterFailures("skywalking")
return nil, fmt.Errorf("could not find cluster for tracing provider %q: %v", provider, err)
}
return skywalkingConfig(clusterName, hostname)
@@ -217,6 +220,7 @@ func configureFromProviderConfig(pushCtx *model.PushContext, proxy *model.Proxy,
providerConfig = func() (*anypb.Any, error) {
hostname, clusterName, err := clusterLookupFn(pushCtx, provider.Opentelemetry.GetService(), int(provider.Opentelemetry.GetPort()))
if err != nil {
model.IncLookupClusterFailures("opentelemetry")
return nil, fmt.Errorf("could not find cluster for tracing provider %q: %v", provider, err)
}
return otelConfig(serviceCluster, hostname, clusterName)
3 changes: 1 addition & 2 deletions pilot/pkg/networking/core/v1alpha3/tracing_test.go
@@ -28,7 +28,6 @@ import (

meshconfig "istio.io/api/mesh/v1alpha1"
tpb "istio.io/api/telemetry/v1alpha1"
"istio.io/istio/pilot/pkg/extensionproviders"
"istio.io/istio/pilot/pkg/model"
"istio.io/istio/pilot/pkg/networking"
"istio.io/istio/pilot/pkg/util/protoconv"
@@ -44,7 +43,7 @@ func TestConfigureTracing(t *testing.T) {
return authority, clusterName, nil
}
defer func() {
clusterLookupFn = extensionproviders.LookupCluster
clusterLookupFn = model.LookupCluster
}()

defaultUUIDExtensionCtx := requestidextension.UUIDRequestIDExtensionContext{
4 changes: 2 additions & 2 deletions pilot/pkg/security/authn/v1beta1/policy_applier.go
@@ -30,7 +30,6 @@ import (
authn_filter "istio.io/api/envoy/config/filter/http/authn/v2alpha1"
meshconfig "istio.io/api/mesh/v1alpha1"
"istio.io/api/security/v1beta1"
"istio.io/istio/pilot/pkg/extensionproviders"
"istio.io/istio/pilot/pkg/features"
"istio.io/istio/pilot/pkg/model"
"istio.io/istio/pilot/pkg/networking"
@@ -256,7 +255,7 @@ func convertToEnvoyJwtConfig(jwtRules []*v1beta1.JWTRule, push *model.PushContex
if err != nil {
authnLog.Errorf("Failed to parse jwt rule jwks uri %v", err)
}
_, cluster, err := extensionproviders.LookupCluster(push, jwksInfo.Hostname.String(), jwksInfo.Port)
_, cluster, err := model.LookupCluster(push, jwksInfo.Hostname.String(), jwksInfo.Port)
authnLog.Debugf("Look up cluster result: %v", cluster)

if err == nil && len(cluster) > 0 {
@@ -276,6 +275,7 @@ func convertToEnvoyJwtConfig(jwtRules []*v1beta1.JWTRule, push *model.PushContex
} else if features.JwksFetchMode == jwt.Hybrid {
provider.JwksSourceSpecifier = push.JwtKeyResolver.BuildLocalJwks(jwtRule.JwksUri, jwtRule.Issuer, "")
} else {
model.IncLookupClusterFailures("jwks")
// Log error and create remote JWKs with fake cluster
authnLog.Errorf("Failed to look up Envoy cluster %v. "+
"Please create ServiceEntry to register external JWKs server or "+
6 changes: 3 additions & 3 deletions pilot/pkg/security/authz/builder/extauthz.go
@@ -33,7 +33,6 @@ import (
"google.golang.org/protobuf/types/known/durationpb"

meshconfig "istio.io/api/mesh/v1alpha1"
"istio.io/istio/pilot/pkg/extensionproviders"
"istio.io/istio/pilot/pkg/model"
authzmodel "istio.io/istio/pilot/pkg/security/authz/model"
"istio.io/istio/pkg/config/validation"
@@ -153,8 +152,9 @@ func buildExtAuthzHTTP(push *model.PushContext,
if err != nil {
errs = multierror.Append(errs, err)
}
hostname, cluster, err := extensionproviders.LookupCluster(push, config.Service, port)
hostname, cluster, err := model.LookupCluster(push, config.Service, port)
if err != nil {
model.IncLookupClusterFailures("authz")
errs = multierror.Append(errs, err)
}
status, err := parseStatusOnError(config.StatusOnError)
@@ -197,7 +197,7 @@ func buildExtAuthzGRPC(push *model.PushContext,
if err != nil {
errs = multierror.Append(errs, err)
}
_, cluster, err := extensionproviders.LookupCluster(push, config.Service, port)
_, cluster, err := model.LookupCluster(push, config.Service, port)
if err != nil {
errs = multierror.Append(errs, err)
}
6 changes: 6 additions & 0 deletions releasenotes/notes/44468.yaml
@@ -0,0 +1,6 @@
apiVersion: release-notes/v2
kind: feature
area: telemetry
releaseNotes:
- |
  **Added** a new metric named `provider_lookup_cluster_failures` to track cluster lookup failures.
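For illustration only (not from the PR), a scrape of istiod's Prometheus endpoint would then expose one counter per provider type; the label value and count below are made up:

provider_lookup_cluster_failures{type="zipkin"} 2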