feat: add performance metrics for FederatedHPA #3972

Merged (1 commit) on Aug 26, 2023

8 changes: 7 additions & 1 deletion pkg/controllers/federatedhpa/federatedhpa_controller.go
@@ -34,6 +34,7 @@ import (
     policyv1alpha1 "github.com/karmada-io/karmada/pkg/apis/policy/v1alpha1"
     workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
     "github.com/karmada-io/karmada/pkg/controllers/federatedhpa/monitor"
+    "github.com/karmada-io/karmada/pkg/metrics"
     "github.com/karmada-io/karmada/pkg/sharedcli/ratelimiterflag"
     "github.com/karmada-io/karmada/pkg/util"
     "github.com/karmada-io/karmada/pkg/util/fedinformer/typedmanager"
@@ -153,7 +154,12 @@ func (c *FederatedHPAController) Reconcile(ctx context.Context, req controllerru
     }
     c.hpaSelectorsMux.Unlock()
 
-    err := c.reconcileAutoscaler(ctx, hpa)
+    // observe process FederatedHPA latency
+    var err error
+    startTime := time.Now()
+    defer metrics.ObserveProcessFederatedHPALatency(err, startTime)
+
+    err = c.reconcileAutoscaler(ctx, hpa)
     if err != nil {
         return controllerruntime.Result{}, err
     }
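
Note on the instrumentation pattern above (the same deferred observation appears in the metrics client below): Go evaluates a deferred call's arguments at the point of the defer statement, so the err passed to metrics.ObserveProcessFederatedHPALatency is whatever err holds on that line, while the duration is still measured up to function return. The following self-contained sketch only illustrates that language behavior and a closure-based variant that reads the final err; observe, withArgs, and withClosure are illustrative stand-ins, not Karmada code.

    package main

    import (
        "errors"
        "fmt"
        "time"
    )

    // observe stands in for a latency-recording helper such as ObserveProcessFederatedHPALatency.
    func observe(err error, start time.Time) {
        result := "success"
        if err != nil {
            result = "error"
        }
        fmt.Printf("result=%s duration=%s\n", result, time.Since(start))
    }

    func withArgs() error {
        var err error
        start := time.Now()
        defer observe(err, start) // err is evaluated here, while it is still nil
        err = errors.New("reconcile failed")
        return err
    }

    func withClosure() error {
        var err error
        start := time.Now()
        defer func() { observe(err, start) }() // the closure reads err at return time
        err = errors.New("reconcile failed")
        return err
    }

    func main() {
        _ = withArgs()    // prints result=success
        _ = withClosure() // prints result=error
    }
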
23 changes: 22 additions & 1 deletion pkg/controllers/federatedhpa/metrics/client.go
@@ -32,6 +32,8 @@ import (
     resourceclient "k8s.io/metrics/pkg/client/clientset/versioned/typed/metrics/v1beta1"
     customclient "k8s.io/metrics/pkg/client/custom_metrics"
     externalclient "k8s.io/metrics/pkg/client/external_metrics"
+
+    "github.com/karmada-io/karmada/pkg/metrics"
 )
 
 const (
@@ -64,6 +66,11 @@ type resourceMetricsClient struct {
 // GetResourceMetric gets the given resource metric (and an associated oldest timestamp)
 // for all pods matching the specified selector in the given namespace
 func (c *resourceMetricsClient) GetResourceMetric(ctx context.Context, resource corev1.ResourceName, namespace string, selector labels.Selector, container string) (PodMetricsInfo, time.Time, error) {
+    // observe pull ResourceMetric latency
+    var err error
+    startTime := time.Now()
+    defer metrics.ObserveFederatedHPAPullMetricsLatency(err, "ResourceMetric", startTime)
+
     metrics, err := c.client.PodMetricses(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector.String()})
     if err != nil {
         return nil, time.Time{}, fmt.Errorf("unable to fetch metrics from resource metrics API: %v", err)
@@ -143,6 +150,11 @@ type customMetricsClient struct {
 // GetRawMetric gets the given metric (and an associated oldest timestamp)
 // for all pods matching the specified selector in the given namespace
 func (c *customMetricsClient) GetRawMetric(metricName string, namespace string, selector labels.Selector, metricSelector labels.Selector) (PodMetricsInfo, time.Time, error) {
+    // observe pull RawMetric latency
+    var err error
+    startTime := time.Now()
+    defer metrics.ObserveFederatedHPAPullMetricsLatency(err, "RawMetric", startTime)
+
     metrics, err := c.client.NamespacedMetrics(namespace).GetForObjects(schema.GroupKind{Kind: "Pod"}, selector, metricName, metricSelector)
     if err != nil {
         return nil, time.Time{}, fmt.Errorf("unable to fetch metrics from custom metrics API: %v", err)
@@ -175,9 +187,13 @@ func (c *customMetricsClient) GetRawMetric(metricName string, namespace string,
 // GetObjectMetric gets the given metric (and an associated timestamp) for the given
 // object in the given namespace
 func (c *customMetricsClient) GetObjectMetric(metricName string, namespace string, objectRef *autoscalingv2.CrossVersionObjectReference, metricSelector labels.Selector) (int64, time.Time, error) {
+    // observe pull ObjectMetric latency
+    var err error
+    startTime := time.Now()
+    defer metrics.ObserveFederatedHPAPullMetricsLatency(err, "ObjectMetric", startTime)
+
     gvk := schema.FromAPIVersionAndKind(objectRef.APIVersion, objectRef.Kind)
     var metricValue *customapi.MetricValue
-    var err error
     if gvk.Kind == "Namespace" && gvk.Group == "" {
         // handle namespace separately
         // NB: we ignore namespace name here, since CrossVersionObjectReference isn't
@@ -203,6 +219,11 @@ type externalMetricsClient struct {
 // GetExternalMetric gets all the values of a given external metric
 // that match the specified selector.
 func (c *externalMetricsClient) GetExternalMetric(metricName, namespace string, selector labels.Selector) ([]int64, time.Time, error) {
+    // observe pull ExternalMetric latency
+    var err error
+    startTime := time.Now()
+    defer metrics.ObserveFederatedHPAPullMetricsLatency(err, "ExternalMetric", startTime)
+
     metrics, err := c.client.NamespacedMetrics(namespace).List(metricName, selector)
     if err != nil {
         return []int64{}, time.Time{}, fmt.Errorf("unable to fetch metrics from external metrics API: %v", err)
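
The four call sites above pass "ResourceMetric", "RawMetric", "ObjectMetric", and "ExternalMetric" as the metric type, and the helper added in pkg/metrics/resource.go below turns the error into a "success" or "error" result label. A minimal, self-contained sketch of how those two labels shape the resulting federatedhpa_pull_metrics_duration_seconds series; recordPull and pullHist are illustrative stand-ins, not the Karmada helper itself.

    package main

    import (
        "errors"
        "time"

        "github.com/prometheus/client_golang/prometheus"
    )

    // pullHist mirrors the shape of federatedhpa_pull_metrics_duration_seconds:
    // a histogram vector labelled by result and metricType, with buckets
    // spanning 0.01s up to 20.48s (0.01 * 2^11).
    var pullHist = prometheus.NewHistogramVec(prometheus.HistogramOpts{
        Name:    "federatedhpa_pull_metrics_duration_seconds",
        Help:    "Sketch of the FederatedHPA pull-metrics histogram.",
        Buckets: prometheus.ExponentialBuckets(0.01, 2, 12),
    }, []string{"result", "metricType"})

    // recordPull is an illustrative stand-in for ObserveFederatedHPAPullMetricsLatency.
    func recordPull(err error, metricType string, start time.Time) {
        result := "success"
        if err != nil {
            result = "error"
        }
        pullHist.WithLabelValues(result, metricType).Observe(time.Since(start).Seconds())
    }

    func main() {
        start := time.Now()
        // Produces federatedhpa_pull_metrics_duration_seconds{metricType="ExternalMetric",result="success"}.
        recordPull(nil, "ExternalMetric", start)
        // Produces federatedhpa_pull_metrics_duration_seconds{metricType="RawMetric",result="error"}.
        recordPull(errors.New("metrics API unavailable"), "RawMetric", start)
    }
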
42 changes: 34 additions & 8 deletions pkg/metrics/resource.go
@@ -9,14 +9,16 @@ import (
 )
 
 const (
-    resourceMatchPolicyDurationMetricsName  = "resource_match_policy_duration_seconds"
-    resourceApplyPolicyDurationMetricsName  = "resource_apply_policy_duration_seconds"
-    policyApplyAttemptsMetricsName          = "policy_apply_attempts_total"
-    syncWorkDurationMetricsName             = "binding_sync_work_duration_seconds"
-    syncWorkloadDurationMetricsName         = "work_sync_workload_duration_seconds"
-    policyPreemptionMetricsName             = "policy_preemption_total"
-    cronFederatedHPADurationMetricsName     = "cronfederatedhpa_process_duration_seconds"
-    cronFederatedHPARuleDurationMetricsName = "cronfederatedhpa_rule_process_duration_seconds"
+    resourceMatchPolicyDurationMetricsName      = "resource_match_policy_duration_seconds"
+    resourceApplyPolicyDurationMetricsName      = "resource_apply_policy_duration_seconds"
+    policyApplyAttemptsMetricsName              = "policy_apply_attempts_total"
+    syncWorkDurationMetricsName                 = "binding_sync_work_duration_seconds"
+    syncWorkloadDurationMetricsName             = "work_sync_workload_duration_seconds"
+    policyPreemptionMetricsName                 = "policy_preemption_total"
+    cronFederatedHPADurationMetricsName         = "cronfederatedhpa_process_duration_seconds"
+    cronFederatedHPARuleDurationMetricsName     = "cronfederatedhpa_rule_process_duration_seconds"
+    federatedHPADurationMetricsName             = "federatedhpa_process_duration_seconds"
+    federatedHPAPullMetricsDurationMetricsName  = "federatedhpa_pull_metrics_duration_seconds"
 )
 
 var (
@@ -65,6 +67,18 @@ var (
         Help:    "Duration in seconds to process a CronFederatedHPA rule. By the result, 'error' means a CronFederatedHPA rule failed to be processed. Otherwise 'success'.",
         Buckets: prometheus.ExponentialBuckets(0.001, 2, 12),
     }, []string{"result"})
+
+    federatedHPADurationHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+        Name:    federatedHPADurationMetricsName,
+        Help:    "Duration in seconds to process a FederatedHPA. By the result, 'error' means a FederatedHPA failed to be processed. Otherwise 'success'.",
+        Buckets: prometheus.ExponentialBuckets(0.01, 2, 12),
+    }, []string{"result"})
+
+    federatedHPAPullMetricsDurationHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+        Name:    federatedHPAPullMetricsDurationMetricsName,
+        Help:    "Duration in seconds taken by the FederatedHPA to pull metrics. By the result, 'error' means the FederatedHPA failed to pull the metrics. Otherwise 'success'.",
+        Buckets: prometheus.ExponentialBuckets(0.01, 2, 12),
+    }, []string{"result", "metricType"})
 )
 
 // ObserveFindMatchedPolicyLatency records the duration for the resource finding a matched policy.
@@ -103,6 +117,16 @@ func ObserveProcessCronFederatedHPARuleLatency(err error, start time.Time) {
     cronFederatedHPARuleDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
 }
 
+// ObserveProcessFederatedHPALatency records the duration to process a FederatedHPA.
+func ObserveProcessFederatedHPALatency(err error, start time.Time) {
+    federatedHPADurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err)).Observe(utilmetrics.DurationInSeconds(start))
+}
+
+// ObserveFederatedHPAPullMetricsLatency records the duration it takes for the FederatedHPA to pull metrics.
+func ObserveFederatedHPAPullMetricsLatency(err error, metricType string, start time.Time) {
+    federatedHPAPullMetricsDurationHistogram.WithLabelValues(utilmetrics.GetResultByError(err), metricType).Observe(utilmetrics.DurationInSeconds(start))
+}
+
 // ResourceCollectors returns the collectors about resources.
 func ResourceCollectors() []prometheus.Collector {
     return []prometheus.Collector{
@@ -114,6 +138,8 @@ func ResourceCollectors() []prometheus.Collector {
         policyPreemptionCounter,
         cronFederatedHPADurationHistogram,
         cronFederatedHPARuleDurationHistogram,
+        federatedHPADurationHistogram,
+        federatedHPAPullMetricsDurationHistogram,
     }
 }

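
The two new histograms are exported through ResourceCollectors() together with the existing resource metrics. A minimal sketch of registering and exposing them with the standard client_golang registry and promhttp handler; the address and wiring here are illustrative, since Karmada configures its own metrics endpoint elsewhere in the codebase.

    package main

    import (
        "net/http"

        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/client_golang/prometheus/promhttp"

        "github.com/karmada-io/karmada/pkg/metrics"
    )

    func main() {
        // ResourceCollectors now also returns federatedHPADurationHistogram and
        // federatedHPAPullMetricsDurationHistogram, so both new metrics are exposed
        // once the returned slice is registered.
        registry := prometheus.NewRegistry()
        registry.MustRegister(metrics.ResourceCollectors()...)

        // Serve the registry, e.g. to scrape federatedhpa_process_duration_seconds_bucket.
        // The port is arbitrary for this sketch.
        http.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))
        _ = http.ListenAndServe(":8080", nil)
    }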