Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

making some apiserver metrics stable #106122

Merged
merged 1 commit into from
Nov 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
103 changes: 58 additions & 45 deletions staging/src/k8s.io/apiserver/pkg/admission/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package metrics

import (
"context"
"fmt"
"strconv"
"time"

Expand All @@ -45,8 +44,6 @@ const (
)

var (
// Use buckets ranging from 5 ms to 2.5 seconds (admission webhooks timeout at 30 seconds by default).
latencyBuckets = []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5}
latencySummaryMaxAge = 5 * time.Hour

// Metrics provides access to all admission metrics.
Expand Down Expand Up @@ -126,19 +123,68 @@ type AdmissionMetrics struct {
func newAdmissionMetrics() *AdmissionMetrics {
// Admission metrics for a step of the admission flow. The entire admission flow is broken down into a series of steps
// Each step is identified by a distinct type label value.
step := newMetricSet("step",
[]string{"type", "operation", "rejected"},
"Admission sub-step %s, broken out for each operation and API resource and step type (validate or admit).", true)
// Use buckets ranging from 5 ms to 2.5 seconds.
step := &metricSet{
latencies: metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "step_admission_duration_seconds",
Help: "Admission sub-step latency histogram in seconds, broken out for each operation and API resource and step type (validate or admit).",
Buckets: []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5},
StabilityLevel: metrics.STABLE,
},
[]string{"type", "operation", "rejected"},
),

latenciesSummary: metrics.NewSummaryVec(
&metrics.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "step_admission_duration_seconds_summary",
Help: "Admission sub-step latency summary in seconds, broken out for each operation and API resource and step type (validate or admit).",
MaxAge: latencySummaryMaxAge,
StabilityLevel: metrics.ALPHA,
},
[]string{"type", "operation", "rejected"},
),
}

// Built-in admission controller metrics. Each admission controller is identified by name.
controller := newMetricSet("controller",
[]string{"name", "type", "operation", "rejected"},
"Admission controller %s, identified by name and broken out for each operation and API resource and type (validate or admit).", false)
// Use buckets ranging from 5 ms to 2.5 seconds.
controller := &metricSet{
latencies: metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "controller_admission_duration_seconds",
Help: "Admission controller latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).",
Buckets: []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5},
StabilityLevel: metrics.STABLE,
},
[]string{"name", "type", "operation", "rejected"},
),

latenciesSummary: nil,
}

// Admission webhook metrics. Each webhook is identified by name.
webhook := newMetricSet("webhook",
[]string{"name", "type", "operation", "rejected"},
"Admission webhook %s, identified by name and broken out for each operation and API resource and type (validate or admit).", false)
// Use buckets ranging from 5 ms to 2.5 seconds (admission webhooks timeout at 30 seconds by default).
webhook := &metricSet{
latencies: metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "webhook_admission_duration_seconds",
Help: "Admission webhook latency histogram in seconds, identified by name and broken out for each operation and API resource and type (validate or admit).",
Buckets: []float64{0.005, 0.025, 0.1, 0.5, 1.0, 2.5},
StabilityLevel: metrics.STABLE,
},
[]string{"name", "type", "operation", "rejected"},
),

latenciesSummary: nil,
}

webhookRejection := metrics.NewCounterVec(
&metrics.CounterOpts{
Expand Down Expand Up @@ -209,39 +255,6 @@ type metricSet struct {
latenciesSummary *metrics.SummaryVec
}

func newMetricSet(name string, labels []string, helpTemplate string, hasSummary bool) *metricSet {
var summary *metrics.SummaryVec
if hasSummary {
summary = metrics.NewSummaryVec(
&metrics.SummaryOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: fmt.Sprintf("%s_admission_duration_seconds_summary", name),
Help: fmt.Sprintf(helpTemplate, "latency summary in seconds"),
MaxAge: latencySummaryMaxAge,
StabilityLevel: metrics.ALPHA,
},
labels,
)
}

return &metricSet{
latencies: metrics.NewHistogramVec(
&metrics.HistogramOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: fmt.Sprintf("%s_admission_duration_seconds", name),
Help: fmt.Sprintf(helpTemplate, "latency histogram in seconds"),
Buckets: latencyBuckets,
StabilityLevel: metrics.ALPHA,
},
labels,
),

latenciesSummary: summary,
}
}

// MustRegister registers all the prometheus metrics in the metricSet.
func (m *metricSet) mustRegister() {
legacyregistry.MustRegister(m.latencies)
Expand Down
4 changes: 2 additions & 2 deletions staging/src/k8s.io/apiserver/pkg/endpoints/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ var (
Help: "Response size distribution in bytes for each group, version, verb, resource, subresource, scope and component.",
// Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB).
Buckets: compbasemetrics.ExponentialBuckets(1000, 10.0, 7),
StabilityLevel: compbasemetrics.ALPHA,
StabilityLevel: compbasemetrics.STABLE,
},
[]string{"verb", "group", "version", "resource", "subresource", "scope", "component"},
)
Expand Down Expand Up @@ -169,7 +169,7 @@ var (
&compbasemetrics.GaugeOpts{
Name: "apiserver_current_inflight_requests",
Help: "Maximal number of currently used inflight request limit of this apiserver per request kind in last second.",
StabilityLevel: compbasemetrics.ALPHA,
StabilityLevel: compbasemetrics.STABLE,
},
[]string{"request_kind"},
)
Expand Down
84 changes: 84 additions & 0 deletions test/instrumentation/testdata/stable-metrics-list.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,69 @@
- 4.096
- 8.192
- 16.384
- name: controller_admission_duration_seconds
subsystem: admission
namespace: apiserver
help: Admission controller latency histogram in seconds, identified by name and
broken out for each operation and API resource and type (validate or admit).
type: Histogram
stabilityLevel: STABLE
labels:
- name
- operation
- rejected
- type
buckets:
- 0.005
- 0.025
- 0.1
- 0.5
- 1
- 2.5
- name: step_admission_duration_seconds
subsystem: admission
namespace: apiserver
help: Admission sub-step latency histogram in seconds, broken out for each operation
and API resource and step type (validate or admit).
type: Histogram
stabilityLevel: STABLE
labels:
- operation
- rejected
- type
buckets:
- 0.005
- 0.025
- 0.1
- 0.5
- 1
- 2.5
- name: webhook_admission_duration_seconds
subsystem: admission
namespace: apiserver
help: Admission webhook latency histogram in seconds, identified by name and broken
out for each operation and API resource and type (validate or admit).
type: Histogram
stabilityLevel: STABLE
labels:
- name
- operation
- rejected
- type
buckets:
- 0.005
- 0.025
- 0.1
- 0.5
- 1
- 2.5
- name: apiserver_current_inflight_requests
help: Maximal number of currently used inflight request limit of this apiserver
per request kind in last second.
type: Gauge
stabilityLevel: STABLE
labels:
- request_kind
- name: apiserver_request_duration_seconds
help: Response latency distribution in seconds for each verb, dry run value, group,
version, resource, subresource, scope and component.
Expand Down Expand Up @@ -139,6 +202,27 @@
- resource
- subresource
- version
- name: apiserver_response_sizes
help: Response size distribution in bytes for each group, version, verb, resource,
subresource, scope and component.
type: Histogram
stabilityLevel: STABLE
labels:
- component
- group
- resource
- scope
- subresource
- verb
- version
buckets:
- 1000
- 10000
- 100000
- 1e+06
- 1e+07
- 1e+08
- 1e+09
- name: apiserver_storage_objects
help: Number of stored objects at the time of last check split by kind.
type: Gauge
Expand Down