Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make some scheduler metrics stable #105941

Merged
merged 1 commit into from
Nov 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 5 additions & 5 deletions pkg/scheduler/internal/queue/scheduling_queue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1450,7 +1450,7 @@ func TestPendingPodsMetric(t *testing.T) {
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 30
scheduler_pending_pods{queue="backoff"} 0
Expand All @@ -1471,7 +1471,7 @@ scheduler_pending_pods{queue="unschedulable"} 20
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 15
scheduler_pending_pods{queue="backoff"} 25
Expand All @@ -1492,7 +1492,7 @@ scheduler_pending_pods{queue="unschedulable"} 10
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 50
scheduler_pending_pods{queue="backoff"} 0
Expand All @@ -1515,7 +1515,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 30
scheduler_pending_pods{queue="backoff"} 20
Expand All @@ -1538,7 +1538,7 @@ scheduler_pending_pods{queue="unschedulable"} 0
},
metricsName: "scheduler_pending_pods",
wants: `
# HELP scheduler_pending_pods [ALPHA] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# HELP scheduler_pending_pods [STABLE] Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.
# TYPE scheduler_pending_pods gauge
scheduler_pending_pods{queue="active"} 50
scheduler_pending_pods{queue="backoff"} 0
Expand Down
24 changes: 17 additions & 7 deletions pkg/scheduler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,25 @@ var (
Subsystem: SchedulerSubsystem,
Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
}, []string{"result", "profile"})

e2eSchedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds",
DeprecatedVersion: "1.23.0",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding). This metric is replaced by scheduling_attempt_duration_seconds.",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
}, []string{"result", "profile"})
schedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_duration_seconds",
Help: "E2e scheduling latency in seconds (scheduling algorithm + binding)",
Name: "scheduling_attempt_duration_seconds",
Help: "Scheduling attempt latency in seconds (scheduling algorithm + binding)",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
}, []string{"result", "profile"})
SchedulingAlgorithmLatency = metrics.NewHistogram(
&metrics.HistogramOpts{
Expand All @@ -71,21 +80,21 @@ var (
Help: "Number of selected preemption victims",
// we think #victims>50 is pretty rare, therefore [50, +Inf) is considered a single bucket.
Buckets: metrics.LinearBuckets(5, 5, 10),
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
})
PreemptionAttempts = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_attempts_total",
Help: "Total preemption attempts in the cluster till now",
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
})
pendingPods = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "pending_pods",
Help: "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulableQ.",
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.STABLE,
}, []string{"queue"})
SchedulerGoroutines = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Expand Down Expand Up @@ -167,6 +176,7 @@ var (
metricsList = []metrics.Registerable{
scheduleAttempts,
e2eSchedulingLatency,
schedulingLatency,
SchedulingAlgorithmLatency,
PreemptionVictims,
PreemptionAttempts,
Expand Down
1 change: 1 addition & 0 deletions pkg/scheduler/metrics/profile_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,6 @@ func PodScheduleError(profile string, duration float64) {

func observeScheduleAttemptAndLatency(result, profile string, duration float64) {
e2eSchedulingLatency.WithLabelValues(result, profile).Observe(duration)
schedulingLatency.WithLabelValues(result, profile).Observe(duration)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not super comfortable with setting a metric to stable immediately in the same PR in which it is introduced. Can we bake this a little?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine with me. FYI @vantuvt

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@logicalhan, in case it wasn't clear, this is just a "rename" by duplication + deprecation of the old metric.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@logicalhan Also wanted to point out that this kind of renaming + making stable has been done before in #99785.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also don't think we should go through another round of Alpha stability when adjusting a metric before it graduates to stable. As a matter of fact, Kubernetes resources can also be modified during the graduation process to GA.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok ok...

scheduleAttempts.WithLabelValues(result, profile).Inc()
}
63 changes: 63 additions & 0 deletions test/instrumentation/testdata/stable-metrics-list.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,66 @@
- name: pending_pods
subsystem: scheduler
help: Number of pending pods, by the queue type. 'active' means number of pods in
activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number
of pods in unschedulableQ.
type: Gauge
stabilityLevel: STABLE
labels:
- queue
- name: preemption_attempts_total
subsystem: scheduler
help: Total preemption attempts in the cluster till now
type: Counter
stabilityLevel: STABLE
- name: preemption_victims
subsystem: scheduler
help: Number of selected preemption victims
type: Histogram
stabilityLevel: STABLE
buckets:
- 5
- 10
- 15
- 20
- 25
- 30
- 35
- 40
- 45
- 50
- name: schedule_attempts_total
subsystem: scheduler
help: Number of attempts to schedule pods, by the result. 'unschedulable' means
a pod could not be scheduled, while 'error' means an internal scheduler problem.
type: Counter
stabilityLevel: STABLE
labels:
- profile
- result
- name: scheduling_attempt_duration_seconds
subsystem: scheduler
help: Scheduling attempt latency in seconds (scheduling algorithm + binding)
type: Histogram
stabilityLevel: STABLE
labels:
- profile
- result
buckets:
- 0.001
- 0.002
- 0.004
- 0.008
- 0.016
- 0.032
- 0.064
- 0.128
- 0.256
- 0.512
- 1.024
- 2.048
- 4.096
- 8.192
- 16.384
- name: apiserver_request_duration_seconds
help: Response latency distribution in seconds for each verb, dry run value, group,
version, resource, subresource, scope and component.
Expand Down