Merge pull request #64838 from krzysied/scheduling_latency_metric_fix
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Adding summary metric for scheduling latency

**What this PR does / why we need it**:
Re-introduces the histogram metrics for backward compatibility.
Changes the SchedulingLatency metric to follow Prometheus best practices (a sketch of the resulting pattern follows the release note).
ref #64316

**Release note**:

```release-note
NONE
```
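
For context, the pattern this PR adopts is to feed the legacy microsecond histogram and the new seconds-based summary from the same timer. Below is a minimal, self-contained sketch of that pattern using client_golang; the package and variable names are illustrative, not the exact identifiers from the diff.

```go
// Package latencysketch illustrates the dual-recording pattern from this PR:
// a legacy microsecond histogram is kept for backward compatibility while a
// labelled summary reports the same latency in seconds, the Prometheus base unit.
package latencysketch

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	// Legacy histogram, still exposed in microseconds.
	bindingLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
		Subsystem: "scheduler",
		Name:      "binding_latency_microseconds",
		Help:      "Binding latency",
		Buckets:   prometheus.ExponentialBuckets(1000, 2, 15),
	})

	// New summary, labelled by operation and reported in seconds.
	schedulingLatency = prometheus.NewSummaryVec(prometheus.SummaryOpts{
		Subsystem: "scheduler",
		Name:      "scheduling_latency_seconds",
		Help:      "Scheduling latency in seconds split by sub-parts of the scheduling operation",
		MaxAge:    5 * time.Hour,
	}, []string{"operation"})
)

func init() {
	prometheus.MustRegister(bindingLatency, schedulingLatency)
}

// observeBinding records one binding operation from a single start timestamp
// into both metric families, as the scheduler does after a successful bind.
func observeBinding(start time.Time) {
	bindingLatency.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
	schedulingLatency.WithLabelValues("binding").Observe(time.Since(start).Seconds())
}
```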
Kubernetes Submit Queue committed Jun 15, 2018
2 parents 3abba25 + e32910a commit a6e61e7
Showing 3 changed files with 51 additions and 21 deletions.
42 changes: 37 additions & 5 deletions pkg/scheduler/metrics/metrics.go
@@ -27,16 +27,15 @@ const (
// SchedulerSubsystem - subsystem name used by scheduler
SchedulerSubsystem = "scheduler"
// SchedulingLatencyName - scheduler latency metric name
SchedulingLatencyName = "scheduling_latencies_summary"
SchedulingLatencyName = "scheduling_latency_seconds"

// OperationLabel - operation label name
OperationLabel = "operation"
// Binding - binding operation label value
Binding = "binding"
// SchedulingAlgorithm - scheduling algorithm operation label value
SchedulingAlgorithm = "scheduling_algorithm"
// SelectingNode - selecting node operation label value
SelectingNode = "selecting_node"
// E2eScheduling - e2e scheduling operation label value
E2eScheduling = "e2e_scheduling"
)

// All the histogram based metrics have 1ms as size for the smallest bucket.
@@ -45,13 +44,29 @@ var (
prometheus.SummaryOpts{
Subsystem: SchedulerSubsystem,
Name: SchedulingLatencyName,
Help: "Scheduling latency in microseconds split by sub-parts of the scheduling operation",
Help: "Scheduling latency in seconds split by sub-parts of the scheduling operation",
// Make the sliding window of 5h.
// TODO: The value for this should be based on some SLI definition (long term).
MaxAge: 5 * time.Hour,
},
[]string{OperationLabel},
)
E2eSchedulingLatency = prometheus.NewHistogram(
prometheus.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "e2e_scheduling_latency_microseconds",
Help: "E2e scheduling latency (scheduling algorithm + binding)",
Buckets: prometheus.ExponentialBuckets(1000, 2, 15),
},
)
SchedulingAlgorithmLatency = prometheus.NewHistogram(
prometheus.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "scheduling_algorithm_latency_microseconds",
Help: "Scheduling algorithm latency",
Buckets: prometheus.ExponentialBuckets(1000, 2, 15),
},
)
SchedulingAlgorithmPredicateEvaluationDuration = prometheus.NewHistogram(
prometheus.HistogramOpts{
Subsystem: SchedulerSubsystem,
@@ -76,6 +91,14 @@ var (
Buckets: prometheus.ExponentialBuckets(1000, 2, 15),
},
)
BindingLatency = prometheus.NewHistogram(
prometheus.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "binding_latency_microseconds",
Help: "Binding latency",
Buckets: prometheus.ExponentialBuckets(1000, 2, 15),
},
)
PreemptionVictims = prometheus.NewGauge(
prometheus.GaugeOpts{
Subsystem: SchedulerSubsystem,
@@ -90,6 +113,9 @@ var (
})
metricsList = []prometheus.Collector{
SchedulingLatency,
E2eSchedulingLatency,
SchedulingAlgorithmLatency,
BindingLatency,
SchedulingAlgorithmPredicateEvaluationDuration,
SchedulingAlgorithmPriorityEvaluationDuration,
SchedulingAlgorithmPremptionEvaluationDuration,
@@ -102,6 +128,7 @@ var registerMetrics sync.Once

// Register all metrics.
func Register() {
// Register the metrics.
registerMetrics.Do(func() {
for _, metric := range metricsList {
prometheus.MustRegister(metric)
@@ -118,3 +145,8 @@ func Reset() {
func SinceInMicroseconds(start time.Time) float64 {
return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
}

// SinceInSeconds gets the time since the specified start in seconds.
func SinceInSeconds(start time.Time) float64 {
return time.Since(start).Seconds()
}
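
As an aside, the re-introduced histograms keep the original bucket layout. A quick sketch, assuming the standard client_golang package, of what `ExponentialBuckets(1000, 2, 15)` expands to in microseconds:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// ExponentialBuckets(start, factor, count) returns `count` upper bounds,
	// each `factor` times the previous one. Starting at 1000µs (1ms) and
	// doubling across 15 buckets covers roughly 1ms up to ~16.4s.
	fmt.Println(prometheus.ExponentialBuckets(1000, 2, 15))
	// approx: [1000 2000 4000 8000 16000 32000 64000 128000 256000 512000
	//          1.024e+06 2.048e+06 4.096e+06 8.192e+06 1.6384e+07]
}
```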
8 changes: 5 additions & 3 deletions pkg/scheduler/scheduler.go
@@ -429,7 +429,8 @@ func (sched *Scheduler) bind(assumed *v1.Pod, b *v1.Binding) error {
return err
}

metrics.SchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInMicroseconds(bindingStart))
metrics.BindingLatency.Observe(metrics.SinceInMicroseconds(bindingStart))
metrics.SchedulingLatency.WithLabelValues(metrics.Binding).Observe(metrics.SinceInSeconds(bindingStart))
sched.config.Recorder.Eventf(assumed, v1.EventTypeNormal, "Scheduled", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, b.Target.Name)
return nil
}
@@ -461,7 +462,8 @@ func (sched *Scheduler) scheduleOne() {
}
return
}
metrics.SchedulingLatency.WithLabelValues(metrics.SchedulingAlgorithm).Observe(metrics.SinceInMicroseconds(start))
metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInMicroseconds(start))
metrics.SchedulingLatency.WithLabelValues(metrics.SelectingNode).Observe(metrics.SinceInSeconds(start))
// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
// This allows us to keep scheduling without waiting on binding to occur.
assumedPod := pod.DeepCopy()
@@ -496,7 +498,7 @@ func (sched *Scheduler) scheduleOne() {
Name: suggestedHost,
},
})
metrics.SchedulingLatency.WithLabelValues(metrics.E2eScheduling).Observe(metrics.SinceInMicroseconds(start))
metrics.E2eSchedulingLatency.Observe(metrics.SinceInMicroseconds(start))
if err != nil {
glog.Errorf("Internal error binding pod: (%v)", err)
}
22 changes: 9 additions & 13 deletions test/e2e/framework/metrics_util.go
@@ -210,13 +210,12 @@ func (l *PodStartupLatency) PrintJSON() string {
}

type SchedulingMetrics struct {
SchedulingLatency LatencyMetric `json:"schedulingLatency"`
BindingLatency LatencyMetric `json:"bindingLatency"`
E2ELatency LatencyMetric `json:"e2eLatency"`
ThroughputAverage float64 `json:"throughputAverage"`
ThroughputPerc50 float64 `json:"throughputPerc50"`
ThroughputPerc90 float64 `json:"throughputPerc90"`
ThroughputPerc99 float64 `json:"throughputPerc99"`
SelectingNodeLatency LatencyMetric `json:"selectingNodeLatency"`
BindingLatency LatencyMetric `json:"bindingLatency"`
ThroughputAverage float64 `json:"throughputAverage"`
ThroughputPerc50 float64 `json:"throughputPerc50"`
ThroughputPerc90 float64 `json:"throughputPerc90"`
ThroughputPerc99 float64 `json:"throughputPerc99"`
}

func (l *SchedulingMetrics) SummaryKind() string {
@@ -512,23 +511,20 @@ func getSchedulingLatency(c clientset.Interface) (*SchedulingMetrics, error) {

var metric *LatencyMetric = nil
switch sample.Metric[schedulermetric.OperationLabel] {
case schedulermetric.SchedulingAlgorithm:
metric = &result.SchedulingLatency
case schedulermetric.SelectingNode:
metric = &result.SelectingNodeLatency
case schedulermetric.Binding:
metric = &result.BindingLatency
case schedulermetric.E2eScheduling:
metric = &result.E2ELatency
}
if metric == nil {
continue
}

latency := sample.Value
quantile, err := strconv.ParseFloat(string(sample.Metric[model.QuantileLabel]), 64)
if err != nil {
return nil, err
}
setQuantile(metric, quantile, time.Duration(int64(latency)))
setQuantile(metric, quantile, time.Duration(int64(float64(sample.Value)*float64(time.Second))))
}
return &result, nil
}
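
For reference, the summary that getSchedulingLatency now parses is exposed under the new name with per-operation labels, and its quantile values are plain seconds; that is why the helper converts a sample with `float64(sample.Value) * float64(time.Second)` before building a `time.Duration`. An illustrative scrape (the numbers are invented for the example):

```
# HELP scheduler_scheduling_latency_seconds Scheduling latency in seconds split by sub-parts of the scheduling operation
# TYPE scheduler_scheduling_latency_seconds summary
scheduler_scheduling_latency_seconds{operation="selecting_node",quantile="0.5"} 0.00042
scheduler_scheduling_latency_seconds{operation="selecting_node",quantile="0.9"} 0.0011
scheduler_scheduling_latency_seconds{operation="selecting_node",quantile="0.99"} 0.0038
scheduler_scheduling_latency_seconds{operation="binding",quantile="0.5"} 0.0021
scheduler_scheduling_latency_seconds{operation="binding",quantile="0.99"} 0.0125
```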
