Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

node: topology-mgr: Add metric to measure topology manager admission latency #115590

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 6 additions & 3 deletions pkg/kubelet/cm/topologymanager/topology_manager.go
Expand Up @@ -18,6 +18,7 @@ package topologymanager

import (
"fmt"
"time"

cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/api/core/v1"
Expand Down Expand Up @@ -209,9 +210,11 @@ func (m *manager) RemoveContainer(containerID string) error {

func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
klog.InfoS("Topology Admit Handler")

metrics.TopologyManagerAdmissionRequestsTotal.Inc()
swatisehgal marked this conversation as resolved.
Show resolved Hide resolved
pod := attrs.Pod

return m.scope.Admit(pod)
startTime := time.Now()
podAdmitResult := m.scope.Admit(attrs.Pod)
metrics.TopologyManagerAdmissionDuration.Observe(float64(time.Since(startTime).Milliseconds()))

return podAdmitResult
}
13 changes: 13 additions & 0 deletions pkg/kubelet/metrics/metrics.go
Expand Up @@ -94,6 +94,7 @@ const (
// Metrics to track the Topology manager behavior
TopologyManagerAdmissionRequestsTotalKey = "topology_manager_admission_requests_total"
TopologyManagerAdmissionErrorsTotalKey = "topology_manager_admission_errors_total"
TopologyManagerAdmissionDurationKey = "topology_manager_admission_duration_ms"

// Values used in metric labels
Container = "container"
Expand Down Expand Up @@ -573,6 +574,17 @@ var (
StabilityLevel: metrics.ALPHA,
},
)

// TopologyManagerAdmissionDuration is a Histogram that tracks the duration (in milliseconds) to serve a pod admission request.
TopologyManagerAdmissionDuration = metrics.NewHistogram(
&metrics.HistogramOpts{
Subsystem: KubeletSubsystem,
Name: TopologyManagerAdmissionDurationKey,
Help: "Duration in milliseconds to serve a pod admission request.",
Buckets: metrics.ExponentialBuckets(.05, 2, 15),
StabilityLevel: metrics.ALPHA,
swatisehgal marked this conversation as resolved.
Show resolved Hide resolved
},
)
)

var registerMetrics sync.Once
Expand Down Expand Up @@ -626,6 +638,7 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionErrorsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionDuration)

for _, collector := range collectors {
legacyregistry.CustomMustRegister(collector)
Expand Down
21 changes: 20 additions & 1 deletion test/e2e_node/topology_manager_metrics_test.go
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"github.com/onsi/gomega/gstruct"
"github.com/onsi/gomega/types"

v1 "k8s.io/api/core/v1"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
Expand Down Expand Up @@ -85,6 +86,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
}),
"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
"": timelessSample(0),
}),
})

ginkgo.By("Giving the Kubelet time to start up and produce metrics")
Expand All @@ -108,6 +112,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(1),
}),
"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
"": checkMetricValueGreaterThan(0),
}),
})

ginkgo.By("Giving the Kubelet time to start up and produce metrics")
Expand All @@ -122,7 +129,7 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"

// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
// being [Serial], we can also assume no one else but us is running pods.
ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted")

matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
Expand All @@ -131,6 +138,9 @@ var _ = SIGDescribe("Topology Manager Metrics [Serial][Feature:TopologyManager]"
"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
"": timelessSample(0),
}),
"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
"": checkMetricValueGreaterThan(0),
}),
})

ginkgo.By("Giving the Kubelet time to start up and produce metrics")
Expand All @@ -157,3 +167,12 @@ func hostCheck() (int, int) {

return numaNodes, coreCount
}

// checkMetricValueGreaterThan returns a matcher for a pointer to a struct
// whose fields are exactly Metric, Value and Timestamp. The match succeeds
// when Value is numerically greater than value; Metric and Timestamp are
// accepted unconditionally.
func checkMetricValueGreaterThan(value interface{}) types.GomegaMatcher {
	fieldMatchers := gstruct.Fields{
		// Metric is not re-checked here: it is already verified when the
		// element is matched by its Id.
		"Metric":    gstruct.Ignore(),
		"Value":     gomega.BeNumerically(">", value),
		"Timestamp": gstruct.Ignore(),
	}
	structMatcher := gstruct.MatchAllFields(fieldMatchers)
	return gstruct.PointTo(structMatcher)
}