Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: additional metrics for nfd-master #1290

Merged
merged 4 commits into from
Aug 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/deployment/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,14 @@ The exposed metrics are
| ------------------------------------------------- | --------- | ---------------------------------------
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
| `nfd_node_update_requests_total` | Counter | Number of node update requests processed by the master
| `nfd_node_updates_total` | Counter | Number of nodes updated
| `nfd_node_update_failures_total` | Counter | Number of node update failures
| `nfd_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master
| `nfd_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master
| `nfd_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node

## Via Kustomize
Expand Down
47 changes: 42 additions & 5 deletions pkg/nfd-master/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,15 @@

// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
buildInfoQuery = "nfd_master_build_info"
nodeUpdatesQuery = "nfd_node_updates_total"
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
buildInfoQuery = "nfd_master_build_info"
nodeUpdateRequestsQuery = "nfd_node_update_requests_total"
nodeUpdatesQuery = "nfd_node_updates_total"
nodeUpdateFailuresQuery = "nfd_node_update_failures_total"
nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total"
nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total"
nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total"
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total"
)

var (
Expand All @@ -43,10 +49,30 @@
"version": version.Get(),
},
})
nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdateRequestsQuery,
Help: "Number of node update requests processed by the master.",
})
nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdatesQuery,
Help: "Number of nodes updated by the master.",
})
nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdateFailuresQuery,
Help: "Number of node update failures.",
})
nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeLabelsRejectedQuery,
Help: "Number of node labels that were rejected by nfd-master.",
})
nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeERsRejectedQuery,
Help: "Number of node extended resources that were rejected by nfd-master.",
})
nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeTaintsRejectedQuery,
Help: "Number of node taints that were rejected by nfd-master.",
})
nfrProcessingTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: nfrProcessingTimeQuery,
Expand All @@ -58,6 +84,10 @@
"node",
},
)
nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{
Name: nfrProcessingErrorsQuery,
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
})
)

// registerVersion exposes the Operator build version.
Expand All @@ -68,9 +98,16 @@
// runMetricsServer starts an HTTP server to expose metrics
func runMetricsServer(port int) {
r := prometheus.NewRegistry()
r.MustRegister(buildInfo,
r.MustRegister(
buildInfo,
nodeUpdateRequests,

Check warning on line 103 in pkg/nfd-master/metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/metrics.go#L101-L103

Added lines #L101 - L103 were not covered by tests
nodeUpdates,
nfrProcessingTime)
nodeUpdateFailures,
nodeLabelsRejected,
nodeERsRejected,
nodeTaintsRejected,
nfrProcessingTime,
nfrProcessingErrors)

Check warning on line 110 in pkg/nfd-master/metrics.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/metrics.go#L105-L110

Added lines #L105 - L110 were not covered by tests

mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
Expand Down
8 changes: 8 additions & 0 deletions pkg/nfd-master/nfd-master.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,7 @@
// Prune labels and extended resources
err := m.updateNodeObject(cli, node.Name, Labels{}, Annotations{}, ExtendedResources{}, []corev1.Taint{})
if err != nil {
nodeUpdateFailures.Inc()

Check warning on line 455 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L455

Added line #L455 was not covered by tests
return fmt.Errorf("failed to prune node %q: %v", node.Name, err)
}

Expand Down Expand Up @@ -509,6 +510,7 @@

if value, err := m.filterFeatureLabel(name, value, features); err != nil {
klog.ErrorS(err, "ignoring label", "labelKey", name, "labelValue", value)
nodeLabelsRejected.Inc()
} else {
outLabels[name] = value
}
Expand All @@ -522,6 +524,7 @@
if value, ok := outLabels[extendedResourceName]; ok {
if _, err := strconv.Atoi(value); err != nil {
klog.ErrorS(err, "bad label value encountered for extended resource", "labelKey", extendedResourceName, "labelValue", value)
nodeERsRejected.Inc()

Check warning on line 527 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L527

Added line #L527 was not covered by tests
continue // non-numeric label can't be used
}

Expand Down Expand Up @@ -602,6 +605,7 @@
for _, taint := range taints {
if err := filterTaint(&taint); err != nil {
klog.ErrorS(err, "ignoring taint", "taint", taint)
nodeTaintsRejected.Inc()

Check warning on line 608 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L608

Added line #L608 was not covered by tests
} else {
outTaints = append(outTaints, taint)
}
Expand Down Expand Up @@ -650,6 +654,7 @@

// SetLabels implements LabelerServer
func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.SetLabelsReply, error) {
nodeUpdateRequests.Inc()
err := authorizeClient(c, m.args.VerifyNodeName, r.NodeName)
if err != nil {
klog.ErrorS(err, "gRPC client authorization failed", "nodeName", r.NodeName)
Expand All @@ -675,6 +680,7 @@

// Create labels et al
if err := m.refreshNodeFeatures(cli, r.NodeName, annotations, r.GetLabels(), r.GetFeatures()); err != nil {
nodeUpdateFailures.Inc()

Check warning on line 683 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L683

Added line #L683 was not covered by tests
return &pb.SetLabelsReply{}, err
}
}
Expand Down Expand Up @@ -784,6 +790,7 @@
capacity, err := filterExtendedResource(name, value, features)
if err != nil {
klog.ErrorS(err, "failed to create extended resources", "extendedResourceName", name, "extendedResourceValue", value)
nodeERsRejected.Inc()

Check warning on line 793 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L793

Added line #L793 was not covered by tests
} else {
outExtendedResources[name] = capacity
}
Expand Down Expand Up @@ -989,6 +996,7 @@
ruleOut, err := rule.Execute(features)
if err != nil {
klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName)
nfrProcessingErrors.Inc()

Check warning on line 999 in pkg/nfd-master/nfd-master.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/nfd-master.go#L999

Added line #L999 was not covered by tests
continue
}
taints = append(taints, ruleOut.Taints...)
Expand Down
2 changes: 2 additions & 0 deletions pkg/nfd-master/node-updater-pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@

defer queue.Done(nodeName)

nodeUpdateRequests.Inc()

Check warning on line 49 in pkg/nfd-master/node-updater-pool.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/node-updater-pool.go#L49

Added line #L49 was not covered by tests
if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil {
if queue.NumRequeues(nodeName) < 5 {
klog.InfoS("retrying node update", "nodeName", nodeName)
queue.AddRateLimited(nodeName)
return true
} else {
klog.ErrorS(err, "failed to update node", "nodeName", nodeName)
nodeUpdateFailures.Inc()

Check warning on line 57 in pkg/nfd-master/node-updater-pool.go

View check run for this annotation

Codecov / codecov/patch

pkg/nfd-master/node-updater-pool.go#L57

Added line #L57 was not covered by tests
}
}
queue.Forget(nodeName)
Expand Down