Skip to content
This repository has been archived by the owner on Apr 25, 2023. It is now read-only.

Introduce standard controller runtime metrics #1506

Merged
merged 1 commit into from
Jul 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
31 changes: 30 additions & 1 deletion pkg/controller/util/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import (
"k8s.io/client-go/util/flowcontrol"
"k8s.io/client-go/util/workqueue"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"

"sigs.k8s.io/kubefed/pkg/metrics"
)

type ReconcileFunc func(qualifiedName QualifiedName) ReconciliationStatus
Expand Down Expand Up @@ -128,6 +130,8 @@ func (w *asyncWorker) EnqueueWithDelay(qualifiedName QualifiedName, delay time.D
}

func (w *asyncWorker) Run(stopChan <-chan struct{}) {
w.initMetrics()

StartBackoffGC(w.backoff, stopChan)
w.deliverer.StartWithHandler(func(item *DelayingDelivererItem) {
qualifiedName, ok := item.Value.(*QualifiedName)
Expand Down Expand Up @@ -183,16 +187,41 @@ func (w *asyncWorker) reconcileOnce() bool {
return true
}

metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(1)
defer metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(-1)
defer metrics.UpdateControllerRuntimeReconcileTimeFromStart(w.name, time.Now())

status := w.reconcile(qualifiedName)
switch status {
case StatusAllOK:
break
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelSuccess).Inc()
case StatusError:
w.EnqueueForError(qualifiedName)
metrics.ControllerRuntimeReconcileErrors.WithLabelValues(w.name).Inc()
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelError).Inc()
case StatusNeedsRecheck:
w.EnqueueForRetry(qualifiedName)
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNeedsRecheck).Inc()
case StatusNotSynced:
w.EnqueueForClusterSync(qualifiedName)
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNotSynced).Inc()
}
return true
}

const (
labelSuccess = "success"
labelError = "error"
labelNeedsRecheck = "needs_recheck"
labelNotSynced = "not_synced"
)

func (w *asyncWorker) initMetrics() {
metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Set(0)
metrics.ControllerRuntimeReconcileErrors.WithLabelValues(w.name).Add(0)
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelSuccess).Add(0)
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelError).Add(0)
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNeedsRecheck).Add(0)
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNotSynced).Add(0)
metrics.ControllerRuntimeWorkerCount.WithLabelValues(w.name).Set(float64(w.maxConcurrentReconciles))
}
59 changes: 53 additions & 6 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)
Expand Down Expand Up @@ -58,7 +59,7 @@ var (
reconcileFederatedResourcesDuration = prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "reconcile_federated_resources_duration_seconds",
Help: "Time taken to reconcile federated resources in the target clusters.",
Help: "[Deprecated] Time taken to reconcile federated resources in the target clusters. Replaced by controller_runtime_reconcile_time_seconds.",
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0},
},
)
Expand Down Expand Up @@ -90,18 +91,45 @@ var (
controllerRuntimeReconcileDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "controller_runtime_reconcile_duration_seconds",
Help: "Time taken by various parts of Kubefed controllers reconciliation loops.",
Help: "[Deprecated] Time taken by various parts of Kubefed controllers reconciliation loops. Replaced by controller_runtime_reconcile_time_seconds.",
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0},
}, []string{"controller"},
)

controllerRuntimeReconcileDurationSummary = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "controller_runtime_reconcile_quantile_seconds",
Help: "Quantiles of time taken by various parts of Kubefed controllers reconciliation loops.",
Help: "[Deprecated] Quantiles of time taken by various parts of Kubefed controllers reconciliation loops. Replaced by controller_runtime_reconcile_time_seconds.",
MaxAge: time.Hour,
}, []string{"controller"},
)

ControllerRuntimeReconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_total",
Help: "Total number of reconciliations per controller",
}, []string{"controller", "result"})

ControllerRuntimeReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_errors_total",
Help: "Total number of reconciliation errors per controller",
}, []string{"controller"})

ControllerRuntimeReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "controller_runtime_reconcile_time_seconds",
Help: "Length of time per reconciliation per controller",
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60},
}, []string{"controller"})

ControllerRuntimeWorkerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "controller_runtime_max_concurrent_reconciles",
Help: "Maximum number of concurrent reconciles per controller",
}, []string{"controller"})

ControllerRuntimeActiveWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "controller_runtime_active_workers",
Help: "Number of currently used workers per controller",
}, []string{"controller"})
)

const (
Expand All @@ -117,6 +145,10 @@ const (
// RegisterAll registers all metrics.
func RegisterAll() {
metrics.Registry.MustRegister(
// expose process metrics like CPU, Memory, file descriptor usage etc.
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
// expose Go runtime metrics like GC stats, memory stats etc.
collectors.NewGoCollector(),
kubefedClusterTotal,
joinedClusterTotal,
reconcileFederatedResourcesDuration,
Expand All @@ -127,6 +159,11 @@ func RegisterAll() {
dispatchOperationDuration,
controllerRuntimeReconcileDuration,
controllerRuntimeReconcileDurationSummary,
ControllerRuntimeReconcileTotal,
ControllerRuntimeReconcileErrors,
ControllerRuntimeReconcileTime,
ControllerRuntimeWorkerCount,
ControllerRuntimeActiveWorkers,
)
}

Expand Down Expand Up @@ -203,10 +240,20 @@ func UpdateControllerReconcileDurationFromStart(controller string, start time.Ti

// UpdateControllerReconcileDuration records the duration of the reconcile function of a controller
func UpdateControllerReconcileDuration(controller string, duration time.Duration) {
controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds())
controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds())
}

// UpdateControllerRuntimeReconcileTimeFromStart records the duration of the reconcile loop of a controller
func UpdateControllerRuntimeReconcileTimeFromStart(controller string, start time.Time) {
duration := time.Since(start)
UpdateControllerRuntimeReconcileTime(controller, duration)
}

// UpdateControllerRuntimeReconcileTime records the duration of the reconcile function of a controller
func UpdateControllerRuntimeReconcileTime(controller string, duration time.Duration) {
if duration > LogReconcileLongDurationThreshold {
klog.V(4).Infof("Reconcile loop %s took %v to complete", controller, duration)
}

controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds())
controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds())
ControllerRuntimeReconcileTime.WithLabelValues(controller).Observe(duration.Seconds())
}