-
Notifications
You must be signed in to change notification settings - Fork 38.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix cluster IP allocator metrics #110027
Fix cluster IP allocator metrics #110027
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,6 +40,7 @@ type Interface interface { | |
IPFamily() api.IPFamily | ||
Has(ip net.IP) bool | ||
Destroy() | ||
EnableMetrics() | ||
|
||
// DryRun offers a way to try operations without persisting them. | ||
DryRun() Interface | ||
|
@@ -86,12 +87,12 @@ type Range struct { | |
family api.IPFamily | ||
|
||
alloc allocator.Interface | ||
// metrics is a metrics recorder that can be disabled | ||
metrics metricsRecorderInterface | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like this solution and seems aligned with the rest of the code base, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dgrisonnet (sig-instrumentation) I'd like your opinion too, the fact that metrics are global is 😬 , what do you think about this solution ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, it is not ideal to have metrics declared globally, but at the same time, most of the codebase/ecosystem is doing that and I don't think we had any issues in the past because of that. The reason is that to "enable" a metric, you have to register it into a registry so you always need an extra step in order for your metric to start being exposed. So I don't really think having them global is a problem as such. That said, the approach that you've taken is one of the potential ways to make metrics initialization cleaner, but I personally prefer the one taken here https://github.com/prometheus-operator/prometheus-operator/blob/main/pkg/operator/operator.go#L180-L272 that creates structures to group similar metrics together. You are then able to initialize and register metrics via a simple call to "New". We might have some precedence of that or something similar in the codebase, but out of my head, I can't think of any that I have seen. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that current proposal is good enough, maybe is a TODO for sig-instrumentation to work on standardize this ;) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, it is good enough, we haven't standardized anything for now, so the implementation is up to the component owners. That's a good point, I'll bring it to the group. |
||
} | ||
|
||
// New creates a Range over a net.IPNet, calling allocatorFactory to construct the backing store. | ||
func New(cidr *net.IPNet, allocatorFactory allocator.AllocatorWithOffsetFactory) (*Range, error) { | ||
registerMetrics() | ||
|
||
max := netutils.RangeSize(cidr) | ||
base := netutils.BigForIP(cidr.IP) | ||
rangeSpec := cidr.String() | ||
|
@@ -116,10 +117,11 @@ func New(cidr *net.IPNet, allocatorFactory allocator.AllocatorWithOffsetFactory) | |
max-- | ||
|
||
r := Range{ | ||
net: cidr, | ||
base: base, | ||
max: maximum(0, int(max)), | ||
family: family, | ||
net: cidr, | ||
base: base, | ||
max: maximum(0, int(max)), | ||
family: family, | ||
metrics: &emptyMetricsRecorder{}, // disabled by default | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. then, should we There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
} | ||
|
||
offset := 0 | ||
|
@@ -201,8 +203,10 @@ func (r *Range) allocate(ip net.IP, dryRun bool) error { | |
label := r.CIDR() | ||
ok, offset := r.contains(ip) | ||
if !ok { | ||
// update metrics | ||
clusterIPAllocationErrors.WithLabelValues(label.String(), "static").Inc() | ||
if !dryRun { | ||
// update metrics | ||
r.metrics.incrementAllocationErrors(label.String(), "static") | ||
} | ||
return &ErrNotInRange{ip, r.net.String()} | ||
} | ||
if dryRun { | ||
thockin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
@@ -214,20 +218,20 @@ func (r *Range) allocate(ip net.IP, dryRun bool) error { | |
allocated, err := r.alloc.Allocate(offset) | ||
if err != nil { | ||
// update metrics | ||
clusterIPAllocationErrors.WithLabelValues(label.String(), "static").Inc() | ||
r.metrics.incrementAllocationErrors(label.String(), "static") | ||
|
||
return err | ||
} | ||
if !allocated { | ||
// update metrics | ||
clusterIPAllocationErrors.WithLabelValues(label.String(), "static").Inc() | ||
r.metrics.incrementAllocationErrors(label.String(), "static") | ||
|
||
return ErrAllocated | ||
} | ||
// update metrics | ||
clusterIPAllocations.WithLabelValues(label.String(), "static").Inc() | ||
clusterIPAllocated.WithLabelValues(label.String()).Set(float64(r.Used())) | ||
clusterIPAvailable.WithLabelValues(label.String()).Set(float64(r.Free())) | ||
r.metrics.incrementAllocations(label.String(), "static") | ||
r.metrics.setAllocated(label.String(), r.Used()) | ||
r.metrics.setAvailable(label.String(), r.Free()) | ||
|
||
return nil | ||
} | ||
|
@@ -249,20 +253,20 @@ func (r *Range) allocateNext(dryRun bool) (net.IP, error) { | |
offset, ok, err := r.alloc.AllocateNext() | ||
if err != nil { | ||
// update metrics | ||
clusterIPAllocationErrors.WithLabelValues(label.String(), "dynamic").Inc() | ||
r.metrics.incrementAllocationErrors(label.String(), "dynamic") | ||
|
||
return nil, err | ||
} | ||
if !ok { | ||
// update metrics | ||
clusterIPAllocationErrors.WithLabelValues(label.String(), "dynamic").Inc() | ||
r.metrics.incrementAllocationErrors(label.String(), "dynamic") | ||
|
||
return nil, ErrFull | ||
} | ||
// update metrics | ||
clusterIPAllocations.WithLabelValues(label.String(), "dynamic").Inc() | ||
clusterIPAllocated.WithLabelValues(label.String()).Set(float64(r.Used())) | ||
clusterIPAvailable.WithLabelValues(label.String()).Set(float64(r.Free())) | ||
r.metrics.incrementAllocations(label.String(), "dynamic") | ||
r.metrics.setAllocated(label.String(), r.Used()) | ||
r.metrics.setAvailable(label.String(), r.Free()) | ||
|
||
return netutils.AddIPOffset(r.base, offset), nil | ||
} | ||
|
@@ -287,8 +291,8 @@ func (r *Range) release(ip net.IP, dryRun bool) error { | |
if err == nil { | ||
// update metrics | ||
label := r.CIDR() | ||
clusterIPAllocated.WithLabelValues(label.String()).Set(float64(r.Used())) | ||
clusterIPAvailable.WithLabelValues(label.String()).Set(float64(r.Free())) | ||
r.metrics.setAllocated(label.String(), r.Used()) | ||
r.metrics.setAvailable(label.String(), r.Free()) | ||
} | ||
return err | ||
} | ||
|
@@ -364,6 +368,12 @@ func (r *Range) Destroy() { | |
r.alloc.Destroy() | ||
} | ||
|
||
// EnableMetrics enables metrics recording. | ||
func (r *Range) EnableMetrics() { | ||
registerMetrics() | ||
r.metrics = &metricsRecorder{} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should the metrics There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
} | ||
|
||
// calculateIPOffset calculates the integer offset of ip from base such that | ||
// base + offset = ip. It requires ip >= base. | ||
func calculateIPOffset(base *big.Int, ip net.IP) int { | ||
|
@@ -436,3 +446,6 @@ func (dry dryRunRange) Has(ip net.IP) bool { | |
|
||
func (dry dryRunRange) Destroy() { | ||
} | ||
|
||
func (dry dryRunRange) EnableMetrics() { | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have to remember how this ends in the repair loops
kubernetes/pkg/controlplane/controller.go
Line 174 in 9d85e18
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
tracing
kubernetes/pkg/controlplane/instance.go
Lines 522 to 548 in 9d85e18