Skip to content

Commit

Permalink
ipam: when a CiliumNode is removed, delete node label from metrics.
Browse files Browse the repository at this point in the history
[ upstream commit 5b7b3bb ]

Affects:

* operator_ipam_available_ips
* operator_ipam_used_ips
* operator_ipam_needed_ips

Which have the label "target_name", previously when a Node was deleted
the metric continued to be emitted by the Prometheus exporter, leading
to confusing sum() values across a cluster.

Fixes changes in cilium#24776

Signed-off-by: Tom Hadlaw <tom.hadlaw@isovalent.com>
Signed-off-by: Gilberto Bertin <jibi@cilium.io>
  • Loading branch information
tommyp1ckles authored and jibi committed Sep 7, 2023
1 parent 73db393 commit 4422bae
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 2 deletions.
10 changes: 10 additions & 0 deletions pkg/ipam/metrics/metrics.go
Expand Up @@ -264,6 +264,15 @@ func (p *prometheusMetrics) SetIPNeeded(node string, usage int) {
p.NeededIPs.WithLabelValues(node).Set(float64(usage))
}

// DeleteNode removes all per-node metrics for a particular node (i.e. those labeled with "target_node").
// This is to ensure that when a Node/CiliumNode delete event happens that the operator will no longer report
// metrics for that node.
func (p *prometheusMetrics) DeleteNode(node string) {
p.AvailableIPs.DeleteLabelValues(node)
p.UsedIPs.DeleteLabelValues(node)
p.NeededIPs.DeleteLabelValues(node)
}

type triggerMetrics struct {
total prometheus.Counter
folds prometheus.Gauge
Expand Down Expand Up @@ -349,6 +358,7 @@ func (m *NoOpMetrics) SetIPNeeded(node string, n int)
func (m *NoOpMetrics) PoolMaintainerTrigger() trigger.MetricsObserver { return &NoOpMetricsObserver{} }
func (m *NoOpMetrics) K8sSyncTrigger() trigger.MetricsObserver { return &NoOpMetricsObserver{} }
func (m *NoOpMetrics) ResyncTrigger() trigger.MetricsObserver { return &NoOpMetricsObserver{} }
func (m *NoOpMetrics) DeleteNode(n string) {}

func merge(slices ...[]float64) []float64 {
result := make([]float64, 1)
Expand Down
24 changes: 24 additions & 0 deletions pkg/ipam/metrics/mock/mock.go
Expand Up @@ -197,6 +197,30 @@ func (m *mockMetrics) SetIPNeeded(s string, n int) {
m.mutex.Unlock()
}

func (m *mockMetrics) GetPerNodeMetrics(n string) (*int, *int, *int) {
m.mutex.Lock()
defer m.mutex.Unlock()
var avail, used, needed *int
if c, ok := m.nodeIPAvailable[n]; ok {
avail = &c
}
if c, ok := m.nodeIPUsed[n]; ok {
used = &c
}
if c, ok := m.nodeIPNeeded[n]; ok {
needed = &c
}
return avail, used, needed
}

func (m *mockMetrics) DeleteNode(n string) {
m.mutex.Lock()
defer m.mutex.Unlock()
delete(m.nodeIPAvailable, n)
delete(m.nodeIPUsed, n)
delete(m.nodeIPNeeded, n)
}

func (m *mockMetrics) PoolMaintainerTrigger() trigger.MetricsObserver {
return nil
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/ipam/node_manager.go
Expand Up @@ -151,6 +151,7 @@ type MetricsNodeAPI interface {
SetIPAvailable(node string, cap int)
SetIPUsed(node string, used int)
SetIPNeeded(node string, needed int)
DeleteNode(node string)
}

// nodeMap is a mapping of node names to ENI nodes
Expand Down Expand Up @@ -374,6 +375,9 @@ func (n *NodeManager) Delete(resource *v2.CiliumNode) {
n.mutex.Lock()

if node, ok := n.nodes[resource.Name]; ok {
// Stop target_node metrics related to this node being emitted.
n.metricsAPI.DeleteNode(node.name)

if node.poolMaintainer != nil {
node.poolMaintainer.Shutdown()
}
Expand Down
32 changes: 30 additions & 2 deletions pkg/ipam/node_manager_test.go
Expand Up @@ -204,8 +204,6 @@ func (e *IPAMSuite) TestNodeManagerGet(c *check.C) {
c.Assert(err, check.IsNil)
c.Assert(mngr, check.Not(check.IsNil))

// instances.Resync(context.TODO())

node1 := newCiliumNode("node1", 0, 0, 0)
mngr.Upsert(node1)

Expand All @@ -217,6 +215,36 @@ func (e *IPAMSuite) TestNodeManagerGet(c *check.C) {
c.Assert(mngr.Get("node2"), check.IsNil)
}

func (e *IPAMSuite) TestNodeManagerDelete(c *check.C) {
am := newAllocationImplementationMock()
c.Assert(am, check.Not(check.IsNil))
metrics := metricsmock.NewMockMetrics()
mngr, err := NewNodeManager(am, k8sapi, metrics, 10, false, false)
c.Assert(err, check.IsNil)
c.Assert(mngr, check.Not(check.IsNil))

node1 := newCiliumNode("node-foo", 0, 0, 0)
mngr.Upsert(node1)

c.Assert(mngr.Get("node-foo"), check.Not(check.IsNil))
c.Assert(mngr.Get("node2"), check.IsNil)

mngr.Resync(context.Background(), time.Now())
avail, used, needed := metrics.GetPerNodeMetrics("node-foo")
c.Assert(avail, check.Not(check.IsNil))
c.Assert(used, check.Not(check.IsNil))
c.Assert(needed, check.Not(check.IsNil))
mngr.Delete(node1)
// Following a node Delete, we expect the per-node metrics for that Node to be
// deleted.
avail, used, needed = metrics.GetPerNodeMetrics("node-foo")
c.Assert(avail, check.IsNil)
c.Assert(used, check.IsNil)
c.Assert(needed, check.IsNil)
c.Assert(mngr.Get("node-foo"), check.IsNil)
c.Assert(mngr.Get("node2"), check.IsNil)
}

type k8sMock struct{}

func (k *k8sMock) Update(origNode, orig *v2.CiliumNode) (*v2.CiliumNode, error) {
Expand Down

0 comments on commit 4422bae

Please sign in to comment.