From a2ef00c1b1ebc646958a0e8ce3e4650f751eebd0 Mon Sep 17 00:00:00 2001 From: hui luo Date: Thu, 8 Aug 2019 20:43:25 -0700 Subject: [PATCH] Add iptables restore failure metrics As mentioned in issue #80061, in the iptables lock contention case, we can see an increasing rate of iptables restore failures because it needs to grab the iptables file lock. The failure metric can provide administrators with more insight. Metrics will be collected in kube-proxy iptables and ipvs modes. Signed-off-by: Hui Luo --- pkg/proxy/iptables/proxier.go | 3 +++ pkg/proxy/ipvs/proxier.go | 1 + pkg/proxy/metrics/metrics.go | 11 +++++++++++ 3 files changed, 15 insertions(+) diff --git a/pkg/proxy/iptables/proxier.go b/pkg/proxy/iptables/proxier.go index 7f2abc42ec6e..5a2424609324 100644 --- a/pkg/proxy/iptables/proxier.go +++ b/pkg/proxy/iptables/proxier.go @@ -429,6 +429,7 @@ func CleanupLeftovers(ipt utiliptables.Interface) (encounteredError bool) { err = ipt.Restore(utiliptables.TableNAT, natLines, utiliptables.NoFlushTables, utiliptables.RestoreCounters) if err != nil { klog.Errorf("Failed to execute iptables-restore for %s: %v", utiliptables.TableNAT, err) + metrics.IptablesRestoreFailuresTotal.Inc() encounteredError = true } } @@ -455,6 +456,7 @@ func CleanupLeftovers(ipt utiliptables.Interface) (encounteredError bool) { // Write it. if err := ipt.Restore(utiliptables.TableFilter, filterLines, utiliptables.NoFlushTables, utiliptables.RestoreCounters); err != nil { klog.Errorf("Failed to execute iptables-restore for %s: %v", utiliptables.TableFilter, err) + metrics.IptablesRestoreFailuresTotal.Inc() encounteredError = true } } @@ -1401,6 +1403,7 @@ func (proxier *Proxier) syncProxyRules() { err = proxier.iptables.RestoreAll(proxier.iptablesData.Bytes(), utiliptables.NoFlushTables, utiliptables.RestoreCounters) if err != nil { klog.Errorf("Failed to execute iptables-restore: %v", err) + metrics.IptablesRestoreFailuresTotal.Inc() // Revert new local ports. 
klog.V(2).Infof("Closing local ports after iptables-restore failure") utilproxy.RevertPorts(replacementPortsMap, proxier.portsMap) diff --git a/pkg/proxy/ipvs/proxier.go b/pkg/proxy/ipvs/proxier.go index ba215ca18f70..503bf3b30d8f 100644 --- a/pkg/proxy/ipvs/proxier.go +++ b/pkg/proxy/ipvs/proxier.go @@ -1310,6 +1310,7 @@ func (proxier *Proxier) syncProxyRules() { err = proxier.iptables.RestoreAll(proxier.iptablesData.Bytes(), utiliptables.NoFlushTables, utiliptables.RestoreCounters) if err != nil { klog.Errorf("Failed to execute iptables-restore: %v\nRules:\n%s", err, proxier.iptablesData.Bytes()) + metrics.IptablesRestoreFailuresTotal.Inc() // Revert new local ports. utilproxy.RevertPorts(replacementPortsMap, proxier.portsMap) return diff --git a/pkg/proxy/metrics/metrics.go b/pkg/proxy/metrics/metrics.go index c12a11d03208..b73c740d79c3 100644 --- a/pkg/proxy/metrics/metrics.go +++ b/pkg/proxy/metrics/metrics.go @@ -116,6 +116,16 @@ var ( Help: "Cumulative proxy rules Service changes", }, ) + + // IptablesRestoreFailuresTotal is the number of iptables restore failures that the proxy has + // seen. + IptablesRestoreFailuresTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Subsystem: kubeProxySubsystem, + Name: "sync_proxy_rules_iptables_restore_failures_total", + Help: "Cumulative proxy iptables restore failures", + }, + ) ) var registerMetricsOnce sync.Once @@ -131,6 +141,7 @@ func RegisterMetrics() { prometheus.MustRegister(EndpointChangesTotal) prometheus.MustRegister(ServiceChangesPending) prometheus.MustRegister(ServiceChangesTotal) + prometheus.MustRegister(IptablesRestoreFailuresTotal) }) }