From cdcf89d87af46ddd863a5780f2270387d22333b9 Mon Sep 17 00:00:00 2001
From: hzma
Date: Tue, 16 Apr 2024 13:53:56 +0800
Subject: [PATCH] add monitor for sysctl para (#3913)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add monitor for sysctl para

Signed-off-by: 马洪贞

* Update pkg/daemon/exporter_metric.go

Co-authored-by: 张祖建
Signed-off-by: 马洪贞

---------

Signed-off-by: 马洪贞
Co-authored-by: 张祖建
---
 pkg/daemon/controller_linux.go |  27 +++++
 pkg/daemon/exporter_metric.go  | 193 +++++++++++++++++++++++++++++++++
 pkg/daemon/metrics.go          |  94 +++++++++++++++++
 3 files changed, 314 insertions(+)
 create mode 100644 pkg/daemon/exporter_metric.go

diff --git a/pkg/daemon/controller_linux.go b/pkg/daemon/controller_linux.go
index edbf667a7da..a42eadd1752 100644
--- a/pkg/daemon/controller_linux.go
+++ b/pkg/daemon/controller_linux.go
@@ -636,6 +636,33 @@ func (c *Controller) loopEncapIPCheck() {
 
 func (c *Controller) ovnMetricsUpdate() {
 	c.setOvnSubnetGatewayMetric()
+
+	resetSysParaMetrics()
+	c.setIPLocalPortRangeMetric()
+	c.setCheckSumErrMetric()
+	c.setCniConfigMetric()
+	c.setDNSSearchMetric()
+	c.setTCPTwRecycleMetric()
+	c.setTCPMtuProbingMetric()
+	c.setConntrackTCPLiberalMetric()
+	c.setBridgeNfCallIptablesMetric()
+	c.setIPv6RouteMaxsizeMetric()
+	c.setTCPMemMetric()
+}
+
+// resetSysParaMetrics resets every system-parameter gauge vector before the
+// setters above repopulate them with the values read in this update.
+func resetSysParaMetrics() {
+	metricIPLocalPortRange.Reset()
+	metricCheckSumErr.Reset()
+	metricCniConfig.Reset()
+	metricDNSSearch.Reset()
+	metricTCPTwRecycle.Reset()
+	metricTCPMtuProbing.Reset()
+	metricConntrackTCPLiberal.Reset()
+	metricBridgeNfCallIptables.Reset()
+	metricTCPMem.Reset()
+	metricIPv6RouteMaxsize.Reset()
 }
 
 func rotateLog() {
diff --git a/pkg/daemon/exporter_metric.go b/pkg/daemon/exporter_metric.go
new file mode 100644
index 00000000000..c6f7068b6d4
--- /dev/null
+++ b/pkg/daemon/exporter_metric.go
@@ -0,0 +1,193 @@
+package daemon
+
+import (
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"k8s.io/klog/v2"
+
+	"github.com/containernetworking/cni/libcni"
+	"github.com/docker/docker/libnetwork/resolvconf"
+)
+
+// readSysctlInt reads the file at path and parses its trimmed content as a
+// decimal integer; name is only used in log messages. A missing file is
+// skipped without logging when optional is true. The boolean result is false
+// whenever no usable value was obtained, so callers never export a made-up 0.
+func readSysctlInt(path, name string, optional bool) (int, bool) {
+	output, err := os.ReadFile(path)
+	if err != nil {
+		if !optional || !os.IsNotExist(err) {
+			klog.Errorf("failed to get value of %s, err %v", name, err)
+		}
+		return 0, false
+	}
+
+	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
+	if err != nil {
+		klog.Errorf("unexpected %s value %q: %v", name, string(output), err)
+		return 0, false
+	}
+	return val, true
+}
+
+// setIPLocalPortRangeMetric exports the two fields of
+// /proc/sys/net/ipv4/ip_local_port_range as the "start" and "end" labels.
+func (c *Controller) setIPLocalPortRangeMetric() {
+	output, err := os.ReadFile("/proc/sys/net/ipv4/ip_local_port_range")
+	if err != nil {
+		klog.Errorf("failed to get value of ip_local_port_range, err %v", err)
+		return
+	}
+
+	values := strings.Fields(string(output))
+	if len(values) != 2 {
+		klog.Errorf("unexpected ip_local_port_range value: %q", string(output))
+		return
+	}
+	metricIPLocalPortRange.WithLabelValues(c.config.NodeName, values[0], values[1]).Set(1)
+}
+
+// setCheckSumErrMetric exports the InCsumErrors counter printed by
+// `netstat -us`, or 0 when the output contains no such counter. If several
+// lines carry the counter, the last one is reported.
+func (c *Controller) setCheckSumErrMetric() {
+	// No shell feature is needed, so netstat is run directly rather than via `sh -c`.
+	output, err := exec.Command("netstat", "-us").CombinedOutput()
+	if err != nil {
+		klog.Errorf("failed to exec cmd 'netstat -us', err %v", err)
+		return
+	}
+
+	count := 0
+	for _, line := range strings.Split(string(output), "\n") {
+		if !strings.Contains(line, "InCsumErrors") {
+			continue
+		}
+
+		values := strings.Split(strings.TrimSpace(line), ":")
+		if len(values) != 2 {
+			continue
+		}
+
+		val, err := strconv.Atoi(strings.TrimSpace(values[1]))
+		if err != nil {
+			// Export nothing this cycle instead of a misleading 0.
+			klog.Errorf("unexpected InCsumErrors line %q: %v", line, err)
+			return
+		}
+		count = val
+	}
+	metricCheckSumErr.WithLabelValues(c.config.NodeName).Set(float64(count))
+}
+
+// setCniConfigMetric exports one series per CNI config file found in
+// CniConfDir other than the one named by CniConfName, or a placeholder series
+// when there is none.
+func (c *Controller) setCniConfigMetric() {
+	files, err := libcni.ConfFiles(c.config.CniConfDir, []string{".conf", ".conflist"})
+	if err != nil {
+		klog.Errorf("failed to list cni config files in %s: %v", c.config.CniConfDir, err)
+		return
+	}
+
+	found := false
+	for _, file := range files {
+		// libcni.ConfFiles returns paths that include the directory, so a bare
+		// file name in CniConfName has to be matched against the base name.
+		if file == c.config.CniConfName || filepath.Base(file) == c.config.CniConfName {
+			continue
+		}
+		found = true
+		metricCniConfig.WithLabelValues(c.config.NodeName, c.config.CniConfName, file).Set(1)
+	}
+	if !found {
+		metricCniConfig.WithLabelValues(c.config.NodeName, c.config.CniConfName, "no other cni config").Set(1)
+	}
+}
+
+// setDNSSearchMetric exports one series per resolv.conf search domain that
+// does not contain "local", or a placeholder series when there is none.
+func (c *Controller) setDNSSearchMetric() {
+	file, err := resolvconf.Get()
+	if err != nil {
+		klog.Errorf("failed to get /etc/resolv.conf content: %v", err)
+		return
+	}
+	domains := resolvconf.GetSearchDomains(file.Content)
+
+	found := false
+	for _, domain := range domains {
+		if strings.Contains(domain, "local") {
+			continue
+		}
+
+		found = true
+		metricDNSSearch.WithLabelValues(c.config.NodeName, domain).Set(1)
+	}
+	if !found {
+		metricDNSSearch.WithLabelValues(c.config.NodeName, "no additional search domain").Set(1)
+	}
+}
+
+// setTCPTwRecycleMetric exports /proc/sys/net/ipv4/tcp_tw_recycle when the file exists.
+func (c *Controller) setTCPTwRecycleMetric() {
+	if val, ok := readSysctlInt("/proc/sys/net/ipv4/tcp_tw_recycle", "tcp_tw_recycle", true); ok {
+		metricTCPTwRecycle.WithLabelValues(c.config.NodeName).Set(float64(val))
+	}
+}
+
+// setTCPMtuProbingMetric exports /proc/sys/net/ipv4/tcp_mtu_probing when the file exists.
+func (c *Controller) setTCPMtuProbingMetric() {
+	if val, ok := readSysctlInt("/proc/sys/net/ipv4/tcp_mtu_probing", "tcp_mtu_probing", true); ok {
+		metricTCPMtuProbing.WithLabelValues(c.config.NodeName).Set(float64(val))
+	}
+}
+
+// setConntrackTCPLiberalMetric exports
+// /proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal when the file exists.
+func (c *Controller) setConntrackTCPLiberalMetric() {
+	if val, ok := readSysctlInt("/proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal", "nf_conntrack_tcp_be_liberal", true); ok {
+		metricConntrackTCPLiberal.WithLabelValues(c.config.NodeName).Set(float64(val))
+	}
+}
+
+// setBridgeNfCallIptablesMetric exports
+// /proc/sys/net/bridge/bridge-nf-call-iptables when the file exists.
+func (c *Controller) setBridgeNfCallIptablesMetric() {
+	if val, ok := readSysctlInt("/proc/sys/net/bridge/bridge-nf-call-iptables", "bridge-nf-call-iptables", true); ok {
+		metricBridgeNfCallIptables.WithLabelValues(c.config.NodeName).Set(float64(val))
+	}
+}
+
+// setIPv6RouteMaxsizeMetric exports /proc/sys/net/ipv6/route/max_size. A
+// missing file is skipped silently, like the other optional parameters,
+// instead of being logged as an error.
+func (c *Controller) setIPv6RouteMaxsizeMetric() {
+	if val, ok := readSysctlInt("/proc/sys/net/ipv6/route/max_size", "ipv6 route max_size", true); ok {
+		metricIPv6RouteMaxsize.WithLabelValues(c.config.NodeName).Set(float64(val))
+	}
+}
+
+// setTCPMemMetric exports the three fields of /proc/sys/net/ipv4/tcp_mem as
+// the "minimum", "pressure" and "maximum" labels.
+func (c *Controller) setTCPMemMetric() {
+	output, err := os.ReadFile("/proc/sys/net/ipv4/tcp_mem")
+	if err != nil {
+		if os.IsNotExist(err) {
+			return
+		}
+		klog.Errorf("failed to get value of ipv4 tcp_mem, err %v", err)
+		return
+	}
+
+	values := strings.Fields(string(output))
+	if len(values) != 3 {
+		klog.Errorf("unexpected tcp_mem value: %q", string(output))
+		return
+	}
+	metricTCPMem.WithLabelValues(c.config.NodeName, values[0], values[1], values[2]).Set(1)
+}
diff --git a/pkg/daemon/metrics.go b/pkg/daemon/metrics.go
index 87ee608b69f..b6b4162b79b 100644
--- a/pkg/daemon/metrics.go
+++ b/pkg/daemon/metrics.go
@@ -91,6 +91,85 @@ var (
 			"protocol",
 		},
 	)
+
+	metricIPLocalPortRange = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "ip_local_port_range",
+		Help: "value of system parameter /proc/sys/net/ipv4/ip_local_port_range, which should not conflict with the nodeport range",
+	}, []string{
+		"hostname",
+		"start",
+		"end",
+	})
+
+	metricCheckSumErr = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "checksum_err_count",
+			Help: "Value of InCsumErrors for cmd `netstat -us`, checksum is error when value is greater than 0",
+		},
+		[]string{"hostname"})
+
+	metricCniConfig = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "cni_config_file",
+		Help: "cni config file in /etc/cni/net.d/",
+	}, []string{
+		"hostname",
+		"ovn",
+		"other",
+	})
+
+	metricDNSSearch = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "dns_search_domain",
+		Help: "search domain in /etc/resolv.conf",
+	}, []string{
+		"hostname",
+		"additional",
+	})
+
+	metricTCPTwRecycle = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "tcp_tw_recycle",
+		Help: "value of system parameter /proc/sys/net/ipv4/tcp_tw_recycle, the recommended value is 0",
+	}, []string{
+		"hostname",
+	})
+
+	metricTCPMtuProbing = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "tcp_mtu_probing",
+		Help: "value of system parameter /proc/sys/net/ipv4/tcp_mtu_probing, the recommended value is 1",
+	}, []string{
+		"hostname",
+	})
+
+	metricConntrackTCPLiberal = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "nf_conntrack_tcp_be_liberal",
+		Help: "value of system parameter /proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal, the recommended value is 1",
+	}, []string{
+		"hostname",
+	})
+
+	metricBridgeNfCallIptables = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "bridge_nf_call_iptables",
+		Help: "value of system parameter /proc/sys/net/bridge/bridge-nf-call-iptables, the recommended value is 1 for overlay, and 0 for underlay network",
+	}, []string{
+		"hostname",
+	})
+
+	metricTCPMem = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "tcp_mem",
+		Help: "value of system parameter /proc/sys/net/ipv4/tcp_mem, recommend a large number value",
+	}, []string{
+		"hostname",
+		"minimum",
+		"pressure",
+		"maximum",
+	})
+
+	metricIPv6RouteMaxsize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Name: "max_size",
+		Help: "value of system parameter /proc/sys/net/ipv6/route/max_size, recommend a large number value, at least 16384",
+	}, []string{
+		"hostname",
+	})
+
 	// reflector metrics
 
 	// TODO(directxman12): update these to be histograms once the metrics overhaul KEP
@@ -151,6 +230,7 @@ func InitMetrics() {
 	registerReflectorMetrics()
 	registerClientMetrics()
 	registerOvnSubnetGatewayMetrics()
+	registerSystemParameterMetrics()
 	prometheus.MustRegister(cniOperationHistogram)
 	prometheus.MustRegister(cniWaitAddressResult)
 	prometheus.MustRegister(cniConnectivityResult)
@@ -161,6 +241,20 @@ func registerOvnSubnetGatewayMetrics() {
 	prometheus.MustRegister(metricOvnSubnetGatewayPackets)
 }
 
+// registerSystemParameterMetrics registers the system-parameter gauges via prometheus.MustRegister.
+func registerSystemParameterMetrics() {
+	prometheus.MustRegister(metricIPLocalPortRange)
+	prometheus.MustRegister(metricCheckSumErr)
+	prometheus.MustRegister(metricCniConfig)
+	prometheus.MustRegister(metricDNSSearch)
+	prometheus.MustRegister(metricTCPTwRecycle)
+	prometheus.MustRegister(metricTCPMtuProbing)
+	prometheus.MustRegister(metricConntrackTCPLiberal)
+	prometheus.MustRegister(metricBridgeNfCallIptables)
+	prometheus.MustRegister(metricTCPMem)
+	prometheus.MustRegister(metricIPv6RouteMaxsize)
+}
+
 // registerClientMetrics sets up the client latency metrics from client-go
 func registerClientMetrics() {
 	// register the metrics with our registry