Skip to content

Commit

Permalink
add monitor for sysctl para (#3913)
Browse files Browse the repository at this point in the history
* add monitor for sysctl para

Signed-off-by: 马洪贞 <hzma@alauda.io>

* Update pkg/daemon/exporter_metric.go

Co-authored-by: 张祖建 <zhangzujian.7@gmail.com>
Signed-off-by: 马洪贞 <hzma@alauda.io>

---------

Signed-off-by: 马洪贞 <hzma@alauda.io>
Co-authored-by: 张祖建 <zhangzujian.7@gmail.com>
  • Loading branch information
hongzhen-ma and zhangzujian committed Apr 16, 2024
1 parent 36025b5 commit c91d5b6
Show file tree
Hide file tree
Showing 3 changed files with 304 additions and 0 deletions.
25 changes: 25 additions & 0 deletions pkg/daemon/controller_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,31 @@ func (c *Controller) loopEncapIPCheck() {

// ovnMetricsUpdate refreshes the metrics maintained by the daemon: the subnet
// gateway metric first, followed by all system-parameter metrics.
func (c *Controller) ovnMetricsUpdate() {
	c.setOvnSubnetGatewayMetric()

	// Clear every system-parameter gauge before the setters below repopulate
	// them for the current refresh.
	resetSysParaMetrics()

	// The setters run in the listed order.
	setters := []func(){
		c.setIPLocalPortRangeMetric,
		c.setCheckSumErrMetric,
		c.setCniConfigMetric,
		c.setDNSSearchMetric,
		c.setTCPTwRecycleMetric,
		c.setTCPMtuProbingMetric,
		c.setConntrackTCPLiberalMetric,
		c.setBridgeNfCallIptablesMetric,
		c.setIPv6RouteMaxsizeMetric,
		c.setTCPMemMetric,
	}
	for _, set := range setters {
		set()
	}
}

// resetSysParaMetrics calls Reset on each system-parameter gauge vector so
// that the next round of setters starts from an empty set of series.
func resetSysParaMetrics() {
	vectors := []interface{ Reset() }{
		metricIPLocalPortRange,
		metricCheckSumErr,
		metricCniConfig,
		metricDNSSearch,
		metricTCPTwRecycle,
		metricTCPMtuProbing,
		metricConntrackTCPLiberal,
		metricBridgeNfCallIptables,
		metricTCPMem,
		metricIPv6RouteMaxsize,
	}
	for _, vec := range vectors {
		vec.Reset()
	}
}

func (c *Controller) operateMod() {
Expand Down
186 changes: 186 additions & 0 deletions pkg/daemon/exporter_metric.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package daemon

import (
"os"
"os/exec"
"strconv"
"strings"

"k8s.io/klog/v2"

"github.com/containernetworking/cni/libcni"
"github.com/docker/docker/libnetwork/resolvconf"
)

// setIPLocalPortRangeMetric publishes the two bounds found in
// /proc/sys/net/ipv4/ip_local_port_range as the "start" and "end" labels of
// metricIPLocalPortRange; the gauge value itself is always 1.
func (c *Controller) setIPLocalPortRangeMetric() {
	raw, err := os.ReadFile("/proc/sys/net/ipv4/ip_local_port_range")
	if err != nil {
		klog.Errorf("failed to get value of ip_local_port_range, err %v", err)
		return
	}

	content := string(raw)
	bounds := strings.Fields(content)
	if len(bounds) == 2 {
		metricIPLocalPortRange.WithLabelValues(c.config.NodeName, bounds[0], bounds[1]).Set(1)
		return
	}
	// Anything other than exactly two fields is not a valid range.
	klog.Errorf("unexpected ip_local_port_range value: %q", content)
}

// setCheckSumErrMetric runs `netstat -us` and exports the InCsumErrors counter
// found in its output through metricCheckSumErr.
//
// Every line containing "InCsumErrors" with exactly one ':' is considered; if
// several such lines exist the last parsable one wins. When no parsable line is
// found the metric is set to 0. A counter that cannot be parsed as an integer
// is logged rather than being silently treated as 0.
func (c *Controller) setCheckSumErrMetric() {
	// The command line is constant, so run netstat directly instead of
	// spawning an intermediate shell.
	output, err := exec.Command("netstat", "-us").CombinedOutput()
	if err != nil {
		klog.Errorf("failed to exec cmd 'netstat -us', err %v", err)
		return
	}

	found := false
	for _, line := range strings.Split(string(output), "\n") {
		line = strings.TrimSpace(line)
		if !strings.Contains(line, "InCsumErrors") {
			continue
		}

		values := strings.Split(line, ":")
		if len(values) != 2 {
			continue
		}

		val, err := strconv.Atoi(strings.TrimSpace(values[1]))
		if err != nil {
			// Do not mark the counter as found; the fallback below still
			// reports 0, but the malformed line is now visible in the log.
			klog.Errorf("unexpected InCsumErrors line %q in output of 'netstat -us': %v", line, err)
			continue
		}
		metricCheckSumErr.WithLabelValues(c.config.NodeName).Set(float64(val))
		found = true
	}
	if !found {
		metricCheckSumErr.WithLabelValues(c.config.NodeName).Set(float64(0))
	}
}

// setCniConfigMetric lists the .conf/.conflist files in c.config.CniConfDir and
// exports one metricCniConfig series per file that is not the kube-ovn config
// (c.config.CniConfName). If there is no such file, a single series with the
// placeholder label "no other cni config" is exported instead.
func (c *Controller) setCniConfigMetric() {
	files, err := libcni.ConfFiles(c.config.CniConfDir, []string{".conf", ".conflist"})
	if err != nil {
		klog.Errorf("failed to list cni config files in %s: %v", c.config.CniConfDir, err)
		return
	}

	found := false
	for _, file := range files {
		// libcni.ConfFiles returns each entry joined with the directory, so a
		// CniConfName that is a bare file name would never equal `file`.
		// Accept a match on either the full path or the final path element.
		// NOTE(review): assumes CniConfName may be a bare file name — confirm
		// against the daemon configuration.
		base := file[strings.LastIndexByte(file, os.PathSeparator)+1:]
		if file == c.config.CniConfName || base == c.config.CniConfName {
			continue
		}
		found = true
		metricCniConfig.WithLabelValues(c.config.NodeName, c.config.CniConfName, file).Set(1)
	}
	if !found {
		metricCniConfig.WithLabelValues(c.config.NodeName, c.config.CniConfName, "no other cni config").Set(1)
	}
}

// setDNSSearchMetric exports one metricDNSSearch series for every search
// domain returned by resolvconf that does not contain the substring "local".
// When every domain is filtered out (or none exist) a single series labelled
// "no additional search domain" is exported.
func (c *Controller) setDNSSearchMetric() {
	resolv, err := resolvconf.Get()
	if err != nil {
		klog.Errorf("failed to get /etc/resolv.conf content: %v", err)
		return
	}

	reported := 0
	for _, domain := range resolvconf.GetSearchDomains(resolv.Content) {
		if !strings.Contains(domain, "local") {
			metricDNSSearch.WithLabelValues(c.config.NodeName, domain).Set(1)
			reported++
		}
	}
	if reported == 0 {
		metricDNSSearch.WithLabelValues(c.config.NodeName, "no additional search domain").Set(1)
	}
}

// setTCPTwRecycleMetric exports the integer stored in
// /proc/sys/net/ipv4/tcp_tw_recycle through metricTCPTwRecycle.
//
// A missing file is skipped silently. Any other read failure, or content that
// is not an integer, is logged and no value is exported, so a malformed value
// is never reported as 0.
func (c *Controller) setTCPTwRecycleMetric() {
	output, err := os.ReadFile("/proc/sys/net/ipv4/tcp_tw_recycle")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of tcp_tw_recycle, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("unexpected tcp_tw_recycle value: %q", string(output))
		return
	}
	metricTCPTwRecycle.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setTCPMtuProbingMetric exports the integer stored in
// /proc/sys/net/ipv4/tcp_mtu_probing through metricTCPMtuProbing.
//
// A missing file is skipped silently. Any other read failure, or content that
// is not an integer, is logged and no value is exported, so a malformed value
// is never reported as 0.
func (c *Controller) setTCPMtuProbingMetric() {
	output, err := os.ReadFile("/proc/sys/net/ipv4/tcp_mtu_probing")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of tcp_mtu_probing, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("unexpected tcp_mtu_probing value: %q", string(output))
		return
	}
	metricTCPMtuProbing.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setConntrackTCPLiberalMetric exports the integer stored in
// /proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal through
// metricConntrackTCPLiberal.
//
// A missing file is skipped silently. Any other read failure, or content that
// is not an integer, is logged and no value is exported, so a malformed value
// is never reported as 0.
func (c *Controller) setConntrackTCPLiberalMetric() {
	output, err := os.ReadFile("/proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of nf_conntrack_tcp_be_liberal, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("unexpected nf_conntrack_tcp_be_liberal value: %q", string(output))
		return
	}
	metricConntrackTCPLiberal.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setBridgeNfCallIptablesMetric exports the integer stored in
// /proc/sys/net/bridge/bridge-nf-call-iptables through
// metricBridgeNfCallIptables.
//
// A missing file is skipped silently. Any other read failure, or content that
// is not an integer, is logged and no value is exported, so a malformed value
// is never reported as 0.
func (c *Controller) setBridgeNfCallIptablesMetric() {
	output, err := os.ReadFile("/proc/sys/net/bridge/bridge-nf-call-iptables")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of bridge-nf-call-iptables, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("unexpected bridge-nf-call-iptables value: %q", string(output))
		return
	}
	metricBridgeNfCallIptables.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setIPv6RouteMaxsizeMetric exports the integer stored in
// /proc/sys/net/ipv6/route/max_size through metricIPv6RouteMaxsize.
//
// Like the other sysctl setters in this file, a missing file is skipped
// silently instead of being logged as an error on every refresh. Any other
// read failure, or content that is not an integer, is logged and no value is
// exported, so a malformed value is never reported as 0.
func (c *Controller) setIPv6RouteMaxsizeMetric() {
	output, err := os.ReadFile("/proc/sys/net/ipv6/route/max_size")
	if err != nil {
		if !os.IsNotExist(err) {
			klog.Errorf("failed to get value of ipv6 route max_size, err %v", err)
		}
		return
	}

	val, err := strconv.Atoi(strings.TrimSpace(string(output)))
	if err != nil {
		klog.Errorf("unexpected ipv6 route max_size value: %q", string(output))
		return
	}
	metricIPv6RouteMaxsize.WithLabelValues(c.config.NodeName).Set(float64(val))
}

// setTCPMemMetric publishes the three fields of /proc/sys/net/ipv4/tcp_mem as
// the "minimum", "pressure" and "maximum" labels of metricTCPMem; the gauge
// value itself is always 1. A missing file is skipped without logging.
func (c *Controller) setTCPMemMetric() {
	raw, err := os.ReadFile("/proc/sys/net/ipv4/tcp_mem")
	switch {
	case err == nil:
		// fall through to parsing below
	case os.IsNotExist(err):
		return
	default:
		klog.Errorf("failed to get value of ipv4 tcp_mem, err %v", err)
		return
	}

	content := string(raw)
	fields := strings.Fields(content)
	if len(fields) == 3 {
		metricTCPMem.WithLabelValues(c.config.NodeName, fields[0], fields[1], fields[2]).Set(1)
		return
	}
	// Anything other than exactly three fields is not a valid tcp_mem triple.
	klog.Errorf("unexpected tcp_mem value: %q", content)
}
93 changes: 93 additions & 0 deletions pkg/daemon/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,85 @@ var (
"protocol",
},
)

// metricIPLocalPortRange carries the ip_local_port_range bounds in its
// "start" and "end" labels.
metricIPLocalPortRange = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "ip_local_port_range",
		Help: "value of system parameter /proc/sys/net/ipv4/ip_local_port_range, which should not conflict with the nodeport range",
	},
	[]string{"hostname", "start", "end"},
)

// metricCheckSumErr holds the InCsumErrors counter per host.
metricCheckSumErr = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Name: "checksum_err_count",
	Help: "Value of InCsumErrors for cmd `netstat -us`, checksum is error when value is greater than 0",
}, []string{
	"hostname",
})

// metricCniConfig has one series per CNI config file; "ovn" and "other" are
// label names for the kube-ovn config and the additional file respectively.
metricCniConfig = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "cni_config_file",
		Help: "cni config file in /etc/cni/net.d/",
	},
	[]string{"hostname", "ovn", "other"},
)

// metricDNSSearch has one series per search domain, carried in the
// "additional" label.
metricDNSSearch = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "dns_search_domain",
		Help: "search domain in /etc/resolv.conf",
	},
	[]string{"hostname", "additional"},
)

// metricTCPTwRecycle holds the tcp_tw_recycle sysctl value per host.
metricTCPTwRecycle = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "tcp_tw_recycle",
		Help: "value of system parameter /proc/sys/net/ipv4/tcp_tw_recycle, the recommended value is 0",
	},
	[]string{"hostname"},
)

// metricTCPMtuProbing holds the tcp_mtu_probing sysctl value per host.
metricTCPMtuProbing = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "tcp_mtu_probing",
		Help: "value of system parameter /proc/sys/net/ipv4/tcp_mtu_probing, the recommended value is 1",
	},
	[]string{"hostname"},
)

// metricConntrackTCPLiberal holds the nf_conntrack_tcp_be_liberal sysctl
// value per host.
metricConntrackTCPLiberal = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "nf_conntrack_tcp_be_liberal",
		Help: "value of system parameter /proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal, the recommended value is 1",
	},
	[]string{"hostname"},
)

// metricBridgeNfCallIptables holds the bridge-nf-call-iptables sysctl value
// per host.
metricBridgeNfCallIptables = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "bridge_nf_call_iptables",
		Help: "value of system parameter /proc/sys/net/bridge/bridge-nf-call-iptables, the recommended value is 1 for overlay, and 0 for underlay network",
	},
	[]string{"hostname"},
)

// metricTCPMem carries the three tcp_mem fields in its "minimum", "pressure"
// and "maximum" labels.
metricTCPMem = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "tcp_mem",
		Help: "value of system parameter /proc/sys/net/ipv4/tcp_mem, recommend a large number value",
	},
	[]string{"hostname", "minimum", "pressure", "maximum"},
)

// metricIPv6RouteMaxsize holds the IPv6 route max_size sysctl value per host.
// NOTE(review): the exported name "max_size" does not mention IPv6 or routes;
// renaming would change the series name seen by consumers, so it is kept as is.
metricIPv6RouteMaxsize = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "max_size",
		Help: "value of system parameter /proc/sys/net/ipv6/route/max_size, recommend a large number value, at least 16384",
	},
	[]string{"hostname"},
)

// reflector metrics

// TODO(directxman12): update these to be histograms once the metrics overhaul KEP
Expand Down Expand Up @@ -151,6 +230,7 @@ func InitMetrics() {
registerReflectorMetrics()
registerClientMetrics()
registerOvnSubnetGatewayMetrics()
registerSystemParameterMetrics()
prometheus.MustRegister(cniOperationHistogram)
prometheus.MustRegister(cniWaitAddressResult)
prometheus.MustRegister(cniConnectivityResult)
Expand All @@ -161,6 +241,19 @@ func registerOvnSubnetGatewayMetrics() {
prometheus.MustRegister(metricOvnSubnetGatewayPackets)
}

// registerSystemParameterMetrics registers every system-parameter gauge vector
// with the default Prometheus registry. MustRegister panics if any
// registration fails.
func registerSystemParameterMetrics() {
	prometheus.MustRegister(
		metricIPLocalPortRange,
		metricCheckSumErr,
		metricCniConfig,
		metricDNSSearch,
		metricTCPTwRecycle,
		metricTCPMtuProbing,
		metricConntrackTCPLiberal,
		metricBridgeNfCallIptables,
		metricTCPMem,
		metricIPv6RouteMaxsize,
	)
}

// registerClientMetrics sets up the client latency metrics from client-go
func registerClientMetrics() {
// register the metrics with our registry
Expand Down

0 comments on commit c91d5b6

Please sign in to comment.