Skip to content

Commit

Permalink
feat: pinger prometheus support
Browse files Browse the repository at this point in the history
  • Loading branch information
oilbeater committed Sep 29, 2019
1 parent 675f025 commit 7c0517b
Show file tree
Hide file tree
Showing 6 changed files with 287 additions and 21 deletions.
9 changes: 9 additions & 0 deletions pkg/pinger/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog"
"os"
)

type Configuration struct {
Expand All @@ -18,6 +19,10 @@ type Configuration struct {
Interval int
Mode string
DNS string
NodeName string
HostIP string
PodName string
PodIP string
}

func ParseFlags() (*Configuration, error) {
Expand Down Expand Up @@ -56,6 +61,10 @@ func ParseFlags() (*Configuration, error) {
Interval: *argInterval,
Mode: *argMode,
DNS: *argDns,
PodIP: os.Getenv("POD_IP"),
HostIP: os.Getenv("HOST_IP"),
NodeName: os.Getenv("NODE_NAME"),
PodName: os.Getenv("POD_NAME"),
}
if err := config.initKubeClient(); err != nil {
return nil, err
Expand Down
194 changes: 194 additions & 0 deletions pkg/pinger/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
package pinger

import "github.com/prometheus/client_golang/prometheus"

// Prometheus collectors exported by the pinger.
//
// The up/down and healthy/unhealthy gauges come in complementary pairs: the
// Set*Metrics helpers in this file always write 1 to one member of a pair and
// 0 to the other. Latency histograms use bucket upper bounds expressed in
// milliseconds, matching the "_ms" suffix of their metric names.
var (
	// ovsUpGauge is set to 1 per node by SetOvsUpMetrics and to 0 by SetOvsDownMetrics.
	ovsUpGauge = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "pinger_ovs_up",
			Help: "If the ovs on the node is up",
		},
		[]string{
			"nodeName",
		})
	// ovsDownGauge is the complement of ovsUpGauge.
	ovsDownGauge = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "pinger_ovs_down",
			Help: "If the ovs on the node is down",
		},
		[]string{
			"nodeName",
		})
	// ovnControllerUpGauge is set to 1 per node by SetOvnControllerUpMetrics
	// and to 0 by SetOvnControllerDownMetrics.
	ovnControllerUpGauge = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "pinger_ovn_controller_up",
			Help: "If the ovn_controller on the node is up",
		},
		[]string{
			"nodeName",
		})
	// ovnControllerDownGauge is the complement of ovnControllerUpGauge.
	ovnControllerDownGauge = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "pinger_ovn_controller_down",
			Help: "If the ovn_controller on the node is down",
		},
		[]string{
			"nodeName",
		})
	// dnsHealthyGauge is set to 1 per node by SetDnsHealthyMetrics and to 0
	// by SetDnsUnhealthyMetrics.
	dnsHealthyGauge = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "pinger_dns_healthy",
			Help: "if the dns request is healthy on this node",
		},
		[]string{
			"nodeName",
		})
	// dnsUnhealthyGauge is the complement of dnsHealthyGauge.
	dnsUnhealthyGauge = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "pinger_dns_unhealthy",
			Help: "if the dns request is unhealthy on this node",
		},
		[]string{
			"nodeName",
		})
	// dnsRequestLatencyHistogram receives one observation per successful
	// DNS lookup reported through SetDnsHealthyMetrics.
	dnsRequestLatencyHistogram = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "pinger_dns_latency_ms",
			Help:    "the latency ms histogram the node request dns",
			Buckets: []float64{.5, 1, 2, 5, 10, 30},
		},
		[]string{
			"nodeName",
		})
	// podPingLatencyHistogram receives the latency passed to SetPodPingMetrics,
	// labelled by the source and target of the ping.
	podPingLatencyHistogram = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "pinger_pod_ping_latency_ms",
			Help:    "the latency ms histogram for pod peer ping",
			Buckets: []float64{.25, .5, 1, 2, 5, 10, 30},
		},
		[]string{
			"src_node_name",
			"src_node_ip",
			"src_pod_ip",
			"target_node_name",
			"target_node_ip",
			"target_pod_ip",
		})
	// podPingLostCounter accumulates the lost-packet count passed to
	// SetPodPingMetrics; it shares its label set with podPingLatencyHistogram.
	podPingLostCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "pinger_pod_ping_lost_total",
			Help: "the lost count for pod peer ping",
		}, []string{
			"src_node_name",
			"src_node_ip",
			"src_pod_ip",
			"target_node_name",
			"target_node_ip",
			"target_pod_ip",
		})
	// nodePingLatencyHistogram receives the latency passed to
	// SetNodePingMetrics. Unlike the pod variant it has no target_pod_ip label.
	nodePingLatencyHistogram = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "pinger_node_ping_latency_ms",
			Help:    "the latency ms histogram for pod ping node",
			Buckets: []float64{.25, .5, 1, 2, 5, 10, 30},
		},
		[]string{
			"src_node_name",
			"src_node_ip",
			"src_pod_ip",
			"target_node_name",
			"target_node_ip",
		})
	// nodePingLostCounter accumulates the lost-packet count passed to
	// SetNodePingMetrics; it shares its label set with nodePingLatencyHistogram.
	nodePingLostCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "pinger_node_ping_lost_total",
			Help: "the lost count for pod ping node",
		}, []string{
			"src_node_name",
			"src_node_ip",
			"src_pod_ip",
			"target_node_name",
			"target_node_ip",
		})
)

// init registers every pinger collector through prometheus.MustRegister,
// in declaration order. MustRegister panics if any registration fails, so a
// duplicate or invalid metric definition aborts the program at start-up.
func init() {
	prometheus.MustRegister(
		ovsUpGauge,
		ovsDownGauge,
		ovnControllerUpGauge,
		ovnControllerDownGauge,
		dnsHealthyGauge,
		dnsUnhealthyGauge,
		dnsRequestLatencyHistogram,
		podPingLatencyHistogram,
		podPingLostCounter,
		nodePingLatencyHistogram,
		nodePingLostCounter,
	)
}

// SetOvsUpMetrics records that OVS is up on nodeName: the pinger_ovs_up
// series for that node becomes 1 and the pinger_ovs_down series becomes 0.
func SetOvsUpMetrics(nodeName string) {
	up := ovsUpGauge.WithLabelValues(nodeName)
	down := ovsDownGauge.WithLabelValues(nodeName)
	up.Set(1)
	down.Set(0)
}

// SetOvsDownMetrics records that OVS is down on nodeName: the pinger_ovs_up
// series for that node becomes 0 and the pinger_ovs_down series becomes 1.
func SetOvsDownMetrics(nodeName string) {
	up := ovsUpGauge.WithLabelValues(nodeName)
	down := ovsDownGauge.WithLabelValues(nodeName)
	up.Set(0)
	down.Set(1)
}

// SetOvnControllerUpMetrics records that ovn_controller is up on nodeName:
// pinger_ovn_controller_up becomes 1 and pinger_ovn_controller_down becomes 0
// for that node.
func SetOvnControllerUpMetrics(nodeName string) {
	up := ovnControllerUpGauge.WithLabelValues(nodeName)
	down := ovnControllerDownGauge.WithLabelValues(nodeName)
	up.Set(1)
	down.Set(0)
}

// SetOvnControllerDownMetrics records that ovn_controller is down on
// nodeName: pinger_ovn_controller_up becomes 0 and
// pinger_ovn_controller_down becomes 1 for that node.
func SetOvnControllerDownMetrics(nodeName string) {
	up := ovnControllerUpGauge.WithLabelValues(nodeName)
	down := ovnControllerDownGauge.WithLabelValues(nodeName)
	up.Set(0)
	down.Set(1)
}

// SetDnsHealthyMetrics records a successful DNS check for nodeName. It sets
// pinger_dns_healthy to 1, adds latency as one observation to the
// pinger_dns_latency_ms histogram, and sets pinger_dns_unhealthy to 0.
func SetDnsHealthyMetrics(nodeName string, latency float64) {
	healthy := dnsHealthyGauge.WithLabelValues(nodeName)
	healthy.Set(1)

	observer := dnsRequestLatencyHistogram.WithLabelValues(nodeName)
	observer.Observe(latency)

	unhealthy := dnsUnhealthyGauge.WithLabelValues(nodeName)
	unhealthy.Set(0)
}

// SetDnsUnhealthyMetrics records a failed DNS check for nodeName:
// pinger_dns_healthy becomes 0 and pinger_dns_unhealthy becomes 1. No latency
// observation is made for a failed check.
func SetDnsUnhealthyMetrics(nodeName string) {
	healthy := dnsHealthyGauge.WithLabelValues(nodeName)
	unhealthy := dnsUnhealthyGauge.WithLabelValues(nodeName)
	healthy.Set(0)
	unhealthy.Set(1)
}

// SetPodPingMetrics records the outcome of one pod-to-pod ping run.
//
// The six string arguments become, in order, the src_node_name, src_node_ip,
// src_pod_ip, target_node_name, target_node_ip and target_pod_ip label values
// of both pinger_pod_ping_latency_ms and pinger_pod_ping_lost_total.
// latency is added as a single histogram observation and lost is added to the
// counter.
//
// A negative lost is treated as 0: a Prometheus counter panics when asked to
// add a negative value, and callers derive lost as "packets sent minus
// packets received", which is not guaranteed to be non-negative (presumably
// when duplicate replies are counted — confirm against the ping library).
func SetPodPingMetrics(srcNodeName, srcNodeIP, srcPodIP, targetNodeName, targetNodeIP, targetPodIP string, latency float64, lost int) {
	// Both collectors share the same label set, so build the values once.
	labels := []string{
		srcNodeName,
		srcNodeIP,
		srcPodIP,
		targetNodeName,
		targetNodeIP,
		targetPodIP,
	}
	podPingLatencyHistogram.WithLabelValues(labels...).Observe(latency)

	if lost < 0 {
		lost = 0
	}
	// Add is still called with 0 so the counter series exists for every peer.
	podPingLostCounter.WithLabelValues(labels...).Add(float64(lost))
}

// SetNodePingMetrics records the outcome of one pod-to-node ping run.
//
// The five string arguments become, in order, the src_node_name, src_node_ip,
// src_pod_ip, target_node_name and target_node_ip label values of both
// pinger_node_ping_latency_ms and pinger_node_ping_lost_total. latency is
// added as a single histogram observation and lost is added to the counter.
//
// A negative lost is treated as 0: a Prometheus counter panics when asked to
// add a negative value, and callers derive lost as "packets sent minus
// packets received", which is not guaranteed to be non-negative (presumably
// when duplicate replies are counted — confirm against the ping library).
func SetNodePingMetrics(srcNodeName, srcNodeIP, srcPodIP, targetNodeName, targetNodeIP string, latency float64, lost int) {
	// Both collectors share the same label set, so build the values once.
	labels := []string{
		srcNodeName,
		srcNodeIP,
		srcPodIP,
		targetNodeName,
		targetNodeIP,
	}
	nodePingLatencyHistogram.WithLabelValues(labels...).Observe(latency)

	if lost < 0 {
		lost = 0
	}
	// Add is still called with 0 so the counter series exists for every node.
	nodePingLostCounter.WithLabelValues(labels...).Add(float64(lost))
}
52 changes: 31 additions & 21 deletions pkg/pinger/ping.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/kubernetes"
"k8s.io/klog"
"net"
"os/exec"
Expand All @@ -15,8 +14,8 @@ import (

func StartPinger(config *Configuration) {
for {
checkOvs()
checkOvnController()
checkOvs(config)
checkOvnController(config)
ping(config)
if config.Mode != "server" {
break
Expand All @@ -26,14 +25,14 @@ func StartPinger(config *Configuration) {
}

// ping runs one round of connectivity checks: node pings, peer pod pings,
// then a DNS lookup. Each check receives the whole configuration so it can
// read both its own settings and the identity fields used as metric labels.
func ping(config *Configuration) {
	pingNodes(config)
	pingPods(config)
	nslookup(config)
}

func pingNodes(client kubernetes.Interface) {
func pingNodes(config *Configuration) {
klog.Infof("start to check node connectivity")
nodes, err := client.CoreV1().Nodes().List(metav1.ListOptions{})
nodes, err := config.KubeClient.CoreV1().Nodes().List(metav1.ListOptions{})
if err != nil {
klog.Errorf("failed to list nodes, %v", err)
return
Expand All @@ -56,21 +55,22 @@ func pingNodes(client kubernetes.Interface) {
stats := pinger.Statistics()
klog.Infof("ping node: %s %s, count: %d, loss rate %.2f%%, average rtt %.2fms",
nodeName, nodeIP, pinger.Count, stats.PacketLoss*100, float64(stats.AvgRtt)/float64(time.Millisecond))
SetNodePingMetrics(config.NodeName, config.HostIP, config.PodName, no.Name, addr.Address, float64(stats.AvgRtt)/float64(time.Millisecond), stats.PacketsSent-stats.PacketsRecv)
}(addr.Address, no.Name)
}
}
}
wg.Wait()
}

func pingPods(client kubernetes.Interface, dsNamespace, dsName string) {
func pingPods(config *Configuration) {
klog.Infof("start to check pod connectivity")
ds, err := client.AppsV1().DaemonSets(dsNamespace).Get(dsName, metav1.GetOptions{})
ds, err := config.KubeClient.AppsV1().DaemonSets(config.DaemonSetNamespace).Get(config.DaemonSetName, metav1.GetOptions{})
if err != nil {
klog.Errorf("failed to get peer ds: %v", err)
return
}
pods, err := client.CoreV1().Pods(dsNamespace).List(metav1.ListOptions{LabelSelector: labels.Set(ds.Spec.Selector.MatchLabels).String()})
pods, err := config.KubeClient.CoreV1().Pods(config.DaemonSetNamespace).List(metav1.ListOptions{LabelSelector: labels.Set(ds.Spec.Selector.MatchLabels).String()})
if err != nil {
klog.Errorf("failed to list peer pods: %v", err)
return
Expand All @@ -80,7 +80,7 @@ func pingPods(client kubernetes.Interface, dsNamespace, dsName string) {
for _, pod := range pods.Items {
if pod.Status.PodIP != "" {
wg.Add(1)
go func(podIp, podName, podNamespace string) {
go func(podIp, podName, nodeIP, nodeName string) {
defer wg.Done()
pinger, err := goping.NewPinger(podIp)
if err != nil {
Expand All @@ -91,38 +91,48 @@ func pingPods(client kubernetes.Interface, dsNamespace, dsName string) {
pinger.Count = 5
pinger.Run()
stats := pinger.Statistics()
klog.Infof("ping pod: %s/%s %s, count: %d, loss rate %.2f, average rtt %.2fms",
podNamespace, podName, podIp, pinger.Count, stats.PacketLoss*100, float64(stats.AvgRtt)/float64(time.Millisecond))
}(pod.Status.PodIP, pod.Name, pod.Namespace)
klog.Infof("ping pod: %s %s, count: %d, loss rate %.2f, average rtt %.2fms",
podName, podIp, pinger.Count, stats.PacketLoss*100, float64(stats.AvgRtt)/float64(time.Millisecond))
SetPodPingMetrics(config.NodeName, config.HostIP, config.PodName, nodeName, nodeIP, podIp, float64(stats.AvgRtt)/float64(time.Millisecond), stats.PacketsSent-stats.PacketsRecv)
}(pod.Status.PodIP, pod.Name, pod.Spec.NodeName, pod.Status.HostIP)
}
}
wg.Wait()
}

func nslookup(dns string) {
func nslookup(config *Configuration) {
klog.Infof("start to check dns connectivity")
t1 := time.Now()
addrs, err := net.LookupHost(dns)
addrs, err := net.LookupHost(config.DNS)
elpased := time.Since(t1)
if err != nil {
klog.Errorf("failed to resolve dns %s, %v", dns, err)
klog.Errorf("failed to resolve dns %s, %v", config.DNS, err)
SetDnsUnhealthyMetrics(config.NodeName)
return
}
klog.Infof("resolve dns %s to %v in %.2fms", dns, addrs, float64(elpased)/float64(time.Millisecond))
SetDnsHealthyMetrics(config.NodeName, float64(elpased)/float64(time.Millisecond))
klog.Infof("resolve dns %s to %v in %.2fms", config.DNS, addrs, float64(elpased)/float64(time.Millisecond))
}

func checkOvs() {
func checkOvs(config *Configuration) {
output, err := exec.Command("/usr/share/openvswitch/scripts/ovs-ctl", "status").CombinedOutput()
if err != nil {
klog.Errorf("check ovs status failed %v, %s", err, string(output))
SetOvsDownMetrics(config.NodeName)
return
}
klog.Infof("ovs-vswitchd and ovsdb are up")
SetOvsUpMetrics(config.NodeName)
return
}

func checkOvnController() {
func checkOvnController(config *Configuration) {
output, err := exec.Command("/usr/share/openvswitch/scripts/ovn-ctl", "status_controller").CombinedOutput()
if err != nil {
klog.Errorf("check ovn_controller status failed %v, %s", err, string(output))
SetOvnControllerDownMetrics(config.NodeName)
return
}
klog.Infof("ovn_controller is up")
SetOvnControllerUpMetrics(config.NodeName)
}
26 changes: 26 additions & 0 deletions yamls/kube-ovn-ipv6.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,18 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: HOST_IP
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- mountPath: /lib/modules
name: host-modules
Expand Down Expand Up @@ -252,3 +264,17 @@ spec:
- name: host-log
hostPath:
path: /var/log/openvswitch
---
# Service named "pinger" in the kube-ovn namespace. It selects pods labelled
# app=kube-ovn-pinger and exposes port 8080 under the port name "http".
# NOTE(review): presumably this is the port serving the pinger's Prometheus
# metrics — confirm against the pinger's listen address.
kind: Service
apiVersion: v1
metadata:
  name: pinger
  namespace: kube-ovn
  labels:
    app: kube-ovn-pinger
spec:
  selector:
    app: kube-ovn-pinger
  ports:
    - port: 8080
      name: http

0 comments on commit 7c0517b

Please sign in to comment.