docs: add pinger/controller/cni metrics
oilbeater committed Oct 19, 2020
1 parent ee86545 commit c35a159
Showing 6 changed files with 74 additions and 41 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@ dist/images/kube-ovn-gateway
dist/images/kube-ovn-webhook
dist/images/kube-ovn-pinger
dist/images/kube-ovn-speaker
dist/images/kube-ovn-monitor
kube-ovn.yaml
kube-ovn-crd.yaml
ovn.yaml
1 change: 1 addition & 0 deletions README.md
@@ -81,6 +81,7 @@ If you want to install Kubernetes from scratch, you can try [kubespray](https://
- [IPv6](docs/ipv6.md)
- [Tracing/Diagnose/Dump Traffic with Kubectl Plugin](docs/kubectl-plugin.md)
- [Prometheus Integration](docs/prometheus.md)
- [Metrics](docs/ovn-ovs-monitor.md)

## Contribution
We are looking forward to your PR!
59 changes: 53 additions & 6 deletions docs/ovn-ovs-monitor.md
@@ -1,11 +1,10 @@
# OVN/OVS Monitor Statistics

This document shows monitor metrics about OVN and OVS.
# Kube-OVN Monitor Metrics

This document shows Kube-OVN monitor metrics.

Type | Metric | Description
---|---|---
OVN_Monitor | |
OVN_Monitor | | OVN NB/SB/Northd metrics
1 | ovn_status | OVN health status. The values are: healthy (1), unhealthy (0).
2 | ovn_info | This metric provides basic information about OVN. It is always set to 1.
3 | failed_req_count | The number of failed requests to the OVN stack.
@@ -34,7 +33,7 @@ OVN_Monitor | |
26 | cluster_outbound_connections_total | The total number of outbound connections from the server.
27 | cluster_inbound_connections_error_total | The total number of failed inbound connections to the server.
28 | cluster_outbound_connections_error_total | The total number of failed outbound connections from the server.
OVS_Monitor | |
OVS_Monitor | | ovsdb/vswitchd metrics
1 | ovs_status | OVS health status. The values are: healthy (1), unhealthy (0).
2 | ovs_info | This metric provides basic information about OVS. It is always set to 1.
3 | failed_req_count | The number of failed requests to the OVS stack.
@@ -70,4 +69,52 @@ OVS_Monitor | |
33 | interface_rx_over_err | Represents the number of packets with RX overrun received by OVS interface.
34 | interface_tx_dropped | Represents the number of output packets dropped by OVS interface.
35 | interface_tx_errors | Represents the total number of transmit errors by OVS interface.
36 | interface_collisions | Represents the number of collisions on OVS interface.
Kube-OVN-Pinger | | Network quality metrics
1 | pinger_ovs_up | If the ovs on the node is up
2 | pinger_ovs_down | If the ovs on the node is down
3 | pinger_ovn_controller_up | If the ovn_controller on the node is up
4 | pinger_ovn_controller_down | If the ovn_controller on the node is down
5 | pinger_inconsistent_port_binding | The number of mismatched port bindings between ovs and ovn-sb
6 | pinger_apiserver_healthy | If the apiserver request is healthy on this node
7 | pinger_apiserver_unhealthy | If the apiserver request is unhealthy on this node
8 | pinger_apiserver_latency_ms | The latency histogram (ms) for node requests to the apiserver
9 | pinger_internal_dns_healthy | If the internal dns request is healthy on this node
10 | pinger_internal_dns_unhealthy | If the internal dns request is unhealthy on this node
11 | pinger_internal_dns_latency_ms | The latency histogram (ms) for node requests to the internal dns
12 | pinger_external_dns_healthy | If the external dns request is healthy on this node
13 | pinger_external_dns_unhealthy | If the external dns request is unhealthy on this node
14 | pinger_external_dns_latency_ms | The latency histogram (ms) for node requests to the external dns
15 | pinger_pod_ping_latency_ms | The latency histogram (ms) for pod-to-pod ping
16 | pinger_pod_ping_lost_total | The lost packet count for pod-to-pod ping
17 | pinger_node_ping_latency_ms | The latency histogram (ms) for pod-to-node ping
18 | pinger_node_ping_lost_total | The lost packet count for pod-to-node ping
19 | pinger_external_ping_latency_ms | The latency histogram (ms) for pod ping to external addresses
20 | pinger_node_external_lost_total | The lost packet count for pod ping to external addresses
Kube-OVN-Controller | | Controller metrics
1 | rest_client_request_latency_seconds | Request latency in seconds. Broken down by verb and URL
2 | rest_client_requests_total | Number of HTTP requests, partitioned by status code, method, and host
3 | lists_total | Total number of API lists done by the reflectors
4 | list_duration_seconds | How long an API list takes to return and decode for the reflectors
5 | items_per_list | How many items an API list returns to the reflectors
6 | watches_total | Total number of API watches done by the reflectors
7 | short_watches_total | Total number of short API watches done by the reflectors
8 | watch_duration_seconds | How long an API watch takes to return and decode for the reflectors
9 | items_per_watch | How many items an API watch returns to the reflectors
10 | last_resource_version | Last resource version seen for the reflectors
11 | ovs_client_request_latency_milliseconds | The latency histogram (ms) for ovs requests
Kube-OVN-CNI | | CNI metrics
1 | cni_op_latency_seconds | The latency (seconds) for cni operations
2 | cni_wait_address_seconds_total | Total seconds the cni has waited for the controller to assign an address
3 | cni_wait_connectivity_seconds_total | Total seconds the cni has waited for the assigned address to become reachable in the overlay network
4 | rest_client_request_latency_seconds | Request latency in seconds. Broken down by verb and URL
5 | rest_client_requests_total | Number of HTTP requests, partitioned by status code, method, and host
6 | lists_total | Total number of API lists done by the reflectors
7 | list_duration_seconds | How long an API list takes to return and decode for the reflectors
8 | items_per_list | How many items an API list returns to the reflectors
9 | watches_total | Total number of API watches done by the reflectors
10 | short_watches_total | Total number of short API watches done by the reflectors
11 | watch_duration_seconds | How long an API watch takes to return and decode for the reflectors
12 | items_per_watch | How many items an API watch returns to the reflectors
13 | last_resource_version | Last resource version seen for the reflectors
14 | ovs_client_request_latency_milliseconds | The latency histogram (ms) for ovs requests
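
The `_ms` and `_seconds` histograms above export per-bucket `_bucket` series, so quantiles can be derived with `histogram_quantile`. As a hedged illustration (the Prometheus address, quantile, and time window are assumptions, not part of this commit), a small Go program using client_golang's HTTP API could query the 95th-percentile pod-to-pod ping latency:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Assumed Prometheus server address; adjust for your cluster.
	client, err := api.NewClient(api.Config{Address: "http://prometheus.monitoring:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := v1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// 95th percentile of pod peer ping latency over the last 5 minutes,
	// derived from the pinger_pod_ping_latency_ms histogram buckets.
	query := `histogram_quantile(0.95, sum(rate(pinger_pod_ping_latency_ms_bucket[5m])) by (le))`
	result, warnings, err := promAPI.Query(ctx, query, time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	fmt.Println(result)
}
```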
22 changes: 3 additions & 19 deletions docs/prometheus.md
@@ -2,25 +2,9 @@ Pinger makes network requests between pods/nodes/services/dns to test the connec

## Prometheus Integration

Pinger exposes metrics at `:8080/metrics`, it will show following metrics

```bash
pinger_ovs_up
pinger_ovs_down
pinger_ovn_controller_up
pinger_ovn_controller_down
pinger_dns_healthy
pinger_dns_unhealthy
pinger_dns_latency_ms
pinger_pod_ping_latency_ms
pinger_pod_ping_lost_total
pinger_node_ping_latency_ms
pinger_node_ping_lost_total
```

Kube-OVN-Controller expose metrics at `10660/metrics`, it will show controller runtime metrics.

You can use kube-prometheus to scrape the metrics. The related ServiceMonitor yaml can be found [here](../dist/monitoring)
Kube-OVN exposes metrics for its own components and for overall network quality. All exposed metrics can be found [here](ovn-ovs-monitor.md).

You can use kube-prometheus to scrape the metrics. The related ServiceMonitor yaml can be found [here](../dist/monitoring).
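
For reference, here is a minimal sketch of how a client_golang-based component serves such an endpoint (an illustration, not the exact Kube-OVN wiring; pinger, per the removed text above, listens on `:8080/metrics`):

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Serve every collector registered with the default registry.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```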

## Grafana Dashboard

2 changes: 1 addition & 1 deletion pkg/daemon/metrics.go
@@ -14,7 +14,7 @@ var (
cniOperationHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "cni_op_latency_seconds",
Help: "the latency seconds for cni operations",
Help: "The latency seconds for cni operations",
Buckets: prometheus.ExponentialBuckets(0.001, 2, 10),
}, []string{
"node_name",
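
For context on the hunk above: `prometheus.ExponentialBuckets(0.001, 2, 10)` yields ten buckets from 1 ms up to about 0.5 s. A self-contained sketch of defining and observing such a histogram follows; the `method` label and the label values are assumptions, since the full label list is elided in the hunk:

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Hypothetical mirror of cni_op_latency_seconds; the real label set is
// partially elided in the diff above, so "method" is an assumed label.
var cniOpLatency = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "cni_op_latency_seconds",
		Help:    "The latency (seconds) for cni operations",
		Buckets: prometheus.ExponentialBuckets(0.001, 2, 10), // 1ms .. ~0.5s
	},
	[]string{"node_name", "method"},
)

func main() {
	prometheus.MustRegister(cniOpLatency)

	start := time.Now()
	// ... the CNI ADD/DEL work would happen here ...
	cniOpLatency.WithLabelValues("node-1", "add").
		Observe(time.Since(start).Seconds())
}
```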
30 changes: 15 additions & 15 deletions pkg/pinger/metrics.go
@@ -46,23 +46,23 @@ var (
apiserverHealthyGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "pinger_apiserver_healthy",
Help: "if the apiserver request is healthy on this node",
Help: "If the apiserver request is healthy on this node",
},
[]string{
"nodeName",
})
apiserverUnhealthyGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "pinger_apiserver_unhealthy",
Help: "if the apiserver request is unhealthy on this node",
Help: "If the apiserver request is unhealthy on this node",
},
[]string{
"nodeName",
})
apiserverRequestLatencyHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "pinger_apiserver_latency_ms",
Help: "the latency ms histogram the node request apiserver",
Help: "The latency ms histogram the node request apiserver",
Buckets: []float64{2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50},
},
[]string{
@@ -71,23 +71,23 @@ var (
internalDnsHealthyGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "pinger_internal_dns_healthy",
Help: "if the dns request is healthy on this node",
Help: "If the internal dns request is healthy on this node",
},
[]string{
"nodeName",
})
internalDnsUnhealthyGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "pinger_internal_dns_unhealthy",
Help: "if the dns request is unhealthy on this node",
Help: "If the internal dns request is unhealthy on this node",
},
[]string{
"nodeName",
})
internalDnsRequestLatencyHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "pinger_internal_dns_latency_ms",
Help: "the latency ms histogram the node request dns",
Help: "The latency ms histogram the node request internal dns",
Buckets: []float64{2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50},
},
[]string{
@@ -96,23 +96,23 @@ var (
externalDnsHealthyGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "pinger_external_dns_healthy",
Help: "if the dns request is healthy on this node",
Help: "If the external dns request is healthy on this node",
},
[]string{
"nodeName",
})
externalDnsUnhealthyGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "pinger_external_dns_unhealthy",
Help: "if the dns request is unhealthy on this node",
Help: "If the external dns request is unhealthy on this node",
},
[]string{
"nodeName",
})
externalDnsRequestLatencyHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "pinger_external_dns_latency_ms",
Help: "the latency ms histogram the node request dns",
Help: "The latency ms histogram the node request external dns",
Buckets: []float64{2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50},
},
[]string{
@@ -121,7 +121,7 @@ var (
podPingLatencyHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "pinger_pod_ping_latency_ms",
Help: "the latency ms histogram for pod peer ping",
Help: "The latency ms histogram for pod peer ping",
Buckets: []float64{.25, .5, 1, 2, 5, 10, 30},
},
[]string{
@@ -135,7 +135,7 @@ var (
podPingLostCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "pinger_pod_ping_lost_total",
Help: "the lost count for pod peer ping",
Help: "The lost count for pod peer ping",
}, []string{
"src_node_name",
"src_node_ip",
@@ -147,7 +147,7 @@ var (
nodePingLatencyHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "pinger_node_ping_latency_ms",
Help: "the latency ms histogram for pod ping node",
Help: "The latency ms histogram for pod ping node",
Buckets: []float64{.25, .5, 1, 2, 5, 10, 30},
},
[]string{
@@ -160,7 +160,7 @@ var (
nodePingLostCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "pinger_node_ping_lost_total",
Help: "the lost count for pod ping node",
Help: "The lost count for pod ping node",
}, []string{
"src_node_name",
"src_node_ip",
@@ -171,7 +171,7 @@ var (
externalPingLatencyHistogram = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "pinger_external_ping_latency_ms",
Help: "the latency ms histogram for pod ping external address",
Help: "The latency ms histogram for pod ping external address",
Buckets: []float64{.25, .5, 1, 2, 5, 10, 30, 50, 100},
},
[]string{
@@ -183,7 +183,7 @@ var (
externalPingLostCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "pinger_node_external_lost_total",
Help: "the lost count for pod ping external address",
Help: "The lost count for pod ping external address",
}, []string{
"src_node_name",
"src_node_ip",
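
The healthy/unhealthy gauges above come in mirrored pairs, with exactly one of the pair set to 1 at any time. A hedged sketch of a helper that keeps one pair in sync (the function is hypothetical, assuming it lives in the same package as the vars above):

```go
// Hypothetical helper, same package as the metric vars above.
// Exactly one of the mirrored gauges is 1 at any time.
func setAPIServerHealth(nodeName string, healthy bool) {
	if healthy {
		apiserverHealthyGauge.WithLabelValues(nodeName).Set(1)
		apiserverUnhealthyGauge.WithLabelValues(nodeName).Set(0)
	} else {
		apiserverHealthyGauge.WithLabelValues(nodeName).Set(0)
		apiserverUnhealthyGauge.WithLabelValues(nodeName).Set(1)
	}
}
```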
