Skip to content

Commit

Permalink
pinger: add port binds check between local ovs and ovn-sb
Browse files Browse the repository at this point in the history
When ovn-controller is busy or some data is lost in ovn-nb, the port bindings between the local ovs and ovn-sb can become mismatched. We are not yet sure how this happens, but by providing metrics we can notice the problem promptly and investigate it further.

(cherry picked from commit 3838a46)
  • Loading branch information
oilbeater committed Jan 2, 2020
1 parent 8435a33 commit 641d6f8
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 1 deletion.
9 changes: 9 additions & 0 deletions pkg/pinger/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ var (
[]string{
"nodeName",
})
// inconsistentPortBindingGauge reports, per "nodeName" label, the number of
// port bindings that mismatch between ovs and ovn-sb.
inconsistentPortBindingGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "pinger_inconsistent_port_binding",
Help: "The number of mismatch port bindings between ovs and ovn-sb",
},
[]string{
"nodeName",
})
apiserverHealthyGauge = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "pinger_apiserver_healthy",
Expand Down Expand Up @@ -164,6 +172,7 @@ func init() {
prometheus.MustRegister(ovsDownGauge)
prometheus.MustRegister(ovnControllerUpGauge)
prometheus.MustRegister(ovnControllerDownGauge)
prometheus.MustRegister(inconsistentPortBindingGauge)
prometheus.MustRegister(apiserverHealthyGauge)
prometheus.MustRegister(apiserverUnhealthyGauge)
prometheus.MustRegister(apiserverRequestLatencyHistogram)
Expand Down
94 changes: 93 additions & 1 deletion pkg/pinger/ping.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
package pinger

import (
"context"
"fmt"
"github.com/alauda/kube-ovn/pkg/util"
goping "github.com/sparrc/go-ping"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/klog"
"math"
"net"
"os"
"os/exec"
"strings"
"time"
)

func StartPinger(config *Configuration) {
for {
checkOvs(config)
checkOvnController(config)
checkPortBindings(config)
checkApiServer(config)
ping(config)
if config.Mode != "server" {
Expand Down Expand Up @@ -145,7 +151,10 @@ func pingExternal(config *Configuration) {
func nslookup(config *Configuration) {
klog.Infof("start to check dns connectivity")
t1 := time.Now()
addrs, err := net.LookupHost(config.DNS)
ctx, cancel := context.WithTimeout(context.TODO(), 10 * time.Second)
defer cancel()
var r net.Resolver
addrs, err := r.LookupHost(ctx, config.DNS)
elpased := time.Since(t1)
if err != nil {
klog.Errorf("failed to resolve dns %s, %v", config.DNS, err)
Expand Down Expand Up @@ -193,3 +202,86 @@ func checkApiServer(config *Configuration) {
SetApiserverHealthyMetrics(config.NodeName, float64(elpased)/float64(time.Millisecond))
return
}

// checkPortBindings compares the ports reported by the local Open vSwitch
// (checkOvsBindings) with the ports ovn-sb has bound to this node's chassis
// (checkSBBindings). The number of OVS ports that are missing from the ovn-sb
// list is published through inconsistentPortBindingGauge, labelled with
// config.NodeName.
//
// It returns the error of whichever lookup fails; in that case the gauge is
// not touched and keeps its previous value. A mismatch itself is only logged
// and exported as a metric, it is never returned as an error.
func checkPortBindings(config *Configuration) error {
	klog.Infof("start to check port binding")
	ovsBindings, err := checkOvsBindings()
	if err != nil {
		return err
	}

	sbBindings, err := checkSBBindings(config)
	if err != nil {
		return err
	}
	klog.Infof("port in sb is %v", sbBindings)

	misMatch := []string{}
	for _, port := range ovsBindings {
		// Both lookups split raw command output on newlines, which can leave
		// blank entries behind; a blank name is not a port, so never count it.
		if port == "" {
			continue
		}
		if !util.IsStringIn(port, sbBindings) {
			misMatch = append(misMatch, port)
		}
	}

	if len(misMatch) > 0 {
		klog.Errorf("%d port %v not exist in sb-bindings", len(misMatch), misMatch)
	} else {
		klog.Infof("ovs and ovn-sb binding check passed")
	}
	// A single update covers both outcomes: the gauge is 0 when everything matches.
	inconsistentPortBindingGauge.WithLabelValues(config.NodeName).Set(float64(len(misMatch)))
	return nil
}

// checkOvsBindings runs ovs-vsctl to list every local OVS interface whose
// external_ids contain a non-empty iface-id and returns those iface-id values,
// one per interface.
//
// Each output line carries the whole external_ids column of one interface.
// With --data=bare the map entries are printed as space-separated key=value
// pairs, so iface-id is not guaranteed to be the first (or only) entry on the
// line. The value is therefore looked up among the fields instead of being
// stripped from the start of the line. Lines without an iface-id entry,
// including the empty string left by the trailing newline, are skipped.
//
// On command failure the error is logged and returned with a nil slice.
func checkOvsBindings() ([]string, error) {
	output, err := exec.Command("ovs-vsctl", "--no-heading", "--data=bare", "--format=csv", "--columns=external_ids", "find", "interface", "external_ids:iface-id!=\"\"").CombinedOutput()
	if err != nil {
		klog.Errorf("failed to get ovs interface %v", err)
		return nil, err
	}

	const ifaceIDKey = "iface-id="
	lines := strings.Split(string(output), "\n")
	result := make([]string, 0, len(lines))
	for _, line := range lines {
		for _, field := range strings.Fields(line) {
			if strings.HasPrefix(field, ifaceIDKey) {
				result = append(result, strings.TrimPrefix(field, ifaceIDKey))
				// An interface has a single iface-id; ignore the remaining entries.
				break
			}
		}
	}
	return result, nil
}

// checkSBBindings asks the OVN southbound database which logical ports are
// bound to the chassis of this node.
//
// The database address is taken from the OVN_SB_SERVICE_HOST and
// OVN_SB_SERVICE_PORT environment variables. The function first resolves the
// chassis _uuid whose hostname equals config.NodeName, then lists the
// logical_port column of every port_binding row that references that chassis.
//
// The returned slice is the raw command output split on newlines, so it may
// contain empty entries (for example the one after the trailing newline).
//
// An error is returned when either ovn-sbctl call fails or when no chassis is
// found for the node.
func checkSBBindings(config *Configuration) ([]string, error) {
	sbHost := os.Getenv("OVN_SB_SERVICE_HOST")
	sbPort := os.Getenv("OVN_SB_SERVICE_PORT")
	// JoinHostPort wraps hosts that contain ':' in brackets, so an IPv6
	// service address still forms a valid tcp:[host]:port target; IPv4
	// addresses and names produce the same host:port string as before.
	dbArg := "--db=tcp:" + net.JoinHostPort(sbHost, sbPort)

	output, err := exec.Command(
		"ovn-sbctl",
		dbArg,
		"--format=csv",
		"--no-heading",
		"--data=bare",
		"--columns=_uuid",
		"find",
		"chassis",
		fmt.Sprintf("hostname=%s", config.NodeName)).CombinedOutput()
	if err != nil {
		klog.Errorf("failed to find chassis %v", err)
		return nil, err
	}

	// Trim before testing for emptiness: output consisting only of whitespace
	// (e.g. a lone newline) also means that no chassis was found.
	chassis := strings.TrimSpace(string(output))
	if chassis == "" {
		klog.Errorf("chassis for node %s not exist", config.NodeName)
		return nil, fmt.Errorf("chassis for node %s not exist", config.NodeName)
	}
	klog.Infof("chassis id is %s", chassis)

	output, err = exec.Command(
		"ovn-sbctl",
		dbArg,
		"--format=csv",
		"--no-heading",
		"--data=bare",
		"--columns=logical_port",
		"find",
		"port_binding",
		fmt.Sprintf("chassis=%s", chassis)).CombinedOutput()
	if err != nil {
		klog.Errorf("failed to list port_binding in ovn-sb %v", err)
		return nil, err
	}

	return strings.Split(string(output), "\n"), nil
}

0 comments on commit 641d6f8

Please sign in to comment.