diff --git a/.github/workflows/build-x86-image.yaml b/.github/workflows/build-x86-image.yaml index c755b396d62..e98d46a4bba 100644 --- a/.github/workflows/build-x86-image.yaml +++ b/.github/workflows/build-x86-image.yaml @@ -473,6 +473,10 @@ jobs: E2E_NETWORK_MODE: ${{ matrix.mode }} run: make k8s-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -630,6 +634,10 @@ jobs: working-directory: ${{ env.E2E_DIR }} run: make k8s-netpol-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -762,6 +770,10 @@ jobs: working-directory: ${{ env.E2E_DIR }} run: make cyclonus-netpol-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -920,6 +932,10 @@ jobs: make kind-install-kubevirt make kube-ovn-kubevirt-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -1068,6 +1084,10 @@ jobs: E2E_IP_FAMILY: ${{ matrix.ip-family }} run: make kube-ovn-ic-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -1191,6 +1211,10 @@ jobs: E2E_IP_FAMILY: ${{ matrix.ip-family }} run: make kube-ovn-multus-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -1255,6 +1279,10 @@ jobs: ENABLE_SSL: "${{ matrix.ssl }}" run: make kind-install-chart + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Uninstall Kube-OVN run: make kind-uninstall-chart @@ -1301,6 +1329,10 @@ jobs: - name: Install Kube-OVN run: make kind-install-underlay-logical-gateway-dual + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: 
Cleanup run: sh -x dist/images/cleanup.sh @@ -1349,6 +1381,10 @@ jobs: ENABLE_LB: "false" run: make kind-install + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Cleanup run: sh -x dist/images/cleanup.sh @@ -1397,6 +1433,10 @@ jobs: ENABLE_NP: "false" run: make kind-install + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Cleanup run: sh -x dist/images/cleanup.sh @@ -1502,6 +1542,10 @@ jobs: E2E_BRANCH: ${{ github.base_ref || github.ref_name }} run: make kube-ovn-lb-svc-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + webhook-e2e: name: Webhook E2E needs: @@ -1597,6 +1641,10 @@ jobs: E2E_BRANCH: ${{ github.base_ref || github.ref_name }} run: make kube-ovn-webhook-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -1653,6 +1701,10 @@ jobs: - name: Install Kube-OVN run: make kind-install + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -1784,6 +1836,10 @@ jobs: E2E_NETWORK_MODE: ${{ matrix.mode }} run: make k8s-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -1914,6 +1970,10 @@ jobs: make kube-ovn-security-e2e make kube-ovn-ha-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -2004,6 +2064,10 @@ jobs: working-directory: ${{ env.E2E_DIR }} run: make kube-ovn-submariner-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -2122,6 +2186,10 @@ jobs: E2E_BRANCH: ${{ github.base_ref || github.ref_name }} run: make iptables-vpc-nat-gw-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make 
check-kube-ovn-pod-restarts + - name: kubectl ko log if: failure() run: | @@ -2235,6 +2303,10 @@ jobs: E2E_BRANCH: ${{ github.base_ref || github.ref_name }} run: make ovn-vpc-nat-gw-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + push: name: Push Images needs: diff --git a/.github/workflows/scheduled-e2e.yaml b/.github/workflows/scheduled-e2e.yaml index a9c44c851f9..2c859361a3b 100644 --- a/.github/workflows/scheduled-e2e.yaml +++ b/.github/workflows/scheduled-e2e.yaml @@ -105,6 +105,10 @@ jobs: E2E_NETWORK_MODE: ${{ matrix.mode }} run: make k8s-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + k8s-netpol-e2e: name: Kubernetes Network Policy E2E runs-on: ubuntu-22.04 @@ -187,6 +191,10 @@ jobs: - name: Run E2E run: make k8s-netpol-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + cyclonus-netpol-e2e: name: Cyclonus Network Policy E2E runs-on: ubuntu-22.04 @@ -246,6 +254,10 @@ jobs: - name: Run E2E run: make cyclonus-netpol-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + kube-ovn-conformance-e2e: name: Kube-OVN Conformance E2E runs-on: ubuntu-22.04 @@ -330,6 +342,10 @@ jobs: E2E_NETWORK_MODE: ${{ matrix.mode }} run: make kube-ovn-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + kube-ovn-ic-conformance-e2e: name: Kube-OVN IC Conformance E2E runs-on: ubuntu-22.04 @@ -404,6 +420,10 @@ jobs: E2E_BRANCH: ${{ matrix.branch }} run: make kube-ovn-ic-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + underlay-logical-gateway-installation-test: name: Underlay Logical Gateway Installation Test runs-on: ubuntu-22.04 @@ -451,6 +471,10 @@ jobs: docker pull kubeovn/kube-ovn:$version VERSION=$version make kind-install-underlay-logical-gateway-dual + - name: Check kube ovn pod restarts + run: | + make 
check-kube-ovn-pod-restarts + - name: Cleanup working-directory: test/e2e/kube-ovn/branches/${{ matrix.branch }} run: sh dist/images/cleanup.sh @@ -504,6 +528,10 @@ jobs: docker pull kubeovn/kube-ovn:$version VERSION=$version make kind-install + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Cleanup working-directory: test/e2e/kube-ovn/branches/${{ matrix.branch }} run: sh dist/images/cleanup.sh @@ -557,6 +585,10 @@ jobs: docker pull kubeovn/kube-ovn:$version VERSION=$version make kind-install + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Cleanup working-directory: test/e2e/kube-ovn/branches/${{ matrix.branch }} run: sh dist/images/cleanup.sh @@ -632,6 +664,10 @@ jobs: docker pull kubeovn/vpc-nat-gateway:$version VERSION=$version make kind-install-lb-svc + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Run E2E run: make kube-ovn-lb-svc-conformance-e2e @@ -706,6 +742,10 @@ jobs: VERSION=$version make kind-install VERSION=$version make kind-install-kubevirt + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Run E2E run: make kube-ovn-kubevirt-e2e @@ -780,6 +820,10 @@ jobs: - name: Run E2E run: make kube-ovn-webhook-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + installation-compatibility-test: name: Installation Compatibility Test runs-on: ubuntu-22.04 @@ -830,6 +874,10 @@ jobs: working-directory: test/e2e/kube-ovn/branches/${{ matrix.branch }} run: sh dist/images/cleanup.sh + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + cilium-chaining-e2e: name: Cilium Chaining E2E runs-on: ubuntu-22.04 @@ -914,6 +962,10 @@ jobs: E2E_CILIUM_CHAINING: "true" run: make k8s-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Cleanup working-directory: test/e2e/kube-ovn/branches/${{ 
matrix.branch }} run: sh dist/images/cleanup.sh @@ -1008,6 +1060,10 @@ jobs: make kube-ovn-security-e2e make kube-ovn-ha-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Cleanup run: sh dist/images/cleanup.sh @@ -1109,6 +1165,10 @@ jobs: E2E_BRANCH: ${{ env.VERSION_TO }} run: make k8s-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Cleanup working-directory: test/e2e/kube-ovn/branches/${{ env.VERSION_TO }} run: sh dist/images/cleanup.sh @@ -1167,6 +1227,10 @@ jobs: - name: Run E2E run: make kube-ovn-submariner-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + - name: Cleanup working-directory: test/e2e/kube-ovn/branches/${{ matrix.branch }} run: sh dist/images/cleanup.sh @@ -1241,6 +1305,10 @@ jobs: - name: Run E2E run: make iptables-vpc-nat-gw-conformance-e2e + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + ovn-vpc-nat-gw-conformance-e2e: name: OVN VPC NAT Gateway E2E runs-on: ubuntu-22.04 @@ -1310,3 +1378,8 @@ jobs: - name: Run E2E run: make ovn-vpc-nat-gw-conformance-e2e + + - name: Check kube ovn pod restarts + run: | + make check-kube-ovn-pod-restarts + \ No newline at end of file diff --git a/Makefile b/Makefile index 01ef794c096..5e35ff21fd2 100644 --- a/Makefile +++ b/Makefile @@ -963,6 +963,10 @@ kind-clean-bgp-ha: $(CLAB_IMAGE) clab destroy -t /clab-bgp/clab.yaml @$(MAKE) kind-clean +.PHONY: check-kube-ovn-pod-restarts +check-kube-ovn-pod-restarts: + bash dist/images/check-kube-ovn-pod-restarts.sh + .PHONY: uninstall uninstall: bash dist/images/cleanup.sh diff --git a/cmd/daemon/cniserver.go b/cmd/daemon/cniserver.go index edc1c04b813..b2b409271d6 100644 --- a/cmd/daemon/cniserver.go +++ b/cmd/daemon/cniserver.go @@ -49,9 +49,10 @@ func CmdMain() { util.LogFatalAndExit(err, "failed to initialize ovn chassis annotation") } - if err = daemon.InitMirror(config); err != 
nil { + if err := Retry(util.MirrosRetryMaxTimes, util.MirrosRetryInterval, daemon.InitMirror, config); err != nil { util.LogFatalAndExit(err, "failed to initialize ovs mirror") } + klog.Info("init node gw") if err = daemon.InitNodeGateway(config); err != nil { util.LogFatalAndExit(err, "failed to initialize node gateway") diff --git a/dist/images/check-kube-ovn-pod-restarts.sh b/dist/images/check-kube-ovn-pod-restarts.sh new file mode 100644 index 00000000000..bc0fa916640 --- /dev/null +++ b/dist/images/check-kube-ovn-pod-restarts.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +namespace="kube-system" + +# check if there are any crashed pods +crashedPodsNum=$(kubectl get po -n "$namespace" -o wide | grep -E "ovn-controller|ovn-pinger|ovn-monitor|ovn-cni|ovn-central|ovs-ovn" | awk '{print $3$4}' | grep -v -c "Running0") +if [ "$crashedPodsNum" -gt 0 ]; then + echo "some ovn pods are not running" + kubectl get po -n "$namespace" -o wide | grep -E "ovn-controller|ovn-pinger|ovn-monitor|ovn-cni|ovn-central|ovs-ovn" + crashedPods=$(kubectl get po -n "$namespace" -o wide | grep -E "ovn-controller|ovn-pinger|ovn-monitor|ovn-cni|ovn-central|ovs-ovn" | awk '{print $1 " " $3$4}' | grep -v "Running0" | awk '{print $1}') + for crashedPod in $crashedPods; do + echo "kubectl logs -p -n $namespace $crashedPod | tail -n 100" + kubectl logs -p -n "$namespace" "$crashedPod" | tail -n 100 + echo "PLEASE CHECK THE ERROR LOGS ABOVE /|\ /|\ /|\ " + done + + echo "some ovn pods crashed" + exit 1 +else + echo "all ovn pods are ok" + exit 0 +fi diff --git a/dist/images/kubectl-ko b/dist/images/kubectl-ko index 540e345716a..e7182a5aae2 100755 --- a/dist/images/kubectl-ko +++ b/dist/images/kubectl-ko @@ -721,6 +721,8 @@ diagnose(){ node) nodeName="$2" kubectl get no "$nodeName" > /dev/null + echo "### make sure kube-ovn-pinger running on node $nodeName" + kubectl -n $KUBE_OVN_NS get po -l app=kube-ovn-pinger pinger=$(kubectl -n $KUBE_OVN_NS get po -l app=kube-ovn-pinger -o 
'jsonpath={.items[?(@.spec.nodeName=="'$nodeName'")].metadata.name}') if [ ! -n "$pinger" ]; then echo "Error: No kube-ovn-pinger running on node $nodeName" diff --git a/pkg/controller/subnet.go b/pkg/controller/subnet.go index f56bf9e3648..3e1c3aacec0 100644 --- a/pkg/controller/subnet.go +++ b/pkg/controller/subnet.go @@ -85,13 +85,18 @@ func (c *Controller) enqueueUpdateSubnet(oldObj, newObj interface{}) { } if oldSubnet.Spec.Vpc != newSubnet.Spec.Vpc && - !(oldSubnet.Spec.Vpc == "" && newSubnet.Spec.Vpc == c.config.ClusterRouter || - oldSubnet.Spec.Vpc == c.config.ClusterRouter && newSubnet.Spec.Vpc == "") { + !(oldSubnet.Spec.Vpc == "" && newSubnet.Spec.Vpc == c.config.ClusterRouter || oldSubnet.Spec.Vpc == c.config.ClusterRouter && newSubnet.Spec.Vpc == "") { + + if newSubnet.Annotations == nil { + newSubnet.Annotations = make(map[string]string) + } + if oldSubnet.Spec.Vpc == "" { newSubnet.Annotations[util.VpcLastName] = c.config.ClusterRouter } else { newSubnet.Annotations[util.VpcLastName] = oldSubnet.Spec.Vpc } + c.updateVpcStatusQueue.Add(oldSubnet.Spec.Vpc) } diff --git a/pkg/daemon/ovs.go b/pkg/daemon/ovs.go index 7af13f65461..2d78711bf58 100644 --- a/pkg/daemon/ovs.go +++ b/pkg/daemon/ovs.go @@ -61,6 +61,7 @@ func configureGlobalMirror(portName string, mtu int) error { } if !nicExist { + klog.Infof("nic %s not exist, create it", portName) raw, err := ovs.Exec(ovs.MayExist, "add-port", "br-int", portName, "--", "set", "interface", portName, "type=internal", "--", "clear", "bridge", "br-int", "mirrors", "--", @@ -68,17 +69,18 @@ func configureGlobalMirror(portName string, mtu int) error { "--id=@m", "create", "mirror", fmt.Sprintf("name=%s", util.MirrorDefaultName), "select_all=true", "output_port=@mirror0", "--", "add", "bridge", "br-int", "mirrors", "@m") if err != nil { - klog.Errorf("failed to configure mirror nic %s %q", portName, raw) + klog.Errorf("failed to configure mirror nic %s, %q, %v", portName, raw, err) return fmt.Errorf(raw) } } else { + 
klog.Infof("nic %s exist, configure it", portName) raw, err := ovs.Exec(ovs.MayExist, "add-port", "br-int", portName, "--", "clear", "bridge", "br-int", "mirrors", "--", "--id=@mirror0", "get", "port", portName, "--", "--id=@m", "create", "mirror", fmt.Sprintf("name=%s", util.MirrorDefaultName), "select_all=true", "output_port=@mirror0", "--", "add", "bridge", "br-int", "mirrors", "@m") if err != nil { - klog.Errorf("failed to configure mirror nic %s %q", portName, raw) + klog.Errorf("failed to configure mirror nic %s, %q, %v", portName, raw, err) return fmt.Errorf(raw) } } @@ -94,6 +96,7 @@ func configureEmptyMirror(portName string, mtu int) error { } if !nicExist { + klog.Infof("nic %s not exist, create it", portName) raw, err := ovs.Exec(ovs.MayExist, "add-port", "br-int", portName, "--", "set", "interface", portName, "type=internal", "--", "clear", "bridge", "br-int", "mirrors", "--", @@ -101,10 +104,11 @@ func configureEmptyMirror(portName string, mtu int) error { "--id=@m", "create", "mirror", fmt.Sprintf("name=%s", util.MirrorDefaultName), "output_port=@mirror0", "--", "add", "bridge", "br-int", "mirrors", "@m") if err != nil { - klog.Errorf("failed to configure mirror nic %s %q", portName, raw) + klog.Errorf("failed to configure mirror nic %s %q, %v", portName, raw, err) return fmt.Errorf(raw) } } else { + klog.Infof("nic %s exist, configure it", portName) raw, err := ovs.Exec(ovs.MayExist, "add-port", "br-int", portName, "--", "clear", "bridge", "br-int", "mirrors", "--", "--id=@mirror0", "get", "port", portName, "--", diff --git a/pkg/daemon/ovs_linux.go b/pkg/daemon/ovs_linux.go index 2b941f4d111..8a56d2042d5 100644 --- a/pkg/daemon/ovs_linux.go +++ b/pkg/daemon/ovs_linux.go @@ -917,11 +917,13 @@ func (c *Controller) patchNodeExternalGwLabel(key string, enabled bool) error { func configureMirrorLink(portName string, _ int) error { mirrorLink, err := netlink.LinkByName(portName) if err != nil { + klog.Error(err) return fmt.Errorf("can not find mirror nic 
%s: %v", portName, err) } if mirrorLink.Attrs().OperState != netlink.OperUp { if err = netlink.LinkSetUp(mirrorLink); err != nil { + klog.Error(err) return fmt.Errorf("can not set mirror nic %s up: %v", portName, err) } } @@ -932,10 +934,12 @@ func configureMirrorLink(portName string, _ int) error { func configureNic(link, ip string, macAddr net.HardwareAddr, mtu int, detectIPConflict bool) error { nodeLink, err := netlink.LinkByName(link) if err != nil { + klog.Error(err) return fmt.Errorf("can not find nic %s: %v", link, err) } if err = netlink.LinkSetHardwareAddr(nodeLink, macAddr); err != nil { + klog.Error(err) return fmt.Errorf("can not set mac address to nic %s: %v", link, err) } @@ -952,6 +956,7 @@ func configureNic(link, ip string, macAddr net.HardwareAddr, mtu int, detectIPCo if nodeLink.Attrs().OperState != netlink.OperUp { if err = netlink.LinkSetUp(nodeLink); err != nil { + klog.Error(err) return fmt.Errorf("can not set node nic %s up: %v", link, err) } } @@ -960,6 +965,7 @@ func configureNic(link, ip string, macAddr net.HardwareAddr, mtu int, detectIPCo ipAddMap := make(map[string]netlink.Addr) ipAddrs, err := netlink.AddrList(nodeLink, unix.AF_UNSPEC) if err != nil { + klog.Error(err) return fmt.Errorf("can not get addr %s: %v", nodeLink, err) } for _, ipAddr := range ipAddrs { @@ -987,6 +993,7 @@ func configureNic(link, ip string, macAddr net.HardwareAddr, mtu int, detectIPCo for ip, addr := range ipDelMap { klog.Infof("delete ip address %s on %s", ip, link) if err = netlink.AddrDel(nodeLink, &addr); err != nil { + klog.Error(err) return fmt.Errorf("delete address %s: %v", addr, err) } } @@ -1012,6 +1019,7 @@ func configureNic(link, ip string, macAddr net.HardwareAddr, mtu int, detectIPCo klog.Infof("add ip address %s to %s", ip, link) if err = netlink.AddrAdd(nodeLink, &addr); err != nil { + klog.Error(err) return fmt.Errorf("can not add address %v to nic %s: %v", addr, link, err) } } @@ -1486,7 +1494,9 @@ func (csh cniServerHandler) 
configureNicWithInternalPort(podName, podNamespace, fmt.Sprintf("external_ids:ip=%s", ipStr), fmt.Sprintf("external_ids:pod_netns=%s", netns)) if err != nil { - return containerNicName, nil, fmt.Errorf("add nic to ovs failed %v: %q", err, output) + err := fmt.Errorf("add nic to ovs failed %v: %q", err, output) + klog.Error(err) + return containerNicName, nil, err } // container nic must use same mac address from pod annotation, otherwise ovn will reject these packets by default diff --git a/pkg/daemon/ovs_windows.go b/pkg/daemon/ovs_windows.go index 0fd513f1a6f..adff96ab9e4 100644 --- a/pkg/daemon/ovs_windows.go +++ b/pkg/daemon/ovs_windows.go @@ -122,7 +122,9 @@ func (csh cniServerHandler) configureNic(podName, podNamespace, provider, netns, fmt.Sprintf("external_ids:pod_namespace=%s", podNamespace), fmt.Sprintf("external_ids:ip=%s", ipAddr)) if err != nil { - return nil, fmt.Errorf("failed to add OVS port %s, %v: %q", epName, err, output) + err := fmt.Errorf("failed to add OVS port %s, %v: %q", epName, err, output) + klog.Error(err) + return nil, err } if err = ovs.SetInterfaceBandwidth(podName, podNamespace, ifaceID, egress, ingress); err != nil { diff --git a/pkg/ovs/ovn-nb_global.go b/pkg/ovs/ovn-nb_global.go index 38a41db5771..74b8d1d0bc9 100644 --- a/pkg/ovs/ovn-nb_global.go +++ b/pkg/ovs/ovn-nb_global.go @@ -99,11 +99,11 @@ func (c *OVNNbClient) SetNbGlobalOptions(key string, value interface{}) error { return nil } - options := make(map[string]string, len(nbGlobal.Options)+1) - for k, v := range nbGlobal.Options { - options[k] = v + if nbGlobal.Options == nil { + nbGlobal.Options = make(map[string]string) } nbGlobal.Options[key] = v + if err := c.UpdateNbGlobal(nbGlobal, &nbGlobal.Options); err != nil { return fmt.Errorf("failed to set nb global option %s to %v: %v", key, value, err) } diff --git a/pkg/ovs/ovn.go b/pkg/ovs/ovn.go index cad3a8e71af..9de23d922ac 100644 --- a/pkg/ovs/ovn.go +++ b/pkg/ovs/ovn.go @@ -78,10 +78,22 @@ func NewOvnNbClient(ovnNbAddr 
string, ovnNbTimeout int) (*OVNNbClient, error) { client.WithTable(&ovnnb.NBGlobal{}), client.WithTable(&ovnnb.PortGroup{}), } - nbClient, err := ovsclient.NewOvsDbClient(ovsclient.NBDB, ovnNbAddr, dbModel, monitors) - if err != nil { - klog.Errorf("failed to create OVN NB client: %v", err) - return nil, err + + try := 0 + maxRetry := 60 + var nbClient client.Client + for { + nbClient, err = ovsclient.NewOvsDbClient(ovsclient.NBDB, ovnNbAddr, dbModel, monitors) + if err != nil { + klog.Errorf("failed to create OVN NB client: %v", err) + } else { + break + } + if try >= maxRetry { + return nil, err + } + time.Sleep(2 * time.Second) + try++ } c := &OVNNbClient{ @@ -104,10 +116,21 @@ func NewOvnSbClient(ovnSbAddr string, ovnSbTimeout int) (*OVNSbClient, error) { client.WithTable(&ovnsb.Chassis{}), // TODO:// monitor other necessary tables in ovsdb/ovnsb/model.go } - sbClient, err := ovsclient.NewOvsDbClient(ovsclient.SBDB, ovnSbAddr, dbModel, monitors) - if err != nil { - klog.Errorf("failed to create OVN SB client: %v", err) - return nil, err + maxRetry := 60 + try := 0 + var sbClient client.Client + for { + sbClient, err = ovsclient.NewOvsDbClient(ovsclient.SBDB, ovnSbAddr, dbModel, monitors) + if err != nil { + klog.Errorf("failed to create OVN SB client: %v", err) + } else { + break + } + if try >= maxRetry { + return nil, err + } + time.Sleep(2 * time.Second) + try++ } c := &OVNSbClient{ diff --git a/pkg/util/const.go b/pkg/util/const.go index bfbed679330..c929799901b 100644 --- a/pkg/util/const.go +++ b/pkg/util/const.go @@ -211,6 +211,9 @@ const ( HostnameEnv = "KUBE_NODE_NAME" + MirrosRetryMaxTimes = 5 + MirrosRetryInterval = 1 + ChassisRetryMaxTimes = 5 ChassisCniDaemonRetryInterval = 1 ChassisControllerRetryInterval = 3