From d1cd3dddfee6ff61e748eefbf987d1334f3364cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E7=A5=96=E5=BB=BA?=
Date: Tue, 14 Mar 2023 20:27:23 +0800
Subject: [PATCH] fix ovs-ovn startup/restart (#2467)

---
 Makefile                           |  2 +-
 dist/images/db_autocheck_script.sh |  2 +-
 dist/images/install.sh             |  1 +
 dist/images/kubectl-ko             |  6 +-
 dist/images/restore-ovn-nb-db.sh   |  4 +-
 dist/images/start-ovs.sh           | 90 ++++--------------------------
 docs/performance-tuning.md         | 46 +++++++--------
 7 files changed, 43 insertions(+), 108 deletions(-)

diff --git a/Makefile b/Makefile
index 6458642fe6d..173161e20cb 100644
--- a/Makefile
+++ b/Makefile
@@ -623,7 +623,7 @@ kind-reload: kind-reload-ovs
 
 .PHONY: kind-reload-ovs
 kind-reload-ovs: kind-load-image
-	kubectl delete pod -n kube-system -l app=ovs
+	kubectl -n kube-system rollout restart ds ovs-ovn
 
 .PHONY: kind-clean
 kind-clean:
diff --git a/dist/images/db_autocheck_script.sh b/dist/images/db_autocheck_script.sh
index caa96f7ae43..6ec99cb81fa 100644
--- a/dist/images/db_autocheck_script.sh
+++ b/dist/images/db_autocheck_script.sh
@@ -59,7 +59,7 @@ restoreNB(){
 	echo "restore db file, operate in pod ${podNameArray[0]}"
 	kubectl exec -it -n $KUBE_OVN_NS ${podNameArray[0]} -- mv /etc/ovn/ovnnb_db_standalone.db /etc/ovn/ovnnb_db.db
 	kubectl scale deployment -n $KUBE_OVN_NS ovn-central --replicas=$replicas
-	kubectl -n kube-system delete pod -l app=ovs
+	kubectl -n kube-system rollout restart ds ovs-ovn
 	echo "finish restore db file and ovn-central replicas"
 	exit 0
 }
diff --git a/dist/images/install.sh b/dist/images/install.sh
index f2c27947d94..6124569e6b3 100755
--- a/dist/images/install.sh
+++ b/dist/images/install.sh
@@ -2979,6 +2979,7 @@ EOF
   kubectl apply -f ovn-dpdk.yaml
 fi
 kubectl rollout status deployment/ovn-central -n kube-system --timeout 300s
+kubectl rollout status daemonset/ovs-ovn -n kube-system --timeout 120s
 echo "-------------------------------"
 echo ""
 
diff --git a/dist/images/kubectl-ko b/dist/images/kubectl-ko
index deedc211505..093db4d6113 100755
--- a/dist/images/kubectl-ko
+++ b/dist/images/kubectl-ko
@@ -740,8 +740,8 @@ dbtool(){
       done
       echo "finish restore nb db file and ovn-central replicas"
-      echo "recreate ovs-ovn pods"
-      kubectl delete pod -n $KUBE_OVN_NS -l app=ovs
+      echo "restart ovs-ovn"
+      kubectl -n $KUBE_OVN_NS rollout restart ds ovs-ovn
       ;;
     *)
       echo "unknown action $action"
@@ -915,7 +915,7 @@ tuning(){
 reload(){
   kubectl delete pod -n kube-system -l app=ovn-central
   kubectl rollout status deployment/ovn-central -n kube-system
-  kubectl delete pod -n kube-system -l app=ovs
+  kubectl rollout restart daemonset/ovs-ovn -n kube-system
   kubectl delete pod -n kube-system -l app=kube-ovn-controller
   kubectl rollout status deployment/kube-ovn-controller -n kube-system
   kubectl delete pod -n kube-system -l app=kube-ovn-cni
diff --git a/dist/images/restore-ovn-nb-db.sh b/dist/images/restore-ovn-nb-db.sh
index 6ffe3334765..a7c8016d624 100755
--- a/dist/images/restore-ovn-nb-db.sh
+++ b/dist/images/restore-ovn-nb-db.sh
@@ -53,5 +53,5 @@ mv /etc/ovn/ovnnb_db_standalone.db /etc/ovn/ovnnb_db.db
 kubectl scale deployment -n $KUBE_OVN_NS ovn-central --replicas=$replicas
 echo "finish restore nb db file and ovn-central replicas"
-echo "recreate ovs-ovn pods"
-kubectl delete pod -n $KUBE_OVN_NS -l app=ovs
+echo "restart ovs-ovn"
+kubectl -n $KUBE_OVN_NS rollout restart ds ovs-ovn
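Every call site above switches from `kubectl delete pod -l app=ovs`, which kills all ovs-ovn pods at once, to a DaemonSet rollout restart, which lets the controller recreate the pods node by node and makes the restart observable. A minimal sketch of driving and verifying the new path by hand, assuming the 120s timeout from the install.sh hunk is acceptable elsewhere too:

```bash
# Restart ovs-ovn through the DaemonSet controller instead of bulk pod deletion.
kubectl -n kube-system rollout restart ds ovs-ovn
# Block until every node is running a pod from the new controller revision
# (the 120s timeout mirrors the install.sh hunk; adjust for large clusters).
kubectl -n kube-system rollout status daemonset/ovs-ovn --timeout 120s
```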
/proc/cmdline" exit 1 fi -function quit { - set +e - for netns in /var/run/netns/*; do - nsenter --net=$netns sysctl -w net.ipv4.neigh.eth0.base_reachable_time_ms=180000; - nsenter --net=$netns sysctl -w net.ipv4.neigh.eth0.gc_stale_time=180; - done - # If the arp is in stale or delay status, stop vswitchd will lead prob failed. - # Wait a while for prob ready. - # As the timeout has been increased existing entry will not change to stale or delay at the moment - sleep 5 +function cgroup_match { + hash1=$(md5sum /proc/$1/cgroup | awk '{print $1}') + hash2=$(md5sum /proc/$2/cgroup | awk '{print $1}') + test -n "$hash1" -a "x$hash1" = "x$hash2" +} +function quit { gen_name=$(kubectl -n $POD_NAMESPACE get pod $POD_NAME -o jsonpath='{.metadata.generateName}') revision_hash=$(kubectl -n $POD_NAMESPACE get pod $POD_NAME -o jsonpath='{.metadata.labels.controller-revision-hash}') revision=$(kubectl -n $POD_NAMESPACE get controllerrevision $gen_name$revision_hash -o jsonpath='{.revision}') ds_name=${gen_name%-} latest_revision=$(kubectl -n kube-system get controllerrevision --no-headers | awk '$2 == "daemonset.apps/'$ds_name'" {print $3}' | sort -nr | head -n1) if [ "x$latest_revision" = "x$revision" ]; then - /usr/share/ovn/scripts/grace_stop_ovn_controller - /usr/share/openvswitch/scripts/ovs-ctl stop + # stop ovn-controller/ovs only when the processes are in the same cgroup + pid=$(/usr/share/ovn/scripts/ovn-ctl status_controller | awk '{print $NF}') + if cgroup_match $pid self; then + /usr/share/ovn/scripts/grace_stop_ovn_controller + /usr/share/openvswitch/scripts/ovs-ctl stop + fi fi exit 0 @@ -152,49 +152,6 @@ function exchange_link_names() { exchange_link_names -function wait_flows_pre_check() { - local devices="" - local ips=($(echo $OVN_DB_IPS | sed 's/,/ /g')) - for ip in ${ips[*]}; do - devices="$devices $(ip route get $ip | grep -oE 'dev .+' | awk '{print $2}')" - done - - bridges=($(ovs-vsctl --no-heading --columns=name find bridge external-ids:vendor=kube-ovn)) - for br in ${bridges[@]}; do - ports=($(ovs-vsctl list-ports $br)) - for port in ${ports[@]}; do - if ! echo $devices | grep -qw "$port"; then - continue - fi - - port_type=$(ovs-vsctl --no-heading --columns=type find interface name=$port) - if [ ! "x$port_type" = 'x""' ]; then - continue - fi - - if ! ip link show $port | grep -qw "master ovs-system"; then - return 1 - fi - done - done - - return 0 -} - -skip_wait_flows=0 -if ! wait_flows_pre_check; then - skip_wait_flows=1 -fi - -if [ $skip_wait_flows -eq 0 ]; then - # When ovs-vswitchd starts with this value set as true, it will neither flush or - # expire previously set datapath flows nor will it send and receive any - # packets to or from the datapath. Please check ovs-vswitchd.conf.db.5.txt - ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="true" -else - ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="false" -fi - # Start vswitchd. 
@@ -300,28 +257,5 @@ else
   /usr/share/ovn/scripts/ovn-ctl --ovn-controller-ssl-key=/var/run/tls/key --ovn-controller-ssl-cert=/var/run/tls/cert --ovn-controller-ssl-ca-cert=/var/run/tls/cacert restart_controller
 fi
 
-if [ $skip_wait_flows -eq 0 ]; then
-  # Wait ovn-controller finish init flow compute and update it to vswitchd,
-  # then update flow-restore-wait to indicate vswitchd to process flows
-  set +e
-  flow_num=$(ovs-ofctl dump-flows br-int | wc -l)
-  while [ $flow_num -le $FLOW_LIMIT ]
-  do
-    echo "$flow_num flows now, waiting for ovs-vswitchd flow ready"
-    sleep 1
-    flow_num=$(ovs-ofctl dump-flows br-int | wc -l)
-  done
-  set -e
-
-  ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="false"
-fi
-
-set +e
-for netns in /var/run/netns/*; do
-  nsenter --net=$netns sysctl -w net.ipv4.neigh.eth0.base_reachable_time_ms=30000;
-  nsenter --net=$netns sysctl -w net.ipv4.neigh.eth0.gc_stale_time=60;
-done
-set -e
-
 chmod 600 /etc/openvswitch/*
 tail --follow=name --retry /var/log/ovn/ovn-controller.log
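The deleted blocks above implemented a manual `flow-restore-wait` handshake: set the flag so a starting ovs-vswitchd neither flushes previously installed datapath flows nor processes packets, wait until ovn-controller has pushed more than `$FLOW_LIMIT` flows into `br-int`, then clear the flag. Condensed, the removed logic amounted to the following; every command here appears in the deleted lines, and `$FLOW_LIMIT` was an environment variable the old script expected:

```bash
# Hold off datapath processing while flows are being recomputed.
ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="true"
# Wait for ovn-controller to finish its initial flow computation.
while [ "$(ovs-ofctl dump-flows br-int | wc -l)" -le "$FLOW_LIMIT" ]; do
  sleep 1
done
# Resume normal forwarding.
ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait="false"
```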
diff --git a/docs/performance-tuning.md b/docs/performance-tuning.md
index d3e6d1673e7..01a77ef9425 100644
--- a/docs/performance-tuning.md
+++ b/docs/performance-tuning.md
@@ -18,7 +18,7 @@ We use `qperf -t 60 -ub -oo msg_size:1 -vu tcp_lat tcp_bw udp_lat ud
 for tcp/udp latency and throughput and compare with host network performance as baseline.
 
 | Type               | tcp_lat (us) | udp_lat (us) | tcp_bw (Mb/s) | udp_bw(Mb/s) |
-| ------------------ | -------------| -------------| --------------| -------------|
+| ------------------ | ------------ | ------------ | ------------- | ------------ |
 | Kube-OVN Default   | 25.7         | 22.9         | 27.1          | 1.59         |
 | Kube-OVN Optimized | 13.9         | 12.9         | 27.6          | 5.57         |
 | HOST Network       | 13.1         | 12.4         | 28.2          | 6.02         |
@@ -35,33 +35,33 @@ In a different environment set, we compare the performance between optimized Kub
 
 `qperf -t 60 -ub -oo msg_size:1 -vu tcp_lat tcp_bw udp_lat udp_bw`
 
-| Type               | tcp_lat (us) | udp_lat (us) | tcp_bw (Mb/s) | udp_bw(Mb/s) |
-| ------------------ | -------------| -------------| --------------| -------------|
-| Kube-OVN Overlay   | 15.2         | 14.6         | 23.6          | 2.65         |
-| Kube-OVN Underlay  | 14.3         | 13.8         | 24.2          | 3.46         |
-| Calico IPIP        | 21.4         | 20.2         | 23.6          | 1.18         |
-| Calico NoEncap     | 19.3         | 16.9         | 23.6          | 1.76         |
-| HOST Network       | 16.6         | 15.4         | 24.8          | 2.64         |
+| Type              | tcp_lat (us) | udp_lat (us) | tcp_bw (Mb/s) | udp_bw(Mb/s) |
+| ----------------- | ------------ | ------------ | ------------- | ------------ |
+| Kube-OVN Overlay  | 15.2         | 14.6         | 23.6          | 2.65         |
+| Kube-OVN Underlay | 14.3         | 13.8         | 24.2          | 3.46         |
+| Calico IPIP       | 21.4         | 20.2         | 23.6          | 1.18         |
+| Calico NoEncap    | 19.3         | 16.9         | 23.6          | 1.76         |
+| HOST Network      | 16.6         | 15.4         | 24.8          | 2.64         |
 
 `qperf -t 60 -ub -oo msg_size:1K -vu tcp_lat tcp_bw udp_lat udp_bw`
 
-| Type               | tcp_lat (us) | udp_lat (us) | tcp_bw (Gb/s) | udp_bw(Gb/s) |
-| ------------------ | -------------| -------------| --------------| -------------|
-| Kube-OVN Overlay   | 16.5         | 15.8         | 10.2          | 2.77         |
-| Kube-OVN Underlay  | 15.9         | 14.5         | 9.6           | 3.22         |
-| Calico IPIP        | 22.5         | 21.5         | 1.45          | 1.14         |
-| Calico NoEncap     | 19.4         | 18.3         | 3.76          | 1.63         |
-| HOST Network       | 18.1         | 16.6         | 9.32          | 2.66         |
+| Type              | tcp_lat (us) | udp_lat (us) | tcp_bw (Gb/s) | udp_bw(Gb/s) |
+| ----------------- | ------------ | ------------ | ------------- | ------------ |
+| Kube-OVN Overlay  | 16.5         | 15.8         | 10.2          | 2.77         |
+| Kube-OVN Underlay | 15.9         | 14.5         | 9.6           | 3.22         |
+| Calico IPIP       | 22.5         | 21.5         | 1.45          | 1.14         |
+| Calico NoEncap    | 19.4         | 18.3         | 3.76          | 1.63         |
+| HOST Network      | 18.1         | 16.6         | 9.32          | 2.66         |
 
 `qperf -t 60 -ub -oo msg_size:4K -vu tcp_lat tcp_bw udp_lat udp_bw`
 
-| Type               | tcp_lat (us) | udp_lat (us) | tcp_bw (Gb/s) | udp_bw(Gb/s) |
-| ------------------ | -------------| -------------| --------------| -------------|
-| Kube-OVN Overlay   | 34.7         | 41.6         | 16.0          | 9.23         |
-| Kube-OVN Underlay  | 32.6         | 44           | 15.1          | 6.71         |
-| Calico IPIP        | 44.8         | 52.9         | 2.94          | 3.26         |
-| Calico NoEncap     | 40           | 49.6         | 6.56          | 4.19         |
-| HOST Network       | 35.9         | 45.9         | 14.6          | 5.59         |
+| Type              | tcp_lat (us) | udp_lat (us) | tcp_bw (Gb/s) | udp_bw(Gb/s) |
+| ----------------- | ------------ | ------------ | ------------- | ------------ |
+| Kube-OVN Overlay  | 34.7         | 41.6         | 16.0          | 9.23         |
+| Kube-OVN Underlay | 32.6         | 44           | 15.1          | 6.71         |
+| Calico IPIP       | 44.8         | 52.9         | 2.94          | 3.26         |
+| Calico NoEncap    | 40           | 49.6         | 6.56          | 4.19         |
+| HOST Network      | 35.9         | 45.9         | 14.6          | 5.59         |
 
 This benchmark is for reference only, the result may vary dramatically due to different hardware and software setups.
 Optimization for packets with big size and underlay latency are still in progress, we will publish the optimization
@@ -263,5 +263,5 @@ Unfortunately, this tunnel type is not embedded in kernel, you have to compile O
 
 ```bash
 kubectl set env daemonset/ovs-ovn -n kube-system TUNNEL_TYPE=stt
-kubectl delete pod -n kube-system -lapp=ovs
+kubectl -n kube-system rollout restart ds ovs-ovn
 ```
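The docs change keeps the STT instructions consistent with the new restart strategy. End to end, switching the tunnel type now looks like the following; the trailing `rollout status` wait is an illustrative addition, not part of the documented steps:

```bash
# Switch tunnel encapsulation to STT, then restart ovs-ovn to pick it up.
kubectl set env daemonset/ovs-ovn -n kube-system TUNNEL_TYPE=stt
kubectl -n kube-system rollout restart ds ovs-ovn
# Optionally wait for the restart to complete across all nodes (illustrative).
kubectl -n kube-system rollout status ds ovs-ovn
```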