Skip to content

Commit

Permalink
kubevirt: rework k3s install path, restart k3s.
Browse files Browse the repository at this point in the history
Rework the k3s install steps to allow watching
the containerd and k3s processes for crashes and
restarting them. Also reduce duplicated code in
k3s startup down to one spot in check_start_k3s().

Signed-off-by: Andrew Durbin <andrewd@zededa.com>
  • Loading branch information
andrewd-zededa authored and eriknordmark committed Apr 26, 2024
1 parent 751af32 commit 574d447
Showing 1 changed file with 77 additions and 36 deletions.
113 changes: 77 additions & 36 deletions pkg/kube/cluster-init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ KUBEVIRT_VERSION=v0.59.0
LONGHORN_VERSION=v1.6.0
CDI_VERSION=v1.57.0
NODE_IP=""
MAX_K3S_RESTARTS=10
RESTART_COUNT=0

INSTALL_LOG=/var/lib/install.log
CTRD_LOG=/var/lib/containerd-user.log
Expand Down Expand Up @@ -74,13 +76,14 @@ apply_multus_cni() {
get_default_intf_IP_prefix
kubectl create namespace eve-kube-app
logmsg "Apply Multus, Node-IP: $NODE_IP"
while ! kubectl apply -f /tmp/multus-daemonset.yaml; do
sleep 1
done
if ! kubectl apply -f /tmp/multus-daemonset.yaml; then
return 1
fi
logmsg "Done applying Multus"
ln -s /var/lib/cni/bin/multus /var/lib/rancher/k3s/data/current/bin/multus
# need to only do this once
touch /var/lib/multus_initialized
return 0
}

copy_cni_plugin_files() {
Expand Down Expand Up @@ -155,6 +158,33 @@ check_overwrite_nsmounter() {
### REMOVE ME-
}

check_start_k3s() {
pgrep -f "k3s server" > /dev/null 2>&1
if [ $? -eq 1 ]; then
if [ $RESTART_COUNT -lt $MAX_K3S_RESTARTS ]; then
## Must be after reboot, or from k3s restart
RESTART_COUNT=$((RESTART_COUNT+1))
ln -s /var/lib/k3s/bin/* /usr/bin
logmsg "Starting k3s server, restart count: $RESTART_COUNT"
# for now, always copy to get the latest
nohup /usr/bin/k3s server --config /etc/rancher/k3s/config.yaml &
k3s_pid=$!
# Give the embedded etcd in k3s priority over io as its fsync latencies are critical
ionice -c2 -n0 -p $k3s_pid
# Default location where clients will look for config
# There is a very small window where this file is not available
# while k3s is starting up
while [ ! -f /etc/rancher/k3s/k3s.yaml ]; do
sleep 5
done
ln -s /etc/rancher/k3s/k3s.yaml ~/.kube/config
cp /etc/rancher/k3s/k3s.yaml /run/.kube/k3s/k3s.yaml
return 1
fi
fi
return 0
}

check_start_containerd() {
# Needed to get the pods to start
if [ ! -L /usr/bin/runc ]; then
Expand Down Expand Up @@ -203,6 +233,23 @@ trigger_k3s_selfextraction() {
/usr/bin/k3s check-config >> $INSTALL_LOG 2>&1
}

# Return success if all pods are Running/Succeeded and Ready
# Used in install time to control api server load
# Return unix style 0 for success. (Not 0 for false)
are_all_pods_ready() {
pod_json=$(kubectl get pods -A -o json)
not_running=$(echo "$pod_json" | jq '.items[] | select(.status.phase!="Running" and .status.phase!="Succeeded")' | jq -s length)
if [ "$not_running" -ne 0 ]; then
return 1
fi

not_ready=$(echo "$pod_json" | jq '.items[] | select(.status.phase=="Running") | .status.conditions[] | select(.type=="ContainersReady" and .status!="True")' | jq -s length)
if [ "$not_ready" -ne 0 ]; then
return 1
fi

return 0
}

#Make sure all prereqs are set after /var/lib is mounted to get logging info
setup_prereqs
Expand Down Expand Up @@ -262,34 +309,39 @@ while true;
do
if [ ! -f /var/lib/all_components_initialized ]; then
if [ ! -f /var/lib/k3s_initialized ]; then
#/var/lib is where all kubernetes components get installed.
logmsg "Installing K3S version $K3S_VERSION"
logmsg "Installing K3S version $K3S_VERSION on $(/bin/hostname)"
mkdir -p /var/lib/k3s/bin
/usr/bin/curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=${K3S_VERSION} INSTALL_K3S_SKIP_ENABLE=true INSTALL_K3S_BIN_DIR=/var/lib/k3s/bin sh -
ln -s /var/lib/k3s/bin/* /usr/bin
logmsg "Initializing K3S version $K3S_VERSION"
ln -s /var/lib/k3s/bin/* /usr/bin
trigger_k3s_selfextraction
check_start_containerd
nohup /usr/bin/k3s server --config /etc/rancher/k3s/config.yaml &
#wait until k3s is ready
logmsg "Looping until k3s is ready"
while [ "$(kubectl get node "$(/bin/hostname)" -o json | jq '.status.conditions[] | select(.reason=="KubeletReady") | .status=="True"')" != "true" ];
do
sleep 5;
done
# Give the embedded etcd in k3s priority over io as its fsync latencies are critical
ionice -c2 -n0 -p "$(pgrep -f "k3s server")"
logmsg "k3s is ready on this node"
# Default location where clients will look for config
ln -s /etc/rancher/k3s/k3s.yaml ~/.kube/config
touch /var/lib/k3s_initialized
fi

# Be kind to the API server
sleep 1

check_start_containerd
if ! check_start_k3s; then
continue
fi

this_node_ready=$(kubectl get node "$(/bin/hostname)" -o json | jq '.status.conditions[] | select(.reason=="KubeletReady") | .status=="True"')
if [ "$this_node_ready" != "true" ]; then
continue
fi

if ! are_all_pods_ready; then
continue
fi

if [ ! -f /var/lib/cni/bin ]; then
copy_cni_plugin_files
fi

if [ ! -f /var/lib/multus_initialized ]; then
apply_multus_cni
continue
fi
if ! pidof dhcp; then
# launch CNI dhcp service
Expand All @@ -311,6 +363,7 @@ if [ ! -f /var/lib/all_components_initialized ]; then
kubectl apply -f /etc/kubevirt-features.yaml

touch /var/lib/kubevirt_initialized
continue
fi

if [ ! -f /var/lib/longhorn_initialized ]; then
Expand All @@ -333,27 +386,11 @@ if [ ! -f /var/lib/all_components_initialized ]; then
fi
else
check_start_containerd
if pgrep k3s >> $INSTALL_LOG 2>&1; then
logmsg "k3s is alive "
if [ -e /var/lib/longhorn_initialized ]; then
check_overwrite_nsmounter
fi
else
## Must be after reboot
ln -s /var/lib/k3s/bin/* /usr/bin
logmsg "Starting k3s server after reboot"
nohup /usr/bin/k3s server --config /etc/rancher/k3s/config.yaml &
logmsg "Looping until k3s is ready"
if ! check_start_k3s; then
while [ "$(kubectl get node "$(/bin/hostname)" -o json | jq '.status.conditions[] | select(.reason=="KubeletReady") | .status=="True"')" != "true" ];
do
sleep 5;
done
# Give the embedded etcd in k3s priority over io as its fsync latencies are critical
ionice -c2 -n0 -p "$(pgrep -f "k3s server")"
logmsg "k3s is ready on this node"
# Default location where clients will look for config
ln -s /etc/rancher/k3s/k3s.yaml ~/.kube/config

# Initialize CNI after k3s reboot
if [ ! -f /var/lib/cni/bin ]; then
copy_cni_plugin_files
Expand All @@ -365,6 +402,10 @@ else
# launch CNI dhcp service
/opt/cni/bin/dhcp daemon &
fi
else
if [ -e /var/lib/longhorn_initialized ]; then
check_overwrite_nsmounter
fi
fi
fi
currentSize=$(wc -c <"$CTRD_LOG")
Expand Down

0 comments on commit 574d447

Please sign in to comment.