#!/usr/bin/env bash

# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script is for master and node instance health monitoring, which is
# packed in kube-manifest tarball. It is executed through a systemd service
# (upstream: defined in a YAML manifest under cluster/gce/gci/). The env
# variables come from an env file provided by the systemd service.
#
# Usage: health-monitor.sh <container-runtime/kubelet>
#
# Optional environment variables read by this script:
#   CONTAINER_RUNTIME       "docker" (default) selects 'docker ps' as the
#                           health check; any other value selects 'crictl pods'.
#   CONTAINER_RUNTIME_NAME  systemd unit name of the runtime (default "docker").

# This script is a slightly adjusted version of
# https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh
# Adjustments are:
# * Kubelet health port is 10248 not 10255
# * Removal of all references to the KUBE_ENV file

set -o nounset
set -o pipefail

#######################################
# Monitor the container runtime forever and kill it when its health check
# fails. We simply kill the process when there is a failure. Another systemd
# service will automatically restart the process.
# Globals:   KUBE_HOME (read), SLEEP_SECONDS (read),
#            CONTAINER_RUNTIME (read, optional),
#            CONTAINER_RUNTIME_NAME (read, optional)
# Arguments: none
# Outputs:   progress and failure messages on stdout
# Returns:   never returns (infinite loop)
#######################################
function container_runtime_monitoring {
  local -r max_attempts=5
  local attempt=1
  local delay
  local -r crictl="${KUBE_HOME}/bin/crictl"
  local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"
  # We still need to use 'docker ps' when container runtime is "docker". This is because
  # dockershim is still part of kubelet today. When kubelet is down, crictl pods
  # will also fail, and docker will be killed. This is undesirable especially when
  # docker live restore is disabled.
  #
  # The command is kept as an array so that it is executed without relying on
  # word-splitting of an unquoted string (safe for paths containing spaces).
  local -a healthcheck_command=(docker ps)
  if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
    healthcheck_command=("${crictl}" pods)
  fi
  # Container runtime startup takes time. Make initial attempts before starting
  # killing the container runtime. The delay doubles after every failed attempt.
  until timeout 60 "${healthcheck_command[@]}" > /dev/null; do
    if (( attempt == max_attempts )); then
      echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
      break
    fi
    delay=$(( 2 ** attempt ))
    # Report the delay that is actually slept, not the attempt counter.
    echo "${attempt} initial attempt \"${healthcheck_command[*]}\"! Trying again in ${delay} seconds..."
    sleep "${delay}"
    attempt=$(( attempt + 1 ))
  done
  while true; do
    if ! timeout 60 "${healthcheck_command[@]}" > /dev/null; then
      echo "Container runtime ${container_runtime_name} failed!"
      if [[ "${container_runtime_name}" == "docker" ]]; then
        # Dump stack of docker daemon for investigation.
        # Log file name looks like goroutine-stacks-TIMESTAMP and will be saved to
        # the exec root directory, which is /var/run/docker/ on Ubuntu and COS.
        pkill -SIGUSR1 dockerd
      fi
      systemctl kill --kill-who=main "${container_runtime_name}"
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 120
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}

#######################################
# Monitor the kubelet forever by polling its local healthz endpoint and kill
# the kubelet unit when the check fails.
# Globals:   SLEEP_SECONDS (read)
# Arguments: none
# Outputs:   progress and failure messages on stdout
# Returns:   never returns (infinite loop)
#######################################
function kubelet_monitoring {
  echo "Wait for 2 minutes for kubelet to be functional"
  # TODO(andyzheng0831): replace it with a more reliable method if possible.
  sleep 120
  local -r max_seconds=10
  local output=""
  while true; do
    if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then
      # Print the response and/or errors. Quoted so the text is emitted
      # verbatim instead of being word-split and glob-expanded.
      printf '%s\n' "${output}"
      echo "Kubelet is unhealthy!"
      systemctl kill kubelet
      # Wait for a while, as we don't want to kill it again before it is really up.
      sleep 60
    else
      sleep "${SLEEP_SECONDS}"
    fi
  done
}


############## Main Function ################
if [[ "$#" -ne 1 ]]; then
  echo "Usage: health-monitor.sh <container-runtime/kubelet>" >&2
  exit 1
fi

KUBE_HOME="/home/kubernetes"

# Interval between two health checks while the component is healthy.
SLEEP_SECONDS=10
component=$1
echo "Start kubernetes health monitoring for ${component}"
if [[ "${component}" == "container-runtime" ]]; then
  container_runtime_monitoring
elif [[ "${component}" == "kubelet" ]]; then
  kubelet_monitoring
else
  # An unknown component is a configuration error: report it and fail.
  echo "Health monitoring for component \"${component}\" is not supported!" >&2
  exit 1
fi