Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions pkg/userdata/scripts/health-monitor.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env bash

# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script is for master and node instance health monitoring, which is
# packed in kube-manifest tarball. It is executed through a systemd service
# in cluster/gce/gci/<master/node>.yaml. The env variables come from an env
# file provided by the systemd service.

# This script is a slightly adjusted version of
# https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh
# Adjustments are:
# * Kubelet health port is 10248 not 10255
# * Removal of all references to the KUBE_ENV file

set -o nounset
set -o pipefail

# We simply kill the process when there is a failure. Another systemd service will
# automatically restart the process.
function container_runtime_monitoring {
local -r max_attempts=5
local attempt=1
local -r crictl="${KUBE_HOME}/bin/crictl"
local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"
# We still need to use 'docker ps' when container runtime is "docker". This is because
# dockershim is still part of kubelet today. When kubelet is down, crictl pods
# will also fail, and docker will be killed. This is undesirable especially when
# docker live restore is disabled.
local healthcheck_command="docker ps"
if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
healthcheck_command="${crictl} pods"
fi
# Container runtime startup takes time. Make initial attempts before starting
# killing the container runtime.
until timeout 60 ${healthcheck_command} > /dev/null; do
if (( attempt == max_attempts )); then
echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
break
fi
echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..."
sleep "$(( 2 ** attempt++ ))"
done
while true; do
if ! timeout 60 ${healthcheck_command} > /dev/null; then
echo "Container runtime ${container_runtime_name} failed!"
if [[ "$container_runtime_name" == "docker" ]]; then
# Dump stack of docker daemon for investigation.
# Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to
# the exec root directory, which is /var/run/docker/ on Ubuntu and COS.
pkill -SIGUSR1 dockerd
fi
systemctl kill --kill-who=main "${container_runtime_name}"
# Wait for a while, as we don't want to kill it again before it is really up.
sleep 120
else
sleep "${SLEEP_SECONDS}"
fi
done
}

function kubelet_monitoring {
echo "Wait for 2 minutes for kubelet to be functional"
# TODO(andyzheng0831): replace it with a more reliable method if possible.
sleep 120
local -r max_seconds=10
local output=""
while [ 1 ]; do
if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then
# Print the response and/or errors.
echo $output
echo "Kubelet is unhealthy!"
systemctl kill kubelet
# Wait for a while, as we don't want to kill it again before it is really up.
sleep 60
else
sleep "${SLEEP_SECONDS}"
fi
done
}


############## Main Function ################
if [[ "$#" -ne 1 ]]; then
echo "Usage: health-monitor.sh <container-runtime/kubelet>"
exit 1
fi

KUBE_HOME="/home/kubernetes"

SLEEP_SECONDS=10
component=$1
echo "Start kubernetes health monitoring for ${component}"
if [[ "${component}" == "container-runtime" ]]; then
container_runtime_monitoring
elif [[ "${component}" == "kubelet" ]]; then
kubelet_monitoring
else
echo "Health monitoring for component "${component}" is not supported!"
fi