Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite docker-checker.sh to make it less kill-happy. #23992

Merged
merged 1 commit into from
Apr 13, 2016
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
88 changes: 60 additions & 28 deletions cluster/saltbase/salt/supervisor/docker-checker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,38 +18,70 @@
# it detects a failure. It then exits, and supervisord restarts it
# which in turn restarts docker.

/etc/init.d/docker stop
# Make sure docker gracefully terminated before start again
starttime=`date +%s`
while pidof docker > /dev/null; do
currenttime=`date +%s`
((elapsedtime = currenttime - starttime))
# after 60 seconds, forcefully terminate docker process
if test $elapsedtime -gt 60; then
echo "attempting to kill docker process with sigkill signal"
kill -9 `pidof docker` || sleep 10
# Entry point of the docker health checker.
#
# Gives docker up to 60 seconds to pass a health check. If it does not,
# docker is stopped and started again, given 30 seconds to come up, and
# probed for up to another 60 seconds (the result of that probe is not
# acted upon here; the polling loop below catches a failure).
#
# Afterwards docker is probed once every 10 seconds. The first failed probe
# ends the script with exit status 2.
main() {
  healthy 60 || {
    stop_docker
    start_docker
    echo "waiting 30s for startup"
    sleep 30
    healthy 60
  }

  # Steady state: a single probe per iteration, no retries.
  until ! healthy; do
    sleep 10
  done

  echo "Docker failed!"
  exit 2
}

# Performs a health check on docker by running `docker ps`; every attempt is
# capped at 60 seconds by `timeout`.
#
# Arguments:
#   $1 - optional; number of seconds to keep retrying for a healthy result.
#        If omitted or empty, only one attempt is made.
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 once `docker ps` succeeds, 2 if it did not succeed within the budget.
healthy() {
  # ${1:-} instead of "$1" + shift: a bare shift fails when no argument is
  # passed, and nothing else reads the positional parameters.
  local max_retry_sec="${1:-}"
  local starttime

  starttime=$(date +%s)
  while ! timeout 60 docker ps > /dev/null; do
    # Give up immediately when no retry budget was requested, or once the
    # budget has been used up.
    if [[ -z "${max_retry_sec}" || $(( $(date +%s) - starttime )) -gt "${max_retry_sec}" ]]; then
      echo "docker ps did not succeed"
      return 2
    fi
    echo "waiting 5s before retry"
    sleep 5
  done

  echo "docker is healthy"
  return 0
}

# cleanup docker network checkpoint to avoid running into known issue
# of docker (https://github.com/docker/docker/issues/18283)
# NOTE(review): this copy sits at top level, outside any function, and the
# same cleanup also appears inside start_docker(). It looks like a
# removed-side line of the diff this text was taken from -- confirm before
# relying on it.
rm -rf /var/lib/docker/network
# Stops docker through its init script and waits until no docker process is
# left.
#
# While `pidof docker` still finds a process, the function waits in 10 second
# steps. Once more than 60 seconds have passed since the stop request, the
# remaining processes are sent SIGKILL on every iteration.
#
# Outputs:
#   Progress messages on stdout.
# Returns:
#   Only after `pidof docker` no longer reports a process.
stop_docker() {
  local starttime currenttime elapsedtime pids

  /etc/init.d/docker stop
  # Make sure docker gracefully terminated before start again
  starttime=$(date +%s)
  # Capture the PID list once per iteration so the kill below acts on the
  # processes that were actually seen, instead of a second pidof call that
  # may already come back empty.
  while pids=$(pidof docker); do
    currenttime=$(date +%s)
    elapsedtime=$(( currenttime - starttime ))
    # after 60 seconds, forcefully terminate docker process
    if (( elapsedtime > 60 )); then
      echo "attempting to kill docker process with sigkill signal"
      # pidof prints space-separated PIDs; word splitting is intended.
      # shellcheck disable=SC2086
      if kill -9 ${pids}; then
        # Give the kernel a moment to reap the process; without a pause a
        # lingering process makes this loop spin and flood the log.
        sleep 1
      else
        sleep 10
      fi
    else
      echo "waiting clean shutdown"
      sleep 10
    fi
  done
}

# NOTE(review): top-level start of docker through its init script. The same
# command is also the final statement of start_docker(); this copy looks like
# a removed-side line of the diff this text was taken from -- confirm.
/etc/init.d/docker start
# Starts docker through its init script.
#
# Only performs the start itself: waiting for startup and the ongoing health
# polling are done by main(), so this function must return to its caller.
#
# Outputs:
#   A progress message on stdout.
# Returns:
#   The exit status of `/etc/init.d/docker start`.
start_docker() {
  echo "docker is not running. starting docker"

  # cleanup docker network checkpoint to avoid running into known issue
  # of docker (https://github.com/docker/docker/issues/18283)
  rm -rf /var/lib/docker/network

  /etc/init.d/docker start
}

# Run the checker. main does not return: it ends the script with exit
# status 2 once a health probe fails.
main