From 3a2143eb6ffa4a1b99c063aa101208b83c8f6262 Mon Sep 17 00:00:00 2001 From: Alvaro Aleman Date: Fri, 12 Oct 2018 12:15:48 +0200 Subject: [PATCH 1/4] Add container runtime and kubelet healthcheck scripts for centos and ubuntu --- .../centos/testdata/kubelet-v1.10-aws.golden | 160 ++++++++++++++++- .../centos/testdata/kubelet-v1.11-aws.golden | 160 ++++++++++++++++- .../centos/testdata/kubelet-v1.12-aws.golden | 160 ++++++++++++++++- .../centos/testdata/kubelet-v1.9-aws.golden | 160 ++++++++++++++++- pkg/userdata/centos/userdata.go | 164 +++++++++++++++++- pkg/userdata/ubuntu/testdata/1.11-aws.golden | 146 +++++++++++++++- .../1.9.2-dist-upgrade-on-boot-aws.golden | 146 +++++++++++++++- .../1.9.2-openstack-multiple-dns.golden | 146 +++++++++++++++- .../openstack-kubelet-v-version-prefix.golden | 146 +++++++++++++++- .../openstack-overwrite-cloud-config.golden | 146 +++++++++++++++- pkg/userdata/ubuntu/userdata.go | 146 +++++++++++++++- 11 files changed, 1637 insertions(+), 43 deletions(-) diff --git a/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden index 496695b35..7e3fa40fb 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden @@ -62,7 +62,7 @@ write_files: {aws-config:true} - path: "/usr/local/bin/setup" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -83,14 +83,19 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet - kubeadm join \ - --token my-token \ - --discovery-token-ca-cert-hash sha256:6caecce9fedcb55d4953d61a27dc6997361a2f226ad86d7e6004dde7526fc4b1 \ - --ignore-preflight-errors=CRI \ - server:443 + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then + kubeadm join \ + --token my-token \ + --discovery-token-ca-cert-hash sha256:6caecce9fedcb55d4953d61a27dc6997361a2f226ad86d7e6004dde7526fc4b1 \ + --ignore-preflight-errors=CRI \ + server:443 + fi + + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service - path: "/usr/local/bin/supervise.sh" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -112,5 +117,146 @@ write_files: RemainAfterExit=true ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. 
+ # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. + function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. 
+ sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" + fi + runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden index 64a99d5d0..503007bc4 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden @@ -58,7 +58,7 @@ write_files: {aws-config:true} - path: "/usr/local/bin/setup" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -79,14 +79,19 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet - kubeadm join \ - --token my-token \ - --discovery-token-ca-cert-hash sha256:6caecce9fedcb55d4953d61a27dc6997361a2f226ad86d7e6004dde7526fc4b1 \ - --ignore-preflight-errors=CRI \ - server:443 + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then + kubeadm join \ + --token my-token \ + --discovery-token-ca-cert-hash sha256:6caecce9fedcb55d4953d61a27dc6997361a2f226ad86d7e6004dde7526fc4b1 \ + --ignore-preflight-errors=CRI \ + server:443 + fi + + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service - path: "/usr/local/bin/supervise.sh" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -108,5 +113,146 @@ write_files: RemainAfterExit=true ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+ + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. + function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" 
+ fi + runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden index 96c70ecbc..144a82e18 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden @@ -58,7 +58,7 @@ write_files: {aws-config:true} - path: "/usr/local/bin/setup" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -79,14 +79,19 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet - kubeadm join \ - --token my-token \ - --discovery-token-ca-cert-hash sha256:6caecce9fedcb55d4953d61a27dc6997361a2f226ad86d7e6004dde7526fc4b1 \ - --ignore-preflight-errors=CRI \ - server:443 + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then + kubeadm join \ + --token my-token \ + --discovery-token-ca-cert-hash sha256:6caecce9fedcb55d4953d61a27dc6997361a2f226ad86d7e6004dde7526fc4b1 \ + --ignore-preflight-errors=CRI \ + server:443 + fi + + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service - path: "/usr/local/bin/supervise.sh" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -108,5 +113,146 @@ write_files: RemainAfterExit=true ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. 
+ function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" 
+ fi + runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden index 1e826649e..70093e76e 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden @@ -62,7 +62,7 @@ write_files: {aws-config:true} - path: "/usr/local/bin/setup" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -83,14 +83,19 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet - kubeadm join \ - --token my-token \ - --discovery-token-ca-cert-hash sha256:6caecce9fedcb55d4953d61a27dc6997361a2f226ad86d7e6004dde7526fc4b1 \ - --ignore-preflight-errors=CRI \ - server:443 + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then + kubeadm join \ + --token my-token \ + --discovery-token-ca-cert-hash sha256:6caecce9fedcb55d4953d61a27dc6997361a2f226ad86d7e6004dde7526fc4b1 \ + --ignore-preflight-errors=CRI \ + server:443 + fi + + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service - path: "/usr/local/bin/supervise.sh" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -112,5 +117,146 @@ write_files: RemainAfterExit=true ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. 
+ function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" 
+ fi + runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/centos/userdata.go b/pkg/userdata/centos/userdata.go index e01883da4..cdbef9453 100644 --- a/pkg/userdata/centos/userdata.go +++ b/pkg/userdata/centos/userdata.go @@ -209,7 +209,7 @@ write_files: {{ if ne .CloudConfig "" }}{{ .CloudConfig | indent 4 }}{{ end }} - path: "/usr/local/bin/setup" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -230,16 +230,21 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet - kubeadm join \ - --token {{ .BoostrapToken }} \ - --discovery-token-ca-cert-hash sha256:{{ .KubeadmCACertHash }} \ - {{- if semverCompare ">=1.9.X" .KubeletVersion }} - --ignore-preflight-errors=CRI \ - {{- end }} - {{ .ServerAddr }} + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then + kubeadm join \ + --token {{ .BoostrapToken }} \ + --discovery-token-ca-cert-hash sha256:{{ .KubeadmCACertHash }} \ + {{- if semverCompare ">=1.9.X" .KubeletVersion }} + --ignore-preflight-errors=CRI \ + {{- end }} + {{ .ServerAddr }} + fi + + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service - path: "/usr/local/bin/supervise.sh" - permissions: "0777" + permissions: "0755" content: | #!/bin/bash set -xeuo pipefail @@ -261,6 +266,147 @@ write_files: RemainAfterExit=true ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. 
+ function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" 
+ fi + runcmd: - systemctl enable --now setup.service ` diff --git a/pkg/userdata/ubuntu/testdata/1.11-aws.golden b/pkg/userdata/ubuntu/testdata/1.11-aws.golden index e4a5090b3..b7a07004a 100644 --- a/pkg/userdata/ubuntu/testdata/1.11-aws.golden +++ b/pkg/userdata/ubuntu/testdata/1.11-aws.golden @@ -106,6 +106,9 @@ write_files: server:443 fi + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service + - path: "/opt/kubernetes.asc" permissions: "0400" content: | @@ -232,11 +235,152 @@ write_files: ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup - path: /etc/systemd/system/docker.service.d/10-storage.conf - permission: "0644" + permissions: "0644" content: | [Service] ExecStart= ExecStart=/usr/bin/dockerd -H fd:// --storage-driver=overlay2 +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. + function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. 
+ until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" 
+ fi + runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden b/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden index 709e560b8..567731b13 100644 --- a/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden +++ b/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden @@ -107,6 +107,9 @@ write_files: server:443 fi + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service + - path: "/opt/kubernetes.asc" permissions: "0400" content: | @@ -240,11 +243,152 @@ write_files: ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup - path: /etc/systemd/system/docker.service.d/10-storage.conf - permission: "0644" + permissions: "0644" content: | [Service] ExecStart= ExecStart=/usr/bin/dockerd -H fd:// --storage-driver=overlay2 +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. + function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. 
Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" 
+ fi + runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden b/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden index e2a3d7740..18322c6cd 100644 --- a/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden +++ b/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden @@ -107,6 +107,9 @@ write_files: server:443 fi + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service + - path: "/opt/kubernetes.asc" permissions: "0400" content: | @@ -240,11 +243,152 @@ write_files: ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup - path: /etc/systemd/system/docker.service.d/10-storage.conf - permission: "0644" + permissions: "0644" content: | [Service] ExecStart= ExecStart=/usr/bin/dockerd -H fd:// --storage-driver=overlay2 +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. + function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. 
Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" 
+ fi + runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden b/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden index 2cef07e49..e16e27348 100644 --- a/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden +++ b/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden @@ -107,6 +107,9 @@ write_files: server:443 fi + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service + - path: "/opt/kubernetes.asc" permissions: "0400" content: | @@ -240,11 +243,152 @@ write_files: ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup - path: /etc/systemd/system/docker.service.d/10-storage.conf - permission: "0644" + permissions: "0644" content: | [Service] ExecStart= ExecStart=/usr/bin/dockerd -H fd:// --storage-driver=overlay2 +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. + function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. 
+ local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" 
+ fi + runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden b/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden index 577cc05c2..5c54c048c 100644 --- a/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden +++ b/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden @@ -109,6 +109,9 @@ write_files: server:443 fi + systemctl enable --now --no-block kubelet-healthcheck.service + systemctl enable --now --no-block docker-healthcheck.service + - path: "/opt/kubernetes.asc" permissions: "0400" content: | @@ -242,11 +245,152 @@ write_files: ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup - path: /etc/systemd/system/docker.service.d/10-storage.conf - permission: "0644" + permissions: "0644" content: | [Service] ExecStart= ExecStart=/usr/bin/dockerd -H fd:// --storage-driver=overlay2 +- path: /etc/systemd/system/kubelet-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + +- path: /etc/systemd/system/docker-healthcheck.service + permissions: "0644" + content: | + [Unit] + Requires=setup.service + After=setup.service + + [Service] + ExecStart=/usr/local/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + +- path: /usr/local/bin/health-monitor.sh + permissions: "0755" + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + content: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. + function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. 
+      local healthcheck_command="docker ps"
+      if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
+        healthcheck_command="${crictl} pods"
+      fi
+      # Container runtime startup takes time. Make initial attempts before starting
+      # killing the container runtime.
+      until timeout 60 ${healthcheck_command} > /dev/null; do
+        if (( attempt == max_attempts )); then
+          echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
+          break
+        fi
+        echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..."
+        sleep "$(( 2 ** attempt++ ))"
+      done
+      while true; do
+        if ! timeout 60 ${healthcheck_command} > /dev/null; then
+          echo "Container runtime ${container_runtime_name} failed!"
+          if [[ "$container_runtime_name" == "docker" ]]; then
+              # Dump stack of docker daemon for investigation.
+              # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to
+              # the exec root directory, which is /var/run/docker/ on Ubuntu and COS.
+              pkill -SIGUSR1 dockerd
+          fi
+          systemctl kill --kill-who=main "${container_runtime_name}"
+          # Wait for a while, as we don't want to kill it again before it is really up.
+          sleep 120
+        else
+          sleep "${SLEEP_SECONDS}"
+        fi
+      done
+    }
+
+    function kubelet_monitoring {
+      echo "Wait for 2 minutes for kubelet to be functional"
+      # TODO(andyzheng0831): replace it with a more reliable method if possible.
+      sleep 120
+      local -r max_seconds=10
+      local output=""
+      while [ 1 ]; do
+        if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then
+          # Print the response and/or errors.
+          echo $output
+          echo "Kubelet is unhealthy!"
+          systemctl kill kubelet
+          # Wait for a while, as we don't want to kill it again before it is really up.
+          sleep 60
+        else
+          sleep "${SLEEP_SECONDS}"
+        fi
+      done
+    }
+
+
+    ############## Main Function ################
+    if [[ "$#" -ne 1 ]]; then
+      echo "Usage: health-monitor.sh <container-runtime/kubelet>"
+      exit 1
+    fi
+
+    KUBE_HOME="/home/kubernetes"
+
+    SLEEP_SECONDS=10
+    component=$1
+    echo "Start kubernetes health monitoring for ${component}"
+    if [[ "${component}" == "container-runtime" ]]; then
+      container_runtime_monitoring
+    elif [[ "${component}" == "kubelet" ]]; then
+      kubelet_monitoring
+    else
+      echo "Health monitoring for component "${component}" is not supported!"
+    fi
+
 runcmd:
 - systemctl enable --now setup.service
diff --git a/pkg/userdata/ubuntu/userdata.go b/pkg/userdata/ubuntu/userdata.go
index 28ce1a2e6..5eb3c3b90 100644
--- a/pkg/userdata/ubuntu/userdata.go
+++ b/pkg/userdata/ubuntu/userdata.go
@@ -256,6 +256,9 @@ write_files:
         {{ .ServerAddr }}
     fi
 
+    systemctl enable --now --no-block kubelet-healthcheck.service
+    systemctl enable --now --no-block docker-healthcheck.service
+
 - path: "/opt/kubernetes.asc"
   permissions: "0400"
   content: |
@@ -393,12 +396,153 @@ write_files:
     ExecStart=/usr/local/bin/supervise.sh /usr/local/bin/setup
 
 - path: /etc/systemd/system/docker.service.d/10-storage.conf
-  permission: "0644"
+  permissions: "0644"
   content: |
     [Service]
     ExecStart=
     ExecStart=/usr/bin/dockerd -H fd:// --storage-driver=overlay2
+- path: /etc/systemd/system/kubelet-healthcheck.service
+  permissions: "0644"
+  content: |
+    [Unit]
+    Requires=setup.service
+    After=setup.service
+
+    [Service]
+    ExecStart=/usr/local/bin/health-monitor.sh kubelet
+
+    [Install]
+    WantedBy=multi-user.target
+
+- path: /etc/systemd/system/docker-healthcheck.service
+  permissions: "0644"
+  content: |
+    [Unit]
+    Requires=setup.service
+    After=setup.service
+
+    [Service]
+    ExecStart=/usr/local/bin/health-monitor.sh container-runtime
+
+    [Install]
+    WantedBy=multi-user.target
+
+- path: /usr/local/bin/health-monitor.sh
+  permissions: "0755"
+  # This script is a slightly adjusted version of
+  # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh
+  # Adjustments are:
+  # * Kubelet health port is 10248 not 10255
+  # * Removal of all references to the KUBE_ENV file
+  content: |
+    #!/usr/bin/env bash
+
+    # Copyright 2016 The Kubernetes Authors.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+
+    # This script is for master and node instance health monitoring, which is
+    # packed in kube-manifest tarball. It is executed through a systemd service
+    # in cluster/gce/gci/<master/node>.yaml. The env variables come from an env
+    # file provided by the systemd service.
+
+    set -o nounset
+    set -o pipefail
+
+    # We simply kill the process when there is a failure. Another systemd service will
+    # automatically restart the process.
+    function container_runtime_monitoring {
+      local -r max_attempts=5
+      local attempt=1
+      local -r crictl="${KUBE_HOME}/bin/crictl"
+      local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"
+      # We still need to use 'docker ps' when container runtime is "docker". This is because
+      # dockershim is still part of kubelet today. When kubelet is down, crictl pods
+      # will also fail, and docker will be killed. This is undesirable especially when
+      # docker live restore is disabled.
+      local healthcheck_command="docker ps"
+      if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
+        healthcheck_command="${crictl} pods"
+      fi
+      # Container runtime startup takes time. Make initial attempts before starting
+      # killing the container runtime.
+      until timeout 60 ${healthcheck_command} > /dev/null; do
+        if (( attempt == max_attempts )); then
+          echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
+          break
+        fi
+        echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..."
+        sleep "$(( 2 ** attempt++ ))"
+      done
+      while true; do
+        if ! timeout 60 ${healthcheck_command} > /dev/null; then
+          echo "Container runtime ${container_runtime_name} failed!"
+          if [[ "$container_runtime_name" == "docker" ]]; then
+              # Dump stack of docker daemon for investigation.
+              # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to
+              # the exec root directory, which is /var/run/docker/ on Ubuntu and COS.
+              pkill -SIGUSR1 dockerd
+          fi
+          systemctl kill --kill-who=main "${container_runtime_name}"
+          # Wait for a while, as we don't want to kill it again before it is really up.
+          sleep 120
+        else
+          sleep "${SLEEP_SECONDS}"
+        fi
+      done
+    }
+
+    function kubelet_monitoring {
+      echo "Wait for 2 minutes for kubelet to be functional"
+      # TODO(andyzheng0831): replace it with a more reliable method if possible.
+      sleep 120
+      local -r max_seconds=10
+      local output=""
+      while [ 1 ]; do
+        if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then
+          # Print the response and/or errors.
+          echo $output
+          echo "Kubelet is unhealthy!"
+          systemctl kill kubelet
+          # Wait for a while, as we don't want to kill it again before it is really up.
+          sleep 60
+        else
+          sleep "${SLEEP_SECONDS}"
+        fi
+      done
+    }
+
+
+    ############## Main Function ################
+    if [[ "$#" -ne 1 ]]; then
+      echo "Usage: health-monitor.sh <container-runtime/kubelet>"
+      exit 1
+    fi
+
+    KUBE_HOME="/home/kubernetes"
+
+    SLEEP_SECONDS=10
+    component=$1
+    echo "Start kubernetes health monitoring for ${component}"
+    if [[ "${component}" == "container-runtime" ]]; then
+      container_runtime_monitoring
+    elif [[ "${component}" == "kubelet" ]]; then
+      kubelet_monitoring
+    else
+      echo "Health monitoring for component "${component}" is not supported!"
+ fi + runcmd: - systemctl enable --now setup.service ` From 4eec50fc9760309d16ea916b40cd5ea52776a59c Mon Sep 17 00:00:00 2001 From: Alvaro Aleman Date: Fri, 12 Oct 2018 12:40:55 +0200 Subject: [PATCH 2/4] Add container runtime and kubelet health check script for coreos --- ...-openstack-kubelet-v-version-prefix.golden | 21 +++ ...-auto-update-openstack-multiple-dns.golden | 21 +++ .../v1.11.2-vsphere-static-ipconfig.golden | 21 +++ ....12.0-vsphere-overwrite-cloudconfig.golden | 21 +++ .../v1.9.2-disable-auto-update-aws.golden | 21 +++ pkg/userdata/coreos/userdata.go | 143 ++++++++++++++++++ 6 files changed, 248 insertions(+) diff --git a/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden b/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden index 432161cd6..78f7d1de9 100644 --- a/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden +++ b/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden @@ -153,6 +153,17 @@ "verification": {} }, "mode": 420 + }, + { + "filesystem": "root", + "group": {}, + "path": "/opt/bin/health-monitor.sh", + "user": {}, + "contents": { + "source": "data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%20if%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20
%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20for%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring
%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "verification": {} + }, + "mode": 755 } ] }, @@ -162,6 +173,16 @@ "enabled": true, "name": "docker.service" }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "kubelet-healthcheck.service" + }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "docker-healthcheck.service" + }, { "contents": "[Unit]\nDescription=Kubernetes Kubelet\nRequires=docker.service\nAfter=docker.service\n[Service]\nTimeoutStartSec=5min\nEnvironment=KUBELET_IMAGE=docker://k8s.gcr.io/hyperkube-amd64:v1.9.2\nEnvironment=\"RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \\\n --insecure-options=image \\\n --volume=resolv,kind=host,source=/etc/resolv.conf \\\n --mount volume=resolv,target=/etc/resolv.conf \\\n --volume cni-bin,kind=host,source=/opt/cni/bin \\\n --mount volume=cni-bin,target=/opt/cni/bin \\\n --volume cni-conf,kind=host,source=/etc/cni/net.d \\\n --mount volume=cni-conf,target=/etc/cni/net.d \\\n --volume etc-kubernetes,kind=host,source=/etc/kubernetes \\\n --mount volume=etc-kubernetes,target=/etc/kubernetes \\\n --volume var-log,kind=host,source=/var/log \\\n --mount volume=var-log,target=/var/log \\\n --volume var-lib-calico,kind=host,source=/var/lib/calico \\\n --mount volume=var-lib-calico,target=/var/lib/calico\"\nExecStartPre=/bin/mkdir -p /var/lib/calico\nExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests\nExecStartPre=/bin/mkdir -p /etc/cni/net.d\nExecStartPre=/bin/mkdir -p /opt/cni/bin\nExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid\nExecStart=/usr/lib/coreos/kubelet-wrapper \\\n --container-runtime=docker \\\n --allow-privileged=true \\\n --cni-bin-dir=/opt/cni/bin \\\n --cni-conf-dir=/etc/cni/net.d \\\n --cluster-dns=10.10.10.10 \\\n --cluster-domain=cluster.local \\\n --authentication-token-webhook=true \\\n --hostname-override=node1 \\\n --network-plugin=cni \\\n --cloud-provider=openstack \\\n --cloud-config=/etc/kubernetes/cloud-config \\\n --cert-dir=/etc/kubernetes/ \\\n --pod-manifest-path=/etc/kubernetes/manifests \\\n --resolv-conf=/etc/resolv.conf \\\n --rotate-certificates=true \\\n --kubeconfig=/etc/kubernetes/kubeconfig \\\n --bootstrap-kubeconfig=/etc/kubernetes/bootstrap.kubeconfig \\\n --lock-file=/var/run/lock/kubelet.lock \\\n --exit-on-lock-contention \\\n --read-only-port=0 \\\n --protect-kernel-defaults=true \\\n --authorization-mode=Webhook \\\n --anonymous-auth=false \\\n --client-ca-file=/etc/kubernetes/ca.crt\nExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid\nRestart=always\nRestartSec=10\n[Install]\nWantedBy=multi-user.target\n", "dropins": [ diff --git a/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden b/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden index faddcda21..d0a960d8b 100644 --- a/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden +++ b/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden @@ -153,6 +153,17 @@ 
"verification": {} }, "mode": 420 + }, + { + "filesystem": "root", + "group": {}, + "path": "/opt/bin/health-monitor.sh", + "user": {}, + "contents": { + "source": "data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%20if%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if
%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20for%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "verification": {} + }, + "mode": 755 } ] }, @@ -162,6 +173,16 @@ "enabled": true, "name": "docker.service" }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "kubelet-healthcheck.service" + }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "docker-healthcheck.service" + }, { "contents": "[Unit]\nDescription=Kubernetes 
Kubelet\nRequires=docker.service\nAfter=docker.service\n[Service]\nTimeoutStartSec=5min\nEnvironment=KUBELET_IMAGE=docker://k8s.gcr.io/hyperkube-amd64:v1.10.3\nEnvironment=\"RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \\\n --insecure-options=image \\\n --volume=resolv,kind=host,source=/etc/resolv.conf \\\n --mount volume=resolv,target=/etc/resolv.conf \\\n --volume cni-bin,kind=host,source=/opt/cni/bin \\\n --mount volume=cni-bin,target=/opt/cni/bin \\\n --volume cni-conf,kind=host,source=/etc/cni/net.d \\\n --mount volume=cni-conf,target=/etc/cni/net.d \\\n --volume etc-kubernetes,kind=host,source=/etc/kubernetes \\\n --mount volume=etc-kubernetes,target=/etc/kubernetes \\\n --volume var-log,kind=host,source=/var/log \\\n --mount volume=var-log,target=/var/log \\\n --volume var-lib-calico,kind=host,source=/var/lib/calico \\\n --mount volume=var-lib-calico,target=/var/lib/calico\"\nExecStartPre=/bin/mkdir -p /var/lib/calico\nExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests\nExecStartPre=/bin/mkdir -p /etc/cni/net.d\nExecStartPre=/bin/mkdir -p /opt/cni/bin\nExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid\nExecStart=/usr/lib/coreos/kubelet-wrapper \\\n --container-runtime=docker \\\n --allow-privileged=true \\\n --cni-bin-dir=/opt/cni/bin \\\n --cni-conf-dir=/etc/cni/net.d \\\n --cluster-dns=10.10.10.10,10.10.10.11,10.10.10.12 \\\n --cluster-domain=cluster.local \\\n --authentication-token-webhook=true \\\n --hostname-override=node1 \\\n --network-plugin=cni \\\n --cloud-provider=openstack \\\n --cloud-config=/etc/kubernetes/cloud-config \\\n --cert-dir=/etc/kubernetes/ \\\n --pod-manifest-path=/etc/kubernetes/manifests \\\n --resolv-conf=/etc/resolv.conf \\\n --rotate-certificates=true \\\n --kubeconfig=/etc/kubernetes/kubeconfig \\\n --bootstrap-kubeconfig=/etc/kubernetes/bootstrap.kubeconfig \\\n --lock-file=/var/run/lock/kubelet.lock \\\n --exit-on-lock-contention \\\n --read-only-port=0 \\\n --protect-kernel-defaults=true \\\n --authorization-mode=Webhook \\\n --anonymous-auth=false \\\n --client-ca-file=/etc/kubernetes/ca.crt\nExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid\nRestart=always\nRestartSec=10\n[Install]\nWantedBy=multi-user.target\n", "dropins": [ diff --git a/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden b/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden index 00e0f0f6f..979552fec 100644 --- a/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden +++ b/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden @@ -160,6 +160,17 @@ "verification": {} }, "mode": 420 + }, + { + "filesystem": "root", + "group": {}, + "path": "/opt/bin/health-monitor.sh", + "user": {}, + "contents": { + "source": 
"data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%20if%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20f
or%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "verification": {} + }, + "mode": 755 } ] }, @@ -177,6 +188,16 @@ "enabled": true, "name": "docker.service" }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "kubelet-healthcheck.service" + }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "docker-healthcheck.service" + }, { "contents": "[Unit]\nDescription=Kubernetes Kubelet\nRequires=docker.service\nAfter=docker.service\n[Service]\nTimeoutStartSec=5min\nEnvironment=KUBELET_IMAGE=docker://k8s.gcr.io/hyperkube-amd64:v1.11.2\nEnvironment=\"RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \\\n --insecure-options=image \\\n 
--volume=resolv,kind=host,source=/etc/resolv.conf \\\n --mount volume=resolv,target=/etc/resolv.conf \\\n --volume cni-bin,kind=host,source=/opt/cni/bin \\\n --mount volume=cni-bin,target=/opt/cni/bin \\\n --volume cni-conf,kind=host,source=/etc/cni/net.d \\\n --mount volume=cni-conf,target=/etc/cni/net.d \\\n --volume etc-kubernetes,kind=host,source=/etc/kubernetes \\\n --mount volume=etc-kubernetes,target=/etc/kubernetes \\\n --volume var-log,kind=host,source=/var/log \\\n --mount volume=var-log,target=/var/log \\\n --volume var-lib-calico,kind=host,source=/var/lib/calico \\\n --mount volume=var-lib-calico,target=/var/lib/calico\"\nExecStartPre=/bin/mkdir -p /var/lib/calico\nExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests\nExecStartPre=/bin/mkdir -p /etc/cni/net.d\nExecStartPre=/bin/mkdir -p /opt/cni/bin\nExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid\nExecStart=/usr/lib/coreos/kubelet-wrapper \\\n --container-runtime=docker \\\n --allow-privileged=true \\\n --cni-bin-dir=/opt/cni/bin \\\n --cni-conf-dir=/etc/cni/net.d \\\n --cluster-dns=10.10.10.10 \\\n --cluster-domain=cluster.local \\\n --authentication-token-webhook=true \\\n --hostname-override=node1 \\\n --network-plugin=cni \\\n --cloud-provider=vsphere \\\n --cloud-config=/etc/kubernetes/cloud-config \\\n --cert-dir=/etc/kubernetes/ \\\n --pod-manifest-path=/etc/kubernetes/manifests \\\n --resolv-conf=/etc/resolv.conf \\\n --rotate-certificates=true \\\n --kubeconfig=/etc/kubernetes/kubeconfig \\\n --bootstrap-kubeconfig=/etc/kubernetes/bootstrap.kubeconfig \\\n --lock-file=/var/run/lock/kubelet.lock \\\n --exit-on-lock-contention \\\n --read-only-port=0 \\\n --protect-kernel-defaults=true \\\n --authorization-mode=Webhook \\\n --anonymous-auth=false \\\n --client-ca-file=/etc/kubernetes/ca.crt\nExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid\nRestart=always\nRestartSec=10\n[Install]\nWantedBy=multi-user.target\n", "dropins": [ diff --git a/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden b/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden index 573c01f6d..f4a00f1a3 100644 --- a/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden +++ b/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden @@ -149,6 +149,17 @@ "verification": {} }, "mode": 420 + }, + { + "filesystem": "root", + "group": {}, + "path": "/opt/bin/health-monitor.sh", + "user": {}, + "contents": { + "source": 
"data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%20if%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20f
or%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "verification": {} + }, + "mode": 755 } ] }, @@ -166,6 +177,16 @@ "enabled": true, "name": "docker.service" }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "kubelet-healthcheck.service" + }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "docker-healthcheck.service" + }, { "contents": "[Unit]\nDescription=Kubernetes Kubelet\nRequires=docker.service\nAfter=docker.service\n[Service]\nTimeoutStartSec=5min\nEnvironment=KUBELET_IMAGE=docker://k8s.gcr.io/hyperkube-amd64:v1.12.0\nEnvironment=\"RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \\\n --insecure-options=image \\\n 
--volume=resolv,kind=host,source=/etc/resolv.conf \\\n --mount volume=resolv,target=/etc/resolv.conf \\\n --volume cni-bin,kind=host,source=/opt/cni/bin \\\n --mount volume=cni-bin,target=/opt/cni/bin \\\n --volume cni-conf,kind=host,source=/etc/cni/net.d \\\n --mount volume=cni-conf,target=/etc/cni/net.d \\\n --volume etc-kubernetes,kind=host,source=/etc/kubernetes \\\n --mount volume=etc-kubernetes,target=/etc/kubernetes \\\n --volume var-log,kind=host,source=/var/log \\\n --mount volume=var-log,target=/var/log \\\n --volume var-lib-calico,kind=host,source=/var/lib/calico \\\n --mount volume=var-lib-calico,target=/var/lib/calico\"\nExecStartPre=/bin/mkdir -p /var/lib/calico\nExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests\nExecStartPre=/bin/mkdir -p /etc/cni/net.d\nExecStartPre=/bin/mkdir -p /opt/cni/bin\nExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid\nExecStart=/usr/lib/coreos/kubelet-wrapper \\\n --container-runtime=docker \\\n --allow-privileged=true \\\n --cni-bin-dir=/opt/cni/bin \\\n --cni-conf-dir=/etc/cni/net.d \\\n --cluster-dns=10.10.10.10 \\\n --cluster-domain=cluster.local \\\n --authentication-token-webhook=true \\\n --hostname-override=node1 \\\n --network-plugin=cni \\\n --cloud-provider=vsphere \\\n --cloud-config=/etc/kubernetes/cloud-config \\\n --cert-dir=/etc/kubernetes/ \\\n --pod-manifest-path=/etc/kubernetes/manifests \\\n --resolv-conf=/etc/resolv.conf \\\n --rotate-certificates=true \\\n --kubeconfig=/etc/kubernetes/kubeconfig \\\n --bootstrap-kubeconfig=/etc/kubernetes/bootstrap.kubeconfig \\\n --lock-file=/var/run/lock/kubelet.lock \\\n --exit-on-lock-contention \\\n --read-only-port=0 \\\n --protect-kernel-defaults=true \\\n --authorization-mode=Webhook \\\n --anonymous-auth=false \\\n --client-ca-file=/etc/kubernetes/ca.crt\nExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid\nRestart=always\nRestartSec=10\n[Install]\nWantedBy=multi-user.target\n", "dropins": [ diff --git a/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden b/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden index 03c291c09..163eae849 100644 --- a/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden +++ b/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden @@ -153,6 +153,17 @@ "verification": {} }, "mode": 420 + }, + { + "filesystem": "root", + "group": {}, + "path": "/opt/bin/health-monitor.sh", + "user": {}, + "contents": { + "source": 
"data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%20if%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20f
or%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "verification": {} + }, + "mode": 755 } ] }, @@ -170,6 +181,16 @@ "enabled": true, "name": "docker.service" }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "kubelet-healthcheck.service" + }, + { + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "docker-healthcheck.service" + }, { "contents": "[Unit]\nDescription=Kubernetes Kubelet\nRequires=docker.service\nAfter=docker.service\n[Service]\nTimeoutStartSec=5min\nEnvironment=KUBELET_IMAGE=docker://k8s.gcr.io/hyperkube-amd64:v1.9.2\nEnvironment=\"RKT_RUN_ARGS=--uuid-file-save=/var/cache/kubelet-pod.uuid \\\n --insecure-options=image \\\n 
--volume=resolv,kind=host,source=/etc/resolv.conf \\\n --mount volume=resolv,target=/etc/resolv.conf \\\n --volume cni-bin,kind=host,source=/opt/cni/bin \\\n --mount volume=cni-bin,target=/opt/cni/bin \\\n --volume cni-conf,kind=host,source=/etc/cni/net.d \\\n --mount volume=cni-conf,target=/etc/cni/net.d \\\n --volume etc-kubernetes,kind=host,source=/etc/kubernetes \\\n --mount volume=etc-kubernetes,target=/etc/kubernetes \\\n --volume var-log,kind=host,source=/var/log \\\n --mount volume=var-log,target=/var/log \\\n --volume var-lib-calico,kind=host,source=/var/lib/calico \\\n --mount volume=var-lib-calico,target=/var/lib/calico\"\nExecStartPre=/bin/mkdir -p /var/lib/calico\nExecStartPre=/bin/mkdir -p /etc/kubernetes/manifests\nExecStartPre=/bin/mkdir -p /etc/cni/net.d\nExecStartPre=/bin/mkdir -p /opt/cni/bin\nExecStartPre=-/usr/bin/rkt rm --uuid-file=/var/cache/kubelet-pod.uuid\nExecStart=/usr/lib/coreos/kubelet-wrapper \\\n --container-runtime=docker \\\n --allow-privileged=true \\\n --cni-bin-dir=/opt/cni/bin \\\n --cni-conf-dir=/etc/cni/net.d \\\n --cluster-dns=10.10.10.10 \\\n --cluster-domain=cluster.local \\\n --authentication-token-webhook=true \\\n --hostname-override=node1 \\\n --network-plugin=cni \\\n --cloud-provider=aws \\\n --cloud-config=/etc/kubernetes/cloud-config \\\n --cert-dir=/etc/kubernetes/ \\\n --pod-manifest-path=/etc/kubernetes/manifests \\\n --resolv-conf=/etc/resolv.conf \\\n --rotate-certificates=true \\\n --kubeconfig=/etc/kubernetes/kubeconfig \\\n --bootstrap-kubeconfig=/etc/kubernetes/bootstrap.kubeconfig \\\n --lock-file=/var/run/lock/kubelet.lock \\\n --exit-on-lock-contention \\\n --read-only-port=0 \\\n --protect-kernel-defaults=true \\\n --authorization-mode=Webhook \\\n --anonymous-auth=false \\\n --client-ca-file=/etc/kubernetes/ca.crt\nExecStop=-/usr/bin/rkt stop --uuid-file=/var/cache/kubelet-pod.uuid\nRestart=always\nRestartSec=10\n[Install]\nWantedBy=multi-user.target\n", "dropins": [ diff --git a/pkg/userdata/coreos/userdata.go b/pkg/userdata/coreos/userdata.go index 17334f53d..888336be8 100644 --- a/pkg/userdata/coreos/userdata.go +++ b/pkg/userdata/coreos/userdata.go @@ -175,6 +175,32 @@ systemd: - name: docker.service enabled: true + - name: kubelet-healthcheck.service + enabled: true + contents: | + [Unit] + Requires=network-online.target + After=network-online.target + + [Service] + ExecStart=/opt/bin/health-monitor.sh kubelet + + [Install] + WantedBy=multi-user.target + + - name: docker-healthcheck.service + enabled: true + contents: | + [Unit] + Requires=network-online.target + After=network-online.target + + [Service] + ExecStart=/opt/bin/health-monitor.sh container-runtime + + [Install] + WantedBy=multi-user.target + - name: kubelet.service enabled: true dropins: @@ -345,4 +371,121 @@ storage: inline: | [Service] Environment=DOCKER_OPTS=--storage-driver=overlay2 + + - path: /opt/bin/health-monitor.sh + filesystem: root + mode: 755 + # This script is a slightly adjusted version of + # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh + # Adjustments are: + # * Kubelet health port is 10248 not 10255 + # * Removal of all all references to the KUBE_ENV file + contents: + inline: | + #!/usr/bin/env bash + + # Copyright 2016 The Kubernetes Authors. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. 
+ # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + # This script is for master and node instance health monitoring, which is + # packed in kube-manifest tarball. It is executed through a systemd service + # in cluster/gce/gci/.yaml. The env variables come from an env + # file provided by the systemd service. + + set -o nounset + set -o pipefail + + # We simply kill the process when there is a failure. Another systemd service will + # automatically restart the process. + function container_runtime_monitoring { + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use 'docker ps' when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" + done + while true; do + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi + systemctl kill --kill-who=main "${container_runtime_name}" + # Wait for a while, as we don't want to kill it again before it is really up. + sleep 120 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + function kubelet_monitoring { + echo "Wait for 2 minutes for kubelet to be functional" + # TODO(andyzheng0831): replace it with a more reliable method if possible. + sleep 120 + local -r max_seconds=10 + local output="" + while [ 1 ]; do + if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then + # Print the response and/or errors. + echo $output + echo "Kubelet is unhealthy!" + systemctl kill kubelet + # Wait for a while, as we don't want to kill it again before it is really up. 
+ sleep 60 + else + sleep "${SLEEP_SECONDS}" + fi + done + } + + + ############## Main Function ################ + if [[ "$#" -ne 1 ]]; then + echo "Usage: health-monitor.sh " + exit 1 + fi + + KUBE_HOME="/home/kubernetes" + + SLEEP_SECONDS=10 + component=$1 + echo "Start kubernetes health monitoring for ${component}" + if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring + elif [[ "${component}" == "kubelet" ]]; then + kubelet_monitoring + else + echo "Health monitoring for component "${component}" is not supported!" + fi ` From 34869bb43aaefa2a16a0b3bae59bb80360d9c145 Mon Sep 17 00:00:00 2001 From: Alvaro Aleman Date: Fri, 12 Oct 2018 15:29:15 +0200 Subject: [PATCH 3/4] Download healthcheck script instead if putting it into userdata because of size limits --- .../centos/testdata/kubelet-v1.10-aws.golden | 127 ++-------------- .../centos/testdata/kubelet-v1.11-aws.golden | 127 ++-------------- .../centos/testdata/kubelet-v1.12-aws.golden | 127 ++-------------- .../centos/testdata/kubelet-v1.9-aws.golden | 127 ++-------------- pkg/userdata/centos/userdata.go | 127 ++-------------- ...-openstack-kubelet-v-version-prefix.golden | 13 +- ...-auto-update-openstack-multiple-dns.golden | 13 +- .../v1.11.2-vsphere-static-ipconfig.golden | 13 +- ....12.0-vsphere-overwrite-cloudconfig.golden | 13 +- .../v1.9.2-disable-auto-update-aws.golden | 13 +- pkg/userdata/coreos/userdata.go | 137 +++--------------- pkg/userdata/ubuntu/testdata/1.11-aws.golden | 121 +--------------- .../1.9.2-dist-upgrade-on-boot-aws.golden | 121 +--------------- .../1.9.2-openstack-multiple-dns.golden | 121 +--------------- .../openstack-kubelet-v-version-prefix.golden | 121 +--------------- .../openstack-overwrite-cloud-config.golden | 121 +--------------- pkg/userdata/ubuntu/userdata.go | 121 +--------------- 17 files changed, 164 insertions(+), 1399 deletions(-) diff --git a/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden index 7e3fa40fb..1d0331599 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden @@ -83,6 +83,12 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token my-token \ @@ -91,6 +97,12 @@ write_files: server:443 fi + if [[ ! 
-x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + systemctl enable --now --no-block kubelet-healthcheck.service systemctl enable --now --no-block docker-healthcheck.service @@ -143,120 +155,5 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. 
- pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden index 503007bc4..dfffb6818 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden @@ -79,6 +79,12 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token my-token \ @@ -87,6 +93,12 @@ write_files: server:443 fi + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + systemctl enable --now --no-block kubelet-healthcheck.service systemctl enable --now --no-block docker-healthcheck.service @@ -139,120 +151,5 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. 
- # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. 
- sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden index 144a82e18..1728ae3bd 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden @@ -79,6 +79,12 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token my-token \ @@ -87,6 +93,12 @@ write_files: server:443 fi + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + systemctl enable --now --no-block kubelet-healthcheck.service systemctl enable --now --no-block docker-healthcheck.service @@ -139,120 +151,5 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. 
- function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden index 70093e76e..b99c0cac2 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden @@ -83,6 +83,12 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! 
[[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token my-token \ @@ -91,6 +97,12 @@ write_files: server:443 fi + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + systemctl enable --now --no-block kubelet-healthcheck.service systemctl enable --now --no-block docker-healthcheck.service @@ -143,120 +155,5 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. 
- pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/centos/userdata.go b/pkg/userdata/centos/userdata.go index cdbef9453..2ba37678c 100644 --- a/pkg/userdata/centos/userdata.go +++ b/pkg/userdata/centos/userdata.go @@ -230,6 +230,12 @@ write_files: systemctl enable --now docker systemctl enable --now kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token {{ .BoostrapToken }} \ @@ -240,6 +246,12 @@ write_files: {{ .ServerAddr }} fi + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + systemctl enable --now --no-block kubelet-healthcheck.service systemctl enable --now --no-block docker-healthcheck.service @@ -292,121 +304,6 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" 
- fi - runcmd: - systemctl enable --now setup.service ` diff --git a/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden b/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden index 78f7d1de9..83c7e594f 100644 --- a/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden +++ b/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden @@ -157,10 +157,10 @@ { "filesystem": "root", "group": {}, - "path": "/opt/bin/health-monitor.sh", + "path": "/opt/bin/download-healthcheck-script.sh", "user": {}, "contents": { - "source": "data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%20if%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi
%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20for%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "source": 
"data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0Aset%20-xeuo%20pipefail%0Auntil%20%5B%5B%20-x%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5D%5D%3B%20do%0A%20%20curl%20-Lfo%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5C%0A%20%20%20%20https%3A%2F%2Fraw.githubusercontent.com%2Fkubermatic%2Fmachine-controller%2F8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e%2Fpkg%2Fuserdata%2Fscripts%2Fhealth-monitor.sh%0A%20%20chmod%20%2Bx%20%2Fopt%2Fbin%2Fhealth-monitor.sh%0Adone%0A", "verification": {} }, "mode": 755 @@ -174,12 +174,17 @@ "name": "docker.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n[Service]\nType=oneshot\nExecStart=/opt/bin/download-healthcheck-script.sh\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "download-healthcheck-script.service" + }, + { + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "kubelet-healthcheck.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "docker-healthcheck.service" }, diff --git a/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden b/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden index d0a960d8b..844ffc0b9 100644 --- a/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden +++ b/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden @@ -157,10 +157,10 @@ { "filesystem": "root", "group": {}, - "path": "/opt/bin/health-monitor.sh", + "path": "/opt/bin/download-healthcheck-script.sh", "user": {}, "contents": { - "source": 
"data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%20if%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20f
or%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "source": "data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0Aset%20-xeuo%20pipefail%0Auntil%20%5B%5B%20-x%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5D%5D%3B%20do%0A%20%20curl%20-Lfo%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5C%0A%20%20%20%20https%3A%2F%2Fraw.githubusercontent.com%2Fkubermatic%2Fmachine-controller%2F8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e%2Fpkg%2Fuserdata%2Fscripts%2Fhealth-monitor.sh%0A%20%20chmod%20%2Bx%20%2Fopt%2Fbin%2Fhealth-monitor.sh%0Adone%0A", "verification": {} }, "mode": 755 @@ -174,12 +174,17 @@ "name": "docker.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n[Service]\nType=oneshot\nExecStart=/opt/bin/download-healthcheck-script.sh\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": 
"download-healthcheck-script.service" + }, + { + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "kubelet-healthcheck.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "docker-healthcheck.service" }, diff --git a/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden b/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden index 979552fec..196959cdc 100644 --- a/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden +++ b/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden @@ -164,10 +164,10 @@ { "filesystem": "root", "group": {}, - "path": "/opt/bin/health-monitor.sh", + "path": "/opt/bin/download-healthcheck-script.sh", "user": {}, "contents": { - "source": "data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%
20if%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20for%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%
24%7Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "source": "data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0Aset%20-xeuo%20pipefail%0Auntil%20%5B%5B%20-x%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5D%5D%3B%20do%0A%20%20curl%20-Lfo%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5C%0A%20%20%20%20https%3A%2F%2Fraw.githubusercontent.com%2Fkubermatic%2Fmachine-controller%2F8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e%2Fpkg%2Fuserdata%2Fscripts%2Fhealth-monitor.sh%0A%20%20chmod%20%2Bx%20%2Fopt%2Fbin%2Fhealth-monitor.sh%0Adone%0A", "verification": {} }, "mode": 755 @@ -189,12 +189,17 @@ "name": "docker.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n[Service]\nType=oneshot\nExecStart=/opt/bin/download-healthcheck-script.sh\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "download-healthcheck-script.service" + }, + { + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "kubelet-healthcheck.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "docker-healthcheck.service" }, diff --git a/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden b/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden index f4a00f1a3..ddba5d93d 100644 --- a/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden +++ b/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden @@ -153,10 +153,10 @@ { "filesystem": "root", "group": {}, - "path": "/opt/bin/health-monitor.sh", + "path": "/opt/bin/download-healthcheck-script.sh", "user": {}, "contents": { - "source": 
"data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%20if%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20f
or%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "source": "data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0Aset%20-xeuo%20pipefail%0Auntil%20%5B%5B%20-x%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5D%5D%3B%20do%0A%20%20curl%20-Lfo%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5C%0A%20%20%20%20https%3A%2F%2Fraw.githubusercontent.com%2Fkubermatic%2Fmachine-controller%2F8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e%2Fpkg%2Fuserdata%2Fscripts%2Fhealth-monitor.sh%0A%20%20chmod%20%2Bx%20%2Fopt%2Fbin%2Fhealth-monitor.sh%0Adone%0A", "verification": {} }, "mode": 755 @@ -178,12 +178,17 @@ "name": "docker.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n[Service]\nType=oneshot\nExecStart=/opt/bin/download-healthcheck-script.sh\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": 
"download-healthcheck-script.service" + }, + { + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "kubelet-healthcheck.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "docker-healthcheck.service" }, diff --git a/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden b/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden index 163eae849..1531cd6ea 100644 --- a/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden +++ b/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden @@ -157,10 +157,10 @@ { "filesystem": "root", "group": {}, - "path": "/opt/bin/health-monitor.sh", + "path": "/opt/bin/download-healthcheck-script.sh", "user": {}, "contents": { - "source": "data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0A%0A%23%20Copyright%202016%20The%20Kubernetes%20Authors.%0A%23%0A%23%20Licensed%20under%20the%20Apache%20License%2C%20Version%202.0%20(the%20%22License%22)%3B%0A%23%20you%20may%20not%20use%20this%20file%20except%20in%20compliance%20with%20the%20License.%0A%23%20You%20may%20obtain%20a%20copy%20of%20the%20License%20at%0A%23%0A%23%20%20%20%20%20http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-2.0%0A%23%0A%23%20Unless%20required%20by%20applicable%20law%20or%20agreed%20to%20in%20writing%2C%20software%0A%23%20distributed%20under%20the%20License%20is%20distributed%20on%20an%20%22AS%20IS%22%20BASIS%2C%0A%23%20WITHOUT%20WARRANTIES%20OR%20CONDITIONS%20OF%20ANY%20KIND%2C%20either%20express%20or%20implied.%0A%23%20See%20the%20License%20for%20the%20specific%20language%20governing%20permissions%20and%0A%23%20limitations%20under%20the%20License.%0A%0A%23%20This%20script%20is%20for%20master%20and%20node%20instance%20health%20monitoring%2C%20which%20is%0A%23%20packed%20in%20kube-manifest%20tarball.%20It%20is%20executed%20through%20a%20systemd%20service%0A%23%20in%20cluster%2Fgce%2Fgci%2F%3Cmaster%2Fnode%3E.yaml.%20The%20env%20variables%20come%20from%20an%20env%0A%23%20file%20provided%20by%20the%20systemd%20service.%0A%0Aset%20-o%20nounset%0Aset%20-o%20pipefail%0A%0A%23%20We%20simply%20kill%20the%20process%20when%20there%20is%20a%20failure.%20Another%20systemd%20service%20will%0A%23%20automatically%20restart%20the%20process.%0Afunction%20container_runtime_monitoring%20%7B%0A%20%20local%20-r%20max_attempts%3D5%0A%20%20local%20attempt%3D1%0A%20%20local%20-r%20crictl%3D%22%24%7BKUBE_HOME%7D%2Fbin%2Fcrictl%22%0A%20%20local%20-r%20container_runtime_name%3D%22%24%7BCONTAINER_RUNTIME_NAME%3A-docker%7D%22%0A%20%20%23%20We%20still%20need%20to%20use%20'docker%20ps'%20when%20container%20runtime%20is%20%22docker%22.%20This%20is%20because%0A%20%20%23%20dockershim%20is%20still%20part%20of%20kubelet%20today.%20When%20kubelet%20is%20down%2C%20crictl%20pods%0A%20%20%23%20will%20also%20fail%2C%20and%20docker%20will%20be%20killed.%20This%20is%20undesirable%20especially%20when%0A%20%20%23%20docker%20live%20restore%20is%20disabled.%0A%20%20local%20healthcheck_command%3D%22docker%20ps%22%0A%20%20if
%20%5B%5B%20%22%24%7BCONTAINER_RUNTIME%3A-docker%7D%22%20!%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20healthcheck_command%3D%22%24%7Bcrictl%7D%20pods%22%0A%20%20fi%0A%20%20%23%20Container%20runtime%20startup%20takes%20time.%20Make%20initial%20attempts%20before%20starting%0A%20%20%23%20killing%20the%20container%20runtime.%0A%20%20until%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20do%0A%20%20%20%20if%20((%20attempt%20%3D%3D%20max_attempts%20))%3B%20then%0A%20%20%20%20%20%20echo%20%22Max%20attempt%20%24%7Bmax_attempts%7D%20reached!%20Proceeding%20to%20monitor%20container%20runtime%20healthiness.%22%0A%20%20%20%20%20%20break%0A%20%20%20%20fi%0A%20%20%20%20echo%20%22%24attempt%20initial%20attempt%20%5C%22%24%7Bhealthcheck_command%7D%5C%22!%20Trying%20again%20in%20%24attempt%20seconds...%22%0A%20%20%20%20sleep%20%22%24((%202%20**%20attempt%2B%2B%20))%22%0A%20%20done%0A%20%20while%20true%3B%20do%0A%20%20%20%20if%20!%20timeout%2060%20%24%7Bhealthcheck_command%7D%20%3E%20%2Fdev%2Fnull%3B%20then%0A%20%20%20%20%20%20echo%20%22Container%20runtime%20%24%7Bcontainer_runtime_name%7D%20failed!%22%0A%20%20%20%20%20%20if%20%5B%5B%20%22%24container_runtime_name%22%20%3D%3D%20%22docker%22%20%5D%5D%3B%20then%0A%20%20%20%20%20%20%20%20%20%20%23%20Dump%20stack%20of%20docker%20daemon%20for%20investigation.%0A%20%20%20%20%20%20%20%20%20%20%23%20Log%20fle%20name%20looks%20like%20goroutine-stacks-TIMESTAMP%20and%20will%20be%20saved%20to%0A%20%20%20%20%20%20%20%20%20%20%23%20the%20exec%20root%20directory%2C%20which%20is%20%2Fvar%2Frun%2Fdocker%2F%20on%20Ubuntu%20and%20COS.%0A%20%20%20%20%20%20%20%20%20%20pkill%20-SIGUSR1%20dockerd%0A%20%20%20%20%20%20fi%0A%20%20%20%20%20%20systemctl%20kill%20--kill-who%3Dmain%20%22%24%7Bcontainer_runtime_name%7D%22%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%20120%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0Afunction%20kubelet_monitoring%20%7B%0A%20%20echo%20%22Wait%20for%202%20minutes%20for%20kubelet%20to%20be%20functional%22%0A%20%20%23%20TODO(andyzheng0831)%3A%20replace%20it%20with%20a%20more%20reliable%20method%20if%20possible.%0A%20%20sleep%20120%0A%20%20local%20-r%20max_seconds%3D10%0A%20%20local%20output%3D%22%22%0A%20%20while%20%5B%201%20%5D%3B%20do%0A%20%20%20%20if%20!%20output%3D%24(curl%20-m%20%22%24%7Bmax_seconds%7D%22%20-f%20-s%20-S%20http%3A%2F%2F127.0.0.1%3A10248%2Fhealthz%202%3E%261)%3B%20then%0A%20%20%20%20%20%20%23%20Print%20the%20response%20and%2For%20errors.%0A%20%20%20%20%20%20echo%20%24output%0A%20%20%20%20%20%20echo%20%22Kubelet%20is%20unhealthy!%22%0A%20%20%20%20%20%20systemctl%20kill%20kubelet%0A%20%20%20%20%20%20%23%20Wait%20for%20a%20while%2C%20as%20we%20don't%20want%20to%20kill%20it%20again%20before%20it%20is%20really%20up.%0A%20%20%20%20%20%20sleep%2060%0A%20%20%20%20else%0A%20%20%20%20%20%20sleep%20%22%24%7BSLEEP_SECONDS%7D%22%0A%20%20%20%20fi%0A%20%20done%0A%7D%0A%0A%0A%23%23%23%23%23%23%23%23%23%23%23%23%23%23%20Main%20Function%20%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%23%0Aif%20%5B%5B%20%22%24%23%22%20-ne%201%20%5D%5D%3B%20then%0A%20%20echo%20%22Usage%3A%20health-monitor.sh%20%3Ccontainer-runtime%2Fkubelet%3E%22%0A%20%20exit%201%0Afi%0A%0AKUBE_HOME%3D%22%2Fhome%2Fkubernetes%22%0A%0ASLEEP_SECONDS%3D10%0Acomponent%3D%241%0Aecho%20%22Start%20kubernetes%20health%20monitoring%20for%20%24%7Bcomponent%7D%22%0Aif%20%5B%5B%20%22%24%7
Bcomponent%7D%22%20%3D%3D%20%22container-runtime%22%20%5D%5D%3B%20then%0A%20%20container_runtime_monitoring%0Aelif%20%5B%5B%20%22%24%7Bcomponent%7D%22%20%3D%3D%20%22kubelet%22%20%5D%5D%3B%20then%0A%20%20kubelet_monitoring%0Aelse%0A%20%20echo%20%22Health%20monitoring%20for%20component%20%22%24%7Bcomponent%7D%22%20is%20not%20supported!%22%0Afi%0A", + "source": "data:,%23!%2Fusr%2Fbin%2Fenv%20bash%0Aset%20-xeuo%20pipefail%0Auntil%20%5B%5B%20-x%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5D%5D%3B%20do%0A%20%20curl%20-Lfo%20%2Fopt%2Fbin%2Fhealth-monitor.sh%20%5C%0A%20%20%20%20https%3A%2F%2Fraw.githubusercontent.com%2Fkubermatic%2Fmachine-controller%2F8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e%2Fpkg%2Fuserdata%2Fscripts%2Fhealth-monitor.sh%0A%20%20chmod%20%2Bx%20%2Fopt%2Fbin%2Fhealth-monitor.sh%0Adone%0A", "verification": {} }, "mode": 755 @@ -182,12 +182,17 @@ "name": "docker.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n[Service]\nType=oneshot\nExecStart=/opt/bin/download-healthcheck-script.sh\n[Install]\nWantedBy=multi-user.target\n", + "enabled": true, + "name": "download-healthcheck-script.service" + }, + { + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "kubelet-healthcheck.service" }, { - "contents": "[Unit]\nRequires=network-online.target\nAfter=network-online.target\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", + "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n", "enabled": true, "name": "docker-healthcheck.service" }, diff --git a/pkg/userdata/coreos/userdata.go b/pkg/userdata/coreos/userdata.go index 888336be8..dbadfd80a 100644 --- a/pkg/userdata/coreos/userdata.go +++ b/pkg/userdata/coreos/userdata.go @@ -175,12 +175,25 @@ systemd: - name: docker.service enabled: true - - name: kubelet-healthcheck.service + - name: download-healthcheck-script.service enabled: true contents: | [Unit] Requires=network-online.target After=network-online.target + [Service] + Type=oneshot + ExecStart=/opt/bin/download-healthcheck-script.sh + [Install] + WantedBy=multi-user.target + + + - name: kubelet-healthcheck.service + enabled: true + contents: | + [Unit] + Requires=download-healthcheck-script.service + After=download-healthcheck-script.service [Service] ExecStart=/opt/bin/health-monitor.sh kubelet @@ -192,8 +205,8 @@ systemd: enabled: true contents: | [Unit] - Requires=network-online.target - After=network-online.target + Requires=download-healthcheck-script.service + After=download-healthcheck-script.service [Service] ExecStart=/opt/bin/health-monitor.sh container-runtime @@ -372,120 +385,16 @@ storage: [Service] Environment=DOCKER_OPTS=--storage-driver=overlay2 - - path: /opt/bin/health-monitor.sh + - path: /opt/bin/download-healthcheck-script.sh filesystem: root mode: 755 - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # 
Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file contents: inline: | #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. 
- sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi + set -xeuo pipefail + until [[ -x /opt/bin/health-monitor.sh ]]; do + curl -Lfo /opt/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /opt/bin/health-monitor.sh + done ` diff --git a/pkg/userdata/ubuntu/testdata/1.11-aws.golden b/pkg/userdata/ubuntu/testdata/1.11-aws.golden index b7a07004a..97a806db5 100644 --- a/pkg/userdata/ubuntu/testdata/1.11-aws.golden +++ b/pkg/userdata/ubuntu/testdata/1.11-aws.golden @@ -98,6 +98,12 @@ write_files: systemctl enable --now docker systemctl enable kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token my-token \ @@ -267,120 +273,5 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. 
This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden b/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden index 567731b13..5f89bbf3d 100644 --- a/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden +++ b/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden @@ -99,6 +99,12 @@ write_files: systemctl enable --now docker systemctl enable kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! 
[[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token my-token \ @@ -275,120 +281,5 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! 
output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden b/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden index 18322c6cd..d993166a5 100644 --- a/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden +++ b/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden @@ -99,6 +99,12 @@ write_files: systemctl enable --now docker systemctl enable kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token my-token \ @@ -275,120 +281,5 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. 
When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden b/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden index e16e27348..c983a53e1 100644 --- a/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden +++ b/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden @@ -99,6 +99,12 @@ write_files: systemctl enable --now docker systemctl enable kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! 
[[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token my-token \ @@ -275,120 +281,5 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! 
output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden b/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden index 5c54c048c..eb98d17a5 100644 --- a/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden +++ b/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden @@ -101,6 +101,12 @@ write_files: systemctl enable --now docker systemctl enable kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! [[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token my-token \ @@ -277,120 +283,5 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. 
When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service diff --git a/pkg/userdata/ubuntu/userdata.go b/pkg/userdata/ubuntu/userdata.go index 5eb3c3b90..ac118fc3e 100644 --- a/pkg/userdata/ubuntu/userdata.go +++ b/pkg/userdata/ubuntu/userdata.go @@ -246,6 +246,12 @@ write_files: systemctl enable --now docker systemctl enable kubelet + if [[ ! -x /usr/local/bin/health-monitor.sh ]]; then + curl -Lfo /usr/local/bin/health-monitor.sh \ + https://raw.githubusercontent.com/kubermatic/machine-controller/8b5b66e4910a6228dfaecccaa0a3b05ec4902f8e/pkg/userdata/scripts/health-monitor.sh + chmod +x /usr/local/bin/health-monitor.sh + fi + if ! 
[[ -e /etc/kubernetes/pki/ca.crt ]]; then kubeadm join \ --token {{ .BoostrapToken }} \ @@ -428,121 +434,6 @@ write_files: [Install] WantedBy=multi-user.target -- path: /usr/local/bin/health-monitor.sh - permissions: "0755" - # This script is a slightly adjusted version of - # https://github.com/kubernetes/kubernetes/blob/e1a1aa211224fcd9b213420b80b2ae680669683d/cluster/gce/gci/health-monitor.sh - # Adjustments are: - # * Kubelet health port is 10248 not 10255 - # * Removal of all all references to the KUBE_ENV file - content: | - #!/usr/bin/env bash - - # Copyright 2016 The Kubernetes Authors. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - - # This script is for master and node instance health monitoring, which is - # packed in kube-manifest tarball. It is executed through a systemd service - # in cluster/gce/gci/.yaml. The env variables come from an env - # file provided by the systemd service. - - set -o nounset - set -o pipefail - - # We simply kill the process when there is a failure. Another systemd service will - # automatically restart the process. - function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" - # We still need to use 'docker ps' when container runtime is "docker". This is because - # dockershim is still part of kubelet today. When kubelet is down, crictl pods - # will also fail, and docker will be killed. This is undesirable especially when - # docker live restore is disabled. - local healthcheck_command="docker ps" - if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then - healthcheck_command="${crictl} pods" - fi - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 ${healthcheck_command} > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 ${healthcheck_command} > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - if [[ "$container_runtime_name" == "docker" ]]; then - # Dump stack of docker daemon for investigation. - # Log fle name looks like goroutine-stacks-TIMESTAMP and will be saved to - # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. - pkill -SIGUSR1 dockerd - fi - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. 
- sleep 120 - local -r max_seconds=10 - local output="" - while [ 1 ]; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo $output - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done - } - - - ############## Main Function ################ - if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 - fi - - KUBE_HOME="/home/kubernetes" - - SLEEP_SECONDS=10 - component=$1 - echo "Start kubernetes health monitoring for ${component}" - if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring - elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring - else - echo "Health monitoring for component "${component}" is not supported!" - fi - runcmd: - systemctl enable --now setup.service ` From 6928f78b381d9f8ee97fc2af4e2398f49a350807 Mon Sep 17 00:00:00 2001 From: Alvaro Aleman Date: Fri, 12 Oct 2018 19:16:10 +0200 Subject: [PATCH 4/4] Require docker/kubelet to be running for the respective healthcheck --- pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden | 8 ++++---- pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden | 8 ++++---- pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden | 8 ++++---- pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden | 8 ++++---- pkg/userdata/centos/userdata.go | 8 ++++---- .../auto-update-openstack-kubelet-v-version-prefix.golden | 4 ++-- .../v1.10.3-auto-update-openstack-multiple-dns.golden | 4 ++-- .../testdata/v1.11.2-vsphere-static-ipconfig.golden | 4 ++-- .../testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden | 4 ++-- .../coreos/testdata/v1.9.2-disable-auto-update-aws.golden | 4 ++-- pkg/userdata/coreos/userdata.go | 8 ++++---- pkg/userdata/ubuntu/testdata/1.11-aws.golden | 8 ++++---- .../ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden | 8 ++++---- .../ubuntu/testdata/1.9.2-openstack-multiple-dns.golden | 8 ++++---- .../testdata/openstack-kubelet-v-version-prefix.golden | 8 ++++---- .../testdata/openstack-overwrite-cloud-config.golden | 8 ++++---- pkg/userdata/ubuntu/userdata.go | 8 ++++---- 17 files changed, 58 insertions(+), 58 deletions(-) diff --git a/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden index 1d0331599..752fe597b 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.10-aws.golden @@ -133,8 +133,8 @@ write_files: permissions: "0644" content: | [Unit] - Requires=setup.service - After=setup.service + Requires=kubelet.service + After=kubelet.service [Service] ExecStart=/usr/local/bin/health-monitor.sh kubelet @@ -146,8 +146,8 @@ write_files: permissions: "0644" content: | [Unit] - Requires=setup.service - After=setup.service + Requires=docker.service + After=docker.service [Service] ExecStart=/usr/local/bin/health-monitor.sh container-runtime diff --git a/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden index dfffb6818..15b16ac3f 100644 --- a/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden +++ b/pkg/userdata/centos/testdata/kubelet-v1.11-aws.golden @@ -129,8 +129,8 @@ write_files: permissions: "0644" content: | [Unit] - Requires=setup.service - After=setup.service + Requires=kubelet.service + After=kubelet.service [Service] ExecStart=/usr/local/bin/health-monitor.sh kubelet @@ 
@@ -142,8 +142,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden
index 1728ae3bd..137e1db24 100644
--- a/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden
+++ b/pkg/userdata/centos/testdata/kubelet-v1.12-aws.golden
@@ -129,8 +129,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=kubelet.service
+    After=kubelet.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh kubelet
@@ -142,8 +142,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden b/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden
index b99c0cac2..71c578729 100644
--- a/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden
+++ b/pkg/userdata/centos/testdata/kubelet-v1.9-aws.golden
@@ -133,8 +133,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=kubelet.service
+    After=kubelet.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh kubelet
@@ -146,8 +146,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/centos/userdata.go b/pkg/userdata/centos/userdata.go
index 2ba37678c..f6e3ca2b1 100644
--- a/pkg/userdata/centos/userdata.go
+++ b/pkg/userdata/centos/userdata.go
@@ -282,8 +282,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=kubelet.service
+    After=kubelet.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh kubelet
@@ -295,8 +295,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden b/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden
index 83c7e594f..3c638dd59 100644
--- a/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden
+++ b/pkg/userdata/coreos/testdata/auto-update-openstack-kubelet-v-version-prefix.golden
@@ -179,12 +179,12 @@
         "name": "download-healthcheck-script.service"
       },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service kubelet.service\nAfter=download-healthcheck-script.service kubelet.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "kubelet-healthcheck.service"
       },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service docker.service\nAfter=download-healthcheck-script.service docker.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "docker-healthcheck.service"
       },
diff --git a/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden b/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden
index 844ffc0b9..08bf7e004 100644
--- a/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden
+++ b/pkg/userdata/coreos/testdata/v1.10.3-auto-update-openstack-multiple-dns.golden
@@ -179,12 +179,12 @@
         "name": "download-healthcheck-script.service"
       },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service kubelet.service\nAfter=download-healthcheck-script.service kubelet.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "kubelet-healthcheck.service"
      },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service docker.service\nAfter=download-healthcheck-script.service docker.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "docker-healthcheck.service"
       },
diff --git a/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden b/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden
index 196959cdc..dd2e2ae85 100644
--- a/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden
+++ b/pkg/userdata/coreos/testdata/v1.11.2-vsphere-static-ipconfig.golden
@@ -194,12 +194,12 @@
         "name": "download-healthcheck-script.service"
       },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service kubelet.service\nAfter=download-healthcheck-script.service kubelet.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "kubelet-healthcheck.service"
       },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service docker.service\nAfter=download-healthcheck-script.service docker.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "docker-healthcheck.service"
       },
diff --git a/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden b/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden
index ddba5d93d..3c36cf346 100644
--- a/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden
+++ b/pkg/userdata/coreos/testdata/v1.12.0-vsphere-overwrite-cloudconfig.golden
@@ -183,12 +183,12 @@
         "name": "download-healthcheck-script.service"
       },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service kubelet.service\nAfter=download-healthcheck-script.service kubelet.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "kubelet-healthcheck.service"
       },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service docker.service\nAfter=download-healthcheck-script.service docker.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "docker-healthcheck.service"
       },
diff --git a/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden b/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden
index 1531cd6ea..af2cea382 100644
--- a/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden
+++ b/pkg/userdata/coreos/testdata/v1.9.2-disable-auto-update-aws.golden
@@ -187,12 +187,12 @@
         "name": "download-healthcheck-script.service"
       },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service kubelet.service\nAfter=download-healthcheck-script.service kubelet.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh kubelet\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "kubelet-healthcheck.service"
       },
       {
-        "contents": "[Unit]\nRequires=download-healthcheck-script.service\nAfter=download-healthcheck-script.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
+        "contents": "[Unit]\nRequires=download-healthcheck-script.service docker.service\nAfter=download-healthcheck-script.service docker.service\n\n[Service]\nExecStart=/opt/bin/health-monitor.sh container-runtime\n\n[Install]\nWantedBy=multi-user.target\n",
         "enabled": true,
         "name": "docker-healthcheck.service"
       },
diff --git a/pkg/userdata/coreos/userdata.go b/pkg/userdata/coreos/userdata.go
index dbadfd80a..305afb277 100644
--- a/pkg/userdata/coreos/userdata.go
+++ b/pkg/userdata/coreos/userdata.go
@@ -192,8 +192,8 @@ systemd:
     enabled: true
     contents: |
       [Unit]
-      Requires=download-healthcheck-script.service
-      After=download-healthcheck-script.service
+      Requires=download-healthcheck-script.service kubelet.service
+      After=download-healthcheck-script.service kubelet.service
 
       [Service]
       ExecStart=/opt/bin/health-monitor.sh kubelet
@@ -205,8 +205,8 @@ systemd:
     enabled: true
     contents: |
       [Unit]
-      Requires=download-healthcheck-script.service
-      After=download-healthcheck-script.service
+      Requires=download-healthcheck-script.service docker.service
+      After=download-healthcheck-script.service docker.service
 
       [Service]
       ExecStart=/opt/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/ubuntu/testdata/1.11-aws.golden b/pkg/userdata/ubuntu/testdata/1.11-aws.golden
index 97a806db5..9ddfebc9c 100644
--- a/pkg/userdata/ubuntu/testdata/1.11-aws.golden
+++ b/pkg/userdata/ubuntu/testdata/1.11-aws.golden
@@ -251,8 +251,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=kubelet.service
+    After=kubelet.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh kubelet
@@ -264,8 +264,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden b/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden
index 5f89bbf3d..ff0984858 100644
--- a/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden
+++ b/pkg/userdata/ubuntu/testdata/1.9.2-dist-upgrade-on-boot-aws.golden
@@ -259,8 +259,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=kubelet.service
+    After=kubelet.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh kubelet
@@ -272,8 +272,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden b/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden
index d993166a5..b5b2948e5 100644
--- a/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden
+++ b/pkg/userdata/ubuntu/testdata/1.9.2-openstack-multiple-dns.golden
@@ -259,8 +259,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=kubelet.service
+    After=kubelet.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh kubelet
@@ -272,8 +272,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden b/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden
index c983a53e1..7e98523a2 100644
--- a/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden
+++ b/pkg/userdata/ubuntu/testdata/openstack-kubelet-v-version-prefix.golden
@@ -259,8 +259,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=kubelet.service
+    After=kubelet.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh kubelet
@@ -272,8 +272,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden b/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden
index eb98d17a5..48d607674 100644
--- a/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden
+++ b/pkg/userdata/ubuntu/testdata/openstack-overwrite-cloud-config.golden
@@ -261,8 +261,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=kubelet.service
+    After=kubelet.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh kubelet
@@ -274,8 +274,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
diff --git a/pkg/userdata/ubuntu/userdata.go b/pkg/userdata/ubuntu/userdata.go
index ac118fc3e..9be9418de 100644
--- a/pkg/userdata/ubuntu/userdata.go
+++ b/pkg/userdata/ubuntu/userdata.go
@@ -412,8 +412,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=kubelet.service
+    After=kubelet.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh kubelet
@@ -425,8 +425,8 @@ write_files:
   permissions: "0644"
   content: |
     [Unit]
-    Requires=setup.service
-    After=setup.service
+    Requires=docker.service
+    After=docker.service
 
     [Service]
     ExecStart=/usr/local/bin/health-monitor.sh container-runtime
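
With Requires= and After= now pointing at the monitored service, systemd starts kubelet-healthcheck.service only once kubelet.service is active (and docker-healthcheck.service only once docker.service is active), and stops the healthcheck again when that service goes away. A quick way to sanity-check the resulting wiring on a provisioned node is sketched below; the unit names and the kubelet healthz port come from the patch, while the commands themselves are only an illustrative assumption and not part of the change:

    #!/usr/bin/env bash
    # Sketch: verify the healthcheck units depend on the services they monitor.
    set -euo pipefail

    # Each healthcheck should list its monitored service among its dependencies.
    systemctl list-dependencies kubelet-healthcheck.service | grep kubelet.service
    systemctl list-dependencies docker-healthcheck.service | grep docker.service

    # The kubelet health endpoint polled by health-monitor.sh (port 10248 per the patch).
    curl -m 10 -f -s -S http://127.0.0.1:10248/healthz && echo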