Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automated cherry pick of #38814 #39079 release 1.5 #39137

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
25 changes: 19 additions & 6 deletions cluster/validate-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,13 @@ ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
CLUSTER_READY_ADDITIONAL_TIME_SECONDS="${CLUSTER_READY_ADDITIONAL_TIME_SECONDS:-30}"

EXPECTED_NUM_NODES="${NUM_NODES}"

# For the gce provider, optionally recompute EXPECTED_NUM_NODES from the
# instances that exist rather than keeping its previously assigned value.
if [[ "${KUBERNETES_PROVIDER:-}" == "gce" ]]; then
echo "Validating gce cluster, MULTIZONE=${MULTIZONE:-}"
# In multizone mode we need to add instances for all nodes in the region.
if [[ "${MULTIZONE:-}" == "true" ]]; then
# The inner gcloud call lists the zone names for REGION; tr joins them with
# commas and sed strips the trailing comma so the result can be passed to
# --zones. The outer call lists instances whose name matches
# NODE_INSTANCE_PREFIX in those zones, and wc -l counts the output lines.
EXPECTED_NUM_NODES=$(gcloud compute instances list --format=[no-heading] --regexp="${NODE_INSTANCE_PREFIX}.*" \
--zones=$(gcloud compute zones list --filter=region=${REGION} --format=[no-heading]\(name\) | tr "\n" "," | sed "s/,$//") | wc -l)
EXPECTED_NUM_NODES=$(gcloud -q compute instances list --project="${PROJECT}" --format=[no-heading] --regexp="${NODE_INSTANCE_PREFIX}.*" \
--zones=$(gcloud -q compute zones list --project="${PROJECT}" --filter=region=${REGION} --format=[no-heading]\(name\) | tr "\n" "," | sed "s/,$//") | wc -l)
# Log the inputs and the computed count.
echo "Computing number of nodes, NODE_INSTANCE_PREFIX=${NODE_INSTANCE_PREFIX}, REGION=${REGION}, EXPECTED_NUM_NODES=${EXPECTED_NUM_NODES}"
fi
fi

Expand All @@ -73,7 +74,9 @@ REQUIRED_NUM_NODES=$((EXPECTED_NUM_NODES - ALLOWED_NOTREADY_NODES))
# Make several attempts to deal with slow cluster birth.
return_value=0
attempt=0
# Set the timeout to ~25 minutes (100 x 15 seconds) to avoid timeouts for 1000-node clusters.
PAUSE_BETWEEN_ITERATIONS_SECONDS=15
MAX_ATTEMPTS=100
ADDITIONAL_ITERATIONS=$(((CLUSTER_READY_ADDITIONAL_TIME_SECONDS + PAUSE_BETWEEN_ITERATIONS_SECONDS - 1)/PAUSE_BETWEEN_ITERATIONS_SECONDS))
while true; do
# Pause between iterations of this large outer loop.
Expand All @@ -91,7 +94,18 @@ while true; do
# Suppress errors from kubectl output because during cluster bootstrapping
# for clusters where the master node is registered, the apiserver will become
# available and then get restarted as the kubelet configures the docker bridge.
node=$(kubectl_retry get nodes) || continue
#
# The exit status of "kubectl_retry get nodes" is captured in the res variable
# this way so that a failure does not stop the whole script.
node=$(kubectl_retry get nodes) && res="$?" || res="$?"
if [ "${res}" -ne "0" ]; then
if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
echo -e "${color_red} Failed to get nodes.${color_norm}"
exit 1
else
continue
fi
fi
found=$(($(echo "${node}" | wc -l) - 1))
ready=$(($(echo "${node}" | grep -v "NotReady" | wc -l ) - 1))

Expand All @@ -110,8 +124,7 @@ while true; do
echo -e "${color_green}Found ${REQUIRED_NUM_NODES} Nodes, allowing additional ${ADDITIONAL_ITERATIONS} iterations for other Nodes to join.${color_norm}"
last_run="${last_run:-$((attempt + ADDITIONAL_ITERATIONS - 1))}"
fi
# Set the timeout to ~25minutes (100 x 15 second) to avoid timeouts for 1000-node clusters.
if [[ "${attempt}" -gt "${last_run:-100}" ]]; then
if [[ "${attempt}" -gt "${last_run:-$MAX_ATTEMPTS}" ]]; then
echo -e "${color_yellow}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
kubectl_retry get nodes
if [[ "${REQUIRED_NUM_NODES}" -gt "${ready}" ]]; then
Expand Down