Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a timeout for GCE cluster initialization #15209

Merged
merged 2 commits into from Oct 9, 2015
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
38 changes: 22 additions & 16 deletions cluster/gce/util.sh
Expand Up @@ -39,6 +39,8 @@ KUBE_SKIP_UPDATE=${KUBE_SKIP_UPDATE-"n"}
# multiple versions of the server are being used in the same project
# simultaneously (e.g. on Jenkins).
KUBE_GCS_STAGING_PATH_SUFFIX=${KUBE_GCS_STAGING_PATH_SUFFIX-""}
# How long (in seconds) to wait for cluster initialization.
KUBE_CLUSTER_INITIALIZATION_TIMEOUT=${KUBE_CLUSTER_INITIALIZATION_TIMEOUT:-300}

# VERSION_REGEX matches things like "v0.13.1"
readonly KUBE_VERSION_REGEX="^v(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)\\.(0|[1-9][0-9]*)$"
Expand All @@ -59,7 +61,7 @@ function verify-prereqs {
else
# TODO: add checking if RUNTIME_CONFIG contains "experimental/v1alpha1=false" and appending "experimental/v1alpha1=true" if not.
if echo "${RUNTIME_CONFIG}" | grep -q -v "experimental/v1alpha1=true"; then
echo "Experimental API should be turned on, but is not turned on in RUNTIME_CONFIG!"
echo "Experimental API should be turned on, but is not turned on in RUNTIME_CONFIG!" >&2
exit 1
fi
fi
Expand All @@ -79,8 +81,8 @@ function verify-prereqs {
curl https://sdk.cloud.google.com | bash
fi
if ! which "${cmd}" >/dev/null; then
echo "Can't find ${cmd} in PATH, please fix and retry. The Google Cloud "
echo "SDK can be downloaded from https://cloud.google.com/sdk/."
echo "Can't find ${cmd} in PATH, please fix and retry. The Google Cloud " >&2
echo "SDK can be downloaded from https://cloud.google.com/sdk/." >&2
exit 1
fi
fi
Expand Down Expand Up @@ -123,7 +125,7 @@ function find-release-tars {
SERVER_BINARY_TAR="${KUBE_ROOT}/_output/release-tars/kubernetes-server-linux-amd64.tar.gz"
fi
if [[ ! -f "$SERVER_BINARY_TAR" ]]; then
echo "!!! Cannot find kubernetes-server-linux-amd64.tar.gz"
echo "!!! Cannot find kubernetes-server-linux-amd64.tar.gz" >&2
exit 1
fi

Expand All @@ -132,7 +134,7 @@ function find-release-tars {
SALT_TAR="${KUBE_ROOT}/_output/release-tars/kubernetes-salt.tar.gz"
fi
if [[ ! -f "$SALT_TAR" ]]; then
echo "!!! Cannot find kubernetes-salt.tar.gz"
echo "!!! Cannot find kubernetes-salt.tar.gz" >&2
exit 1
fi
}
Expand Down Expand Up @@ -346,10 +348,10 @@ function create-firewall-rule {
--target-tags "$3" \
--allow tcp,udp,icmp,esp,ah,sctp; then
if (( attempt > 5 )); then
echo -e "${color_red}Failed to create firewall rule $1 ${color_norm}"
echo -e "${color_red}Failed to create firewall rule $1 ${color_norm}" >&2
exit 2
fi
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to create firewall rule $1. Retrying.${color_norm}"
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to create firewall rule $1. Retrying.${color_norm}" >&2
attempt=$(($attempt+1))
else
break
Expand Down Expand Up @@ -431,10 +433,10 @@ function add-instance-metadata {
--zone "${ZONE}" \
--metadata "${kvs[@]}"; then
if (( attempt > 5 )); then
echo -e "${color_red}Failed to add instance metadata in ${instance} ${color_norm}"
echo -e "${color_red}Failed to add instance metadata in ${instance} ${color_norm}" >&2
exit 2
fi
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to add metadata in ${instance}. Retrying.${color_norm}"
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to add metadata in ${instance}. Retrying.${color_norm}" >&2
attempt=$(($attempt+1))
else
break
Expand All @@ -458,10 +460,10 @@ function add-instance-metadata-from-file {
--zone "${ZONE}" \
--metadata-from-file "$(join_csv ${kvs[@]})"; then
if (( attempt > 5 )); then
echo -e "${color_red}Failed to add instance metadata in ${instance} ${color_norm}"
echo -e "${color_red}Failed to add instance metadata in ${instance} ${color_norm}" >&2
exit 2
fi
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to add metadata in ${instance}. Retrying.${color_norm}"
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to add metadata in ${instance}. Retrying.${color_norm}" >&2
attempt=$(($attempt+1))
else
break
Expand Down Expand Up @@ -541,7 +543,7 @@ function create-certs {
./easyrsa build-client-full kubecfg nopass > /dev/null 2>&1) || {
# If there was an error in the subshell, just die.
# TODO(roberthbailey): add better error handling here
echo "=== Failed to generate certificates: Aborting ==="
echo "=== Failed to generate certificates: Aborting ===" >&2
exit 2
}
CERT_DIR="${KUBE_TEMP}/easy-rsa-master/easyrsa3"
Expand Down Expand Up @@ -721,11 +723,10 @@ function kube-up {
--min-num-replicas "${AUTOSCALER_MIN_NODES}" --max-num-replicas "${AUTOSCALER_MAX_NODES}" ${METRICS} || true
fi

echo "Waiting for cluster initialization."
echo "Waiting up to ${KUBE_CLUSTER_INITIALIZATION_TIMEOUT} seconds for cluster initialization."
echo
echo " This will continually check to see if the API for kubernetes is reachable."
echo " This might loop forever if there was some uncaught error during start"
echo " up."
echo " This may time out if there was some uncaught error during start up."
echo

# curl in mavericks is borked.
Expand All @@ -736,12 +737,17 @@ function kube-up {
fi
fi


local start_time=$(date +%s)
until curl --cacert "${CERT_DIR}/pki/ca.crt" \
-H "Authorization: Bearer ${KUBE_BEARER_TOKEN}" \
${secure} \
--max-time 5 --fail --output /dev/null --silent \
"https://${KUBE_MASTER_IP}/api/v1/pods"; do
local elapsed=$(($(date +%s) - ${start_time}))
if [[ ${elapsed} -gt ${KUBE_CLUSTER_INITIALIZATION_TIMEOUT} ]]; then
echo -e "${color_red}Cluster failed to initialize within ${KUBE_CLUSTER_INITIALIZATION_TIMEOUT} seconds.${color_norm}" >&2
exit 2
fi
printf "."
sleep 2
done
Expand Down