Allow some NotReady nodes in 1000 node clusters #20407

Merged
1 commit merged on Feb 3, 2016
22 changes: 20 additions & 2 deletions cluster/validate-cluster.sh
@@ -15,6 +15,10 @@
 # limitations under the License.

 # Validates that the cluster is healthy.
+# Error codes are:
+# 0 - success
+# 1 - fatal (cluster is unlikely to work)
+# 2 - non-fatal (encountered some errors, but cluster should be working correctly)

 set -o errexit
 set -o nounset
@@ -29,11 +33,14 @@ fi
 source "${KUBE_ROOT}/cluster/kube-env.sh"
 source "${KUBE_ROOT}/cluster/kube-util.sh"

+ALLOWED_NOTREADY_NODES="${ALLOWED_NOTREADY_NODES:-0}"
+
 EXPECTED_NUM_NODES="${NUM_NODES}"
 if [[ "${REGISTER_MASTER_KUBELET:-}" == "true" ]]; then
   EXPECTED_NUM_NODES=$((EXPECTED_NUM_NODES+1))
 fi
 # Make several attempts to deal with slow cluster birth.
+return_value=0
 attempt=0
 while true; do
   # The "kubectl get nodes -o template" exports node information.
@@ -59,7 +66,12 @@ while true; do
     if (( attempt > 100 )); then
       echo -e "${color_red}Detected ${ready} ready nodes, found ${found} nodes out of expected ${EXPECTED_NUM_NODES}. Your cluster may not be fully functional.${color_norm}"
       "${KUBE_ROOT}/cluster/kubectl.sh" get nodes
-      exit 2
+      if [ "$((${EXPECTED_NUM_NODES} - ${found}))" -gt "${ALLOWED_NOTREADY_NODES}" ]; then
+        exit 1
+      else
+        return_value=2
+        break
+      fi
     else
       echo -e "${color_yellow}Waiting for ${EXPECTED_NUM_NODES} ready nodes. ${ready} ready nodes, ${found} registered. Retrying.${color_norm}"
     fi
@@ -99,4 +111,10 @@ done

 echo "Validate output:"
 "${KUBE_ROOT}/cluster/kubectl.sh" get cs
-echo -e "${color_green}Cluster validation succeeded${color_norm}"
+if [ "${return_value}" == "0" ]; then
+  echo -e "${color_green}Cluster validation succeeded${color_norm}"
+else
+  echo -e "${color_yellow}Cluster validation encountered some problems, but cluster should be in working order${color_norm}"
+fi
+
+exit "${return_value}"
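
For reference, a minimal sketch of how a caller could consume the new knob and exit-code contract. The wrapper below is hypothetical; only ALLOWED_NOTREADY_NODES and the 0/1/2 exit codes come from the change above.

#!/usr/bin/env bash
# Hypothetical caller of cluster/validate-cluster.sh (not part of this PR).
# Exit codes per the script above: 0 = fully ready, 2 = non-fatal, 1 = fatal.
set +o errexit                 # capture the exit code instead of aborting
ALLOWED_NOTREADY_NODES=2 ./cluster/validate-cluster.sh
rc=$?
set -o errexit

case "${rc}" in
  0) echo "cluster fully ready" ;;
  2) echo "cluster usable, but some nodes are missing or NotReady (within the allowed budget)" ;;
  *) echo "cluster validation failed (exit code ${rc})" >&2; exit "${rc}" ;;
esac
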
28 changes: 28 additions & 0 deletions hack/jenkins/e2e.sh
@@ -508,6 +508,7 @@ case ${JOB_NAME} in

   # Runs the performance/scalability test on huge 1000-node cluster on GCE.
   # Flannel is used as network provider.
+  # Allows a couple of nodes to be NotReady during startup
   kubernetes-e2e-gce-enormous-cluster)
     : ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-cluster"}
     : ${E2E_NETWORK:="e2e-enormous-cluster"}
@@ -526,6 +527,32 @@
     NODE_SIZE="n1-standard-1"
     NODE_DISK_SIZE="50GB"
     NUM_NODES="1000"
+    ALLOWED_NOTREADY_NODES="2"
+    # Reduce logs verbosity
+    TEST_CLUSTER_LOG_LEVEL="--v=1"
+    # Increase resync period to simulate production
+    TEST_CLUSTER_RESYNC_PERIOD="--min-resync-period=12h"
+    ;;
+
+  # Starts and tears down 1000-node cluster on GCE using flannel networking
+  # Requires all 1000 nodes to come up.
+  kubernetes-e2e-gce-enormous-startup)
+    : ${E2E_CLUSTER_NAME:="jenkins-gce-enormous-startup"}
+    # TODO: increase a quota for networks in kubernetes-scale and move this test to its own network
+    : ${E2E_NETWORK:="e2e-enormous-cluster"}
+    : ${E2E_TEST:="false"}
+    : ${KUBE_GCE_INSTANCE_PREFIX:="e2e-enormous-startup"}
+    : ${PROJECT:="kubernetes-scale"}
+    # Override GCE defaults.
+    NETWORK_PROVIDER="flannel"
+    # Temporarily switch off Heapster, as this will not schedule anywhere.
+    # TODO: Think of a solution to enable it.
+    ENABLE_CLUSTER_MONITORING="none"
+    E2E_ZONE="asia-east1-a"
+    MASTER_SIZE="n1-standard-32"
+    NODE_SIZE="n1-standard-1"
+    NODE_DISK_SIZE="50GB"
+    NUM_NODES="1000"
     # Reduce logs verbosity
     TEST_CLUSTER_LOG_LEVEL="--v=1"
     # Increase resync period to simulate production
@@ -900,6 +927,7 @@ export KUBE_GCE_NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-}
 export KUBE_OS_DISTRIBUTION=${KUBE_OS_DISTRIBUTION:-}
 export GCE_SERVICE_ACCOUNT=$(gcloud auth list 2> /dev/null | grep active | cut -f3 -d' ')
 export FAIL_ON_GCP_RESOURCE_LEAK="${FAIL_ON_GCP_RESOURCE_LEAK:-false}"
+export ALLOWED_NOTREADY_NODES=${ALLOWED_NOTREADY_NODES:-}

 # GKE variables
 export CLUSTER_NAME=${E2E_CLUSTER_NAME}
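
Taken together: a Jenkins job opts into the tolerance by setting ALLOWED_NOTREADY_NODES in its case branch, hack/jenkins/e2e.sh exports it (see the hunk above), and cluster/validate-cluster.sh treats up to that many missing or NotReady nodes as non-fatal. A rough, hypothetical illustration of the job selection follows; the real harness is driven by Jenkins, which sets JOB_NAME and far more environment than shown here.

# Hypothetical local illustration (not part of this PR): the "case ${JOB_NAME} in"
# block above switches on this value; the enormous-cluster branch ends up exporting
# ALLOWED_NOTREADY_NODES=2, which lets validate-cluster.sh downgrade a missing-node
# failure to the non-fatal exit code 2 instead of failing the run.
export JOB_NAME="kubernetes-e2e-gce-enormous-cluster"
./hack/jenkins/e2e.sh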