Skip to content
This repository was archived by the owner on Feb 8, 2021. It is now read-only.

Commit b9aa710

Browse files
committed
Merge pull request kubernetes#18436 from gmarek/1000-kube-up
Allow creation of clusters larger than 500 nodes
2 parents 1b70a40 + 0c61269 commit b9aa710

File tree

2 files changed

+111
-57
lines changed

2 files changed

+111
-57
lines changed

cluster/gce/upgrade.sh

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ function upgrade-nodes() {
174174
#
175175
# Vars set:
176176
# SANITIZED_VERSION
177+
# INSTANCE_GROUPS
177178
# KUBELET_TOKEN
178179
# KUBE_PROXY_TOKEN
179180
# CA_CERT_BASE64
@@ -184,7 +185,7 @@ function prepare-node-upgrade() {
184185
echo "== Preparing node upgrade (to ${KUBE_VERSION}). ==" >&2
185186
SANITIZED_VERSION=$(echo ${KUBE_VERSION} | sed 's/[\.\+]/-/g')
186187

187-
detect-node-names
188+
detect-node-names # sets INSTANCE_GROUPS
188189

189190
# TODO(zmerlynn): Refactor setting scope flags.
190191
local scope_flags=
@@ -231,16 +232,18 @@ function do-node-upgrade() {
231232
subgroup="alpha compute"
232233
fi
233234
local template_name=$(get-template-name-from-version ${SANITIZED_VERSION})
234-
gcloud ${subgroup} rolling-updates \
235-
--project="${PROJECT}" \
236-
--zone="${ZONE}" \
237-
start \
238-
--group="${NODE_INSTANCE_PREFIX}-group" \
239-
--template="${template_name}" \
240-
--instance-startup-timeout=300s \
241-
--max-num-concurrent-instances=1 \
242-
--max-num-failed-instances=0 \
243-
--min-instance-update-time=0s
235+
for group in ${INSTANCE_GROUPS[@]}; do
236+
gcloud ${subgroup} rolling-updates \
237+
--project="${PROJECT}" \
238+
--zone="${ZONE}" \
239+
start \
240+
--group="${group}" \
241+
--template="${template_name}" \
242+
--instance-startup-timeout=300s \
243+
--max-num-concurrent-instances=1 \
244+
--max-num-failed-instances=0 \
245+
--min-instance-update-time=0s
246+
done
244247

245248
# TODO(zmerlynn): Wait for the rolling-update to finish.
246249

cluster/gce/util.sh

Lines changed: 97 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -253,12 +253,24 @@ function upload-server-tars() {
253253
# NODE_INSTANCE_PREFIX
254254
# Vars set:
255255
# NODE_NAMES
256+
# INSTANCE_GROUPS
256257
function detect-node-names {
257258
detect-project
258-
NODE_NAMES=($(gcloud compute instance-groups managed list-instances \
259-
"${NODE_INSTANCE_PREFIX}-group" --zone "${ZONE}" --project "${PROJECT}" \
260-
--format=yaml | grep instance: | cut -d ' ' -f 2))
261-
echo "NODE_NAMES=${NODE_NAMES[*]}" >&2
259+
INSTANCE_GROUPS=()
260+
INSTANCE_GROUPS+=($(gcloud compute instance-groups managed list --zone "${ZONE}" --project "${PROJECT}" | grep ${NODE_INSTANCE_PREFIX} | cut -f1 -d" "))
261+
NODE_NAMES=()
262+
if [[ -n "${INSTANCE_GROUPS[@]:-}" ]]; then
263+
for group in "${INSTANCE_GROUPS[@]}"; do
264+
NODE_NAMES+=($(gcloud compute instance-groups managed list-instances \
265+
"${group}" --zone "${ZONE}" --project "${PROJECT}" \
266+
--format=yaml | grep instance: | cut -d ' ' -f 2))
267+
done
268+
echo "INSTANCE_GROUPS=${INSTANCE_GROUPS[*]}" >&2
269+
echo "NODE_NAMES=${NODE_NAMES[*]}" >&2
270+
else
271+
echo "INSTANCE_GROUPS=" >&2
272+
echo "NODE_NAMES=" >&2
273+
fi
262274
}
263275

264276
# Detect the information about the minions
@@ -713,17 +725,43 @@ function kube-up {
713725

714726
create-node-instance-template $template_name
715727

728+
local defaulted_max_instances_per_mig=${MAX_INSTANCES_PER_MIG:-500}
729+
730+
if [[ ${defaulted_max_instances_per_mig} -le "0" ]]; then
731+
echo "MAX_INSTANCES_PER_MIG cannot be negative. Assuming default 500"
732+
defaulted_max_instances_per_mig=500
733+
fi
734+
local num_migs=$(((${NUM_NODES} + ${defaulted_max_instances_per_mig} - 1) / ${defaulted_max_instances_per_mig}))
735+
local instances_per_mig=$(((${NUM_NODES} + ${num_migs} - 1) / ${num_migs}))
736+
local last_mig_size=$((${NUM_NODES} - (${num_migs} - 1) * ${instances_per_mig}))
737+
738+
#TODO: parallelize this loop to speed up the process
739+
for i in $(seq $((${num_migs} - 1))); do
740+
gcloud compute instance-groups managed \
741+
create "${NODE_INSTANCE_PREFIX}-group-$i" \
742+
--project "${PROJECT}" \
743+
--zone "${ZONE}" \
744+
--base-instance-name "${NODE_INSTANCE_PREFIX}" \
745+
--size "${instances_per_mig}" \
746+
--template "$template_name" || true;
747+
gcloud compute instance-groups managed wait-until-stable \
748+
"${NODE_INSTANCE_PREFIX}-group-$i" \
749+
--zone "${ZONE}" \
750+
--project "${PROJECT}" || true;
751+
done
752+
716753
gcloud compute instance-groups managed \
717-
create "${NODE_INSTANCE_PREFIX}-group" \
754+
create "${NODE_INSTANCE_PREFIX}-group-${num_migs}" \
718755
--project "${PROJECT}" \
719756
--zone "${ZONE}" \
720757
--base-instance-name "${NODE_INSTANCE_PREFIX}" \
721-
--size "${NUM_NODES}" \
758+
--size "${last_mig_size}" \
722759
--template "$template_name" || true;
723760
gcloud compute instance-groups managed wait-until-stable \
724-
"${NODE_INSTANCE_PREFIX}-group" \
725-
--zone "${ZONE}" \
726-
--project "${PROJECT}" || true;
761+
"${NODE_INSTANCE_PREFIX}-group-${num_migs}" \
762+
--zone "${ZONE}" \
763+
--project "${PROJECT}" || true;
764+
727765
detect-node-names
728766
detect-master
729767

@@ -742,9 +780,12 @@ function kube-up {
742780
METRICS+="--custom-metric-utilization metric=custom.cloudmonitoring.googleapis.com/kubernetes.io/memory/node_reservation,"
743781
METRICS+="utilization-target=${TARGET_NODE_UTILIZATION},utilization-target-type=GAUGE "
744782

745-
echo "Creating node autoscaler."
746-
gcloud compute instance-groups managed set-autoscaling "${NODE_INSTANCE_PREFIX}-group" --zone "${ZONE}" --project "${PROJECT}" \
747-
--min-num-replicas "${AUTOSCALER_MIN_NODES}" --max-num-replicas "${AUTOSCALER_MAX_NODES}" ${METRICS} || true
783+
echo "Creating node autoscalers."
784+
785+
for i in $(seq ${num_migs}); do
786+
gcloud compute instance-groups managed set-autoscaling "${NODE_INSTANCE_PREFIX}-group-$i" --zone "${ZONE}" --project "${PROJECT}" \
787+
--min-num-replicas "${AUTOSCALER_MIN_NODES}" --max-num-replicas "${AUTOSCALER_MAX_NODES}" ${METRICS} || true
788+
done
748789
fi
749790

750791
echo "Waiting up to ${KUBE_CLUSTER_INITIALIZATION_TIMEOUT} seconds for cluster initialization."
@@ -810,46 +851,51 @@ function kube-up {
810851
# down the firewall rules and routes.
811852
function kube-down {
812853
detect-project
854+
detect-node-names # For INSTANCE_GROUPS
813855

814856
echo "Bringing down cluster"
815857
set +e # Do not stop on error
816858

817-
# Delete autoscaler for nodes if present.
859+
# Delete autoscaler for nodes if present. We assume that either all or none of the instance groups have an autoscaler.
818860
local autoscaler
819861
autoscaler=( $(gcloud compute instance-groups managed list --zone "${ZONE}" --project "${PROJECT}" \
820-
| grep "${NODE_INSTANCE_PREFIX}-group" \
862+
| grep "${NODE_INSTANCE_PREFIX}-group-1" \
821863
| awk '{print $7}') )
822864
if [[ "${autoscaler:-}" == "yes" ]]; then
823-
gcloud compute instance-groups managed stop-autoscaling "${NODE_INSTANCE_PREFIX}-group" --zone "${ZONE}" --project "${PROJECT}"
865+
for group in ${INSTANCE_GROUPS[@]}; do
866+
gcloud compute instance-groups managed stop-autoscaling "${group}" --zone "${ZONE}" --project "${PROJECT}"
867+
done
824868
fi
825869

826870
# Get the name of the managed instance group template before we delete the
827871
# managed instance group. (The name of the managed instance group template may
828872
# change during a cluster upgrade.)
829-
local template=$(get-template "${PROJECT}" "${ZONE}" "${NODE_INSTANCE_PREFIX}-group")
873+
local template=$(get-template "${PROJECT}" "${ZONE}" "${NODE_INSTANCE_PREFIX}-group-1")
830874

831875
# The gcloud APIs don't return machine parseable error codes/retry information. Therefore the best we can
832876
# do is parse the output and special case particular responses we are interested in.
833-
if gcloud compute instance-groups managed describe "${NODE_INSTANCE_PREFIX}-group" --project "${PROJECT}" --zone "${ZONE}" &>/dev/null; then
834-
deleteCmdOutput=$(gcloud compute instance-groups managed delete --zone "${ZONE}" \
835-
--project "${PROJECT}" \
836-
--quiet \
837-
"${NODE_INSTANCE_PREFIX}-group")
838-
if [[ "$deleteCmdOutput" != "" ]]; then
839-
# Managed instance group deletion is done asynchronously, we must wait for it to complete, or subsequent steps fail
840-
deleteCmdOperationId=$(echo $deleteCmdOutput | grep "Operation:" | sed "s/.*Operation:[[:space:]]*\([^[:space:]]*\).*/\1/g")
841-
if [[ "$deleteCmdOperationId" != "" ]]; then
842-
deleteCmdStatus="PENDING"
843-
while [[ "$deleteCmdStatus" != "DONE" ]]
844-
do
845-
sleep 5
846-
deleteCmdOperationOutput=$(gcloud compute instance-groups managed --zone "${ZONE}" --project "${PROJECT}" get-operation $deleteCmdOperationId)
847-
deleteCmdStatus=$(echo $deleteCmdOperationOutput | grep -i "status:" | sed "s/.*status:[[:space:]]*\([^[:space:]]*\).*/\1/g")
848-
echo "Waiting for MIG deletion to complete. Current status: " $deleteCmdStatus
849-
done
877+
for group in ${INSTANCE_GROUPS[@]}; do
878+
if gcloud compute instance-groups managed describe "${group}" --project "${PROJECT}" --zone "${ZONE}" &>/dev/null; then
879+
deleteCmdOutput=$(gcloud compute instance-groups managed delete --zone "${ZONE}" \
880+
--project "${PROJECT}" \
881+
--quiet \
882+
"${group}")
883+
if [[ "$deleteCmdOutput" != "" ]]; then
884+
# Managed instance group deletion is done asynchronously, we must wait for it to complete, or subsequent steps fail
885+
deleteCmdOperationId=$(echo $deleteCmdOutput | grep "Operation:" | sed "s/.*Operation:[[:space:]]*\([^[:space:]]*\).*/\1/g")
886+
if [[ "$deleteCmdOperationId" != "" ]]; then
887+
deleteCmdStatus="PENDING"
888+
while [[ "$deleteCmdStatus" != "DONE" ]]
889+
do
890+
sleep 5
891+
deleteCmdOperationOutput=$(gcloud compute instance-groups managed --zone "${ZONE}" --project "${PROJECT}" get-operation $deleteCmdOperationId)
892+
deleteCmdStatus=$(echo $deleteCmdOperationOutput | grep -i "status:" | sed "s/.*status:[[:space:]]*\([^[:space:]]*\).*/\1/g")
893+
echo "Waiting for MIG deletion to complete. Current status: " $deleteCmdStatus
894+
done
895+
fi
850896
fi
851897
fi
852-
fi
898+
done
853899

854900
if gcloud compute instance-templates describe --project "${PROJECT}" "${template}" &>/dev/null; then
855901
gcloud compute instance-templates delete \
@@ -982,12 +1028,13 @@ function get-template {
9821028
# KUBE_RESOURCE_FOUND
9831029
function check-resources {
9841030
detect-project
1031+
detect-node-names
9851032

9861033
echo "Looking for already existing resources"
9871034
KUBE_RESOURCE_FOUND=""
9881035

989-
if gcloud compute instance-groups managed describe --project "${PROJECT}" --zone "${ZONE}" "${NODE_INSTANCE_PREFIX}-group" &>/dev/null; then
990-
KUBE_RESOURCE_FOUND="Managed instance group ${NODE_INSTANCE_PREFIX}-group"
1036+
if [[ -n "${INSTANCE_GROUPS[@]:-}" ]]; then
1037+
KUBE_RESOURCE_FOUND="Managed instance groups ${INSTANCE_GROUPS[@]}"
9911038
return 1
9921039
fi
9931040

@@ -1090,11 +1137,13 @@ function prepare-push() {
10901137
create-node-instance-template $tmp_template_name
10911138

10921139
local template_name="${NODE_INSTANCE_PREFIX}-template"
1093-
gcloud compute instance-groups managed \
1094-
set-instance-template "${NODE_INSTANCE_PREFIX}-group" \
1095-
--template "$tmp_template_name" \
1096-
--zone "${ZONE}" \
1097-
--project "${PROJECT}" || true;
1140+
for group in ${INSTANCE_GROUPS[@]}; do
1141+
gcloud compute instance-groups managed \
1142+
set-instance-template "${group}" \
1143+
--template "$tmp_template_name" \
1144+
--zone "${ZONE}" \
1145+
--project "${PROJECT}" || true;
1146+
done
10981147

10991148
gcloud compute instance-templates delete \
11001149
--project "${PROJECT}" \
@@ -1103,11 +1152,13 @@ function prepare-push() {
11031152

11041153
create-node-instance-template "$template_name"
11051154

1106-
gcloud compute instance-groups managed \
1107-
set-instance-template "${NODE_INSTANCE_PREFIX}-group" \
1108-
--template "$template_name" \
1109-
--zone "${ZONE}" \
1110-
--project "${PROJECT}" || true;
1155+
for group in ${INSTANCE_GROUPS[@]}; do
1156+
gcloud compute instance-groups managed \
1157+
set-instance-template "${group}" \
1158+
--template "$template_name" \
1159+
--zone "${ZONE}" \
1160+
--project "${PROJECT}" || true;
1161+
done
11111162

11121163
gcloud compute instance-templates delete \
11131164
--project "${PROJECT}" \

0 commit comments

Comments
 (0)