Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Managed Instance Group for managing Nodes in GCE #4164

Merged
merged 1 commit into from
Feb 17, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 0 additions & 1 deletion cluster/gce/config-default.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ INSTANCE_PREFIX="${KUBE_GCE_INSTANCE_PREFIX:-kubernetes}"
MASTER_NAME="${INSTANCE_PREFIX}-master"
MASTER_TAG="${INSTANCE_PREFIX}-master"
MINION_TAG="${INSTANCE_PREFIX}-minion"
MINION_NAMES=($(eval echo ${INSTANCE_PREFIX}-minion-{1..${NUM_MINIONS}}))

# Compute IP addresses for nodes.
function increment_ipv4 {
Expand Down
1 change: 0 additions & 1 deletion cluster/gce/config-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ INSTANCE_PREFIX="${KUBE_GCE_INSTANCE_PREFIX:-e2e-test-${USER}}"
MASTER_NAME="${INSTANCE_PREFIX}-master"
MASTER_TAG="${INSTANCE_PREFIX}-master"
MINION_TAG="${INSTANCE_PREFIX}-minion"
MINION_NAMES=($(eval echo ${INSTANCE_PREFIX}-minion-{1..${NUM_MINIONS}}))
CLUSTER_IP_RANGE="10.245.0.0/16"
MINION_IP_RANGES=($(eval echo "10.245.{1..${NUM_MINIONS}}.0/24"))
MINION_SCOPES=("storage-ro" "compute-rw")
Expand Down
154 changes: 113 additions & 41 deletions cluster/gce/util.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
source "${KUBE_ROOT}/cluster/gce/${KUBE_CONFIG_FILE-"config-default.sh"}"

NODE_INSTANCE_PREFIX="${INSTANCE_PREFIX}-minion"

# Verify prereqs
function verify-prereqs {
local cmd
Expand Down Expand Up @@ -138,15 +140,48 @@ function upload-server-tars() {
SALT_TAR_URL="${salt_gs_url/gs:\/\//https://storage.googleapis.com/}"
}

# Detect the information about the minions
# Populate MINION_NAMES from the members of the minion instance group.
#
# Assumed vars:
#   NODE_INSTANCE_PREFIX
#   ZONE
# Vars set:
#   MINION_NAMES
function detect-minion-names {
  detect-project
  local minion_group="${NODE_INSTANCE_PREFIX}-group"
  # Every listed line is split on '/' and only its 11th field is kept
  # (assumes the listing consists of full instance URLs -- TODO confirm).
  MINION_NAMES=(
    $(gcloud preview --project "${PROJECT}" instance-groups --zone "${ZONE}" \
        instances --group "${minion_group}" list | cut -d'/' -f11)
  )
  printf 'MINION_NAMES=%s\n' "${MINION_NAMES[*]}"
}

# Waits until the number of running nodes in the instance group is equal to
# NUM_MINIONS, polling every 5 seconds.
#
# The counts are compared numerically rather than as strings: BSD/macOS
# `wc -l` pads its output with leading blanks, so a string comparison against
# NUM_MINIONS would never succeed there and the loop would never terminate.
#
# Assumed vars:
#   NODE_INSTANCE_PREFIX
#   NUM_MINIONS
#   ZONE
function wait-for-minions-to-run {
  detect-project
  local running_minions=0
  while (( running_minions != NUM_MINIONS )); do
    echo -e -n "${color_yellow}Waiting for minions to run. "
    echo -e "${running_minions} out of ${NUM_MINIONS} running. Retrying.${color_norm}"
    sleep 5
    running_minions=$(gcloud preview --project "${PROJECT}" instance-groups \
      --zone "${ZONE}" instances --group "${NODE_INSTANCE_PREFIX}-group" list \
      --running | wc -l)
    # Strip any padding emitted by wc so the progress message stays tidy.
    running_minions=$(( running_minions ))
  done
}

# Detect the information about the minions
#
# Assumed vars:
# ZONE
# Vars set:
# MINION_NAMES
# KUBE_MINION_IP_ADDRESSES (array)
function detect-minions () {
detect-project
detect-minion-names
KUBE_MINION_IP_ADDRESSES=()
for (( i=0; i<${#MINION_NAMES[@]}; i++)); do
local minion_ip=$(gcloud compute instances describe --project "${PROJECT}" --zone "${ZONE}" \
Expand Down Expand Up @@ -263,7 +298,7 @@ function create-firewall-rule {
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to create firewall rule $1. Retrying.${color_norm}"
attempt=$(($attempt+1))
else
break
break
fi
done
}
Expand All @@ -288,22 +323,21 @@ function create-route {
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to create route $1. Retrying.${color_norm}"
attempt=$(($attempt+1))
else
break
break
fi
done
}

# Robustly try to create an instance.
# $1: The name of the instance.
# Robustly try to create an instance template.
# $1: The name of the instance template.
# $2: The scopes flag.
# $3: The minion start script.
function create-minion {
# $3: The minion start script metadata from file.
function create-node-template {
detect-project
local attempt=0
while true; do
if ! gcloud compute instances create "$1" \
if ! gcloud compute instance-templates create "$1" \
--project "${PROJECT}" \
--zone "${ZONE}" \
--machine-type "${MINION_SIZE}" \
--boot-disk-type "${MINION_DISK_TYPE}" \
--boot-disk-size "${MINION_DISK_SIZE}" \
Expand All @@ -315,16 +349,36 @@ function create-minion {
--can-ip-forward \
--metadata-from-file "$3"; then
if (( attempt > 5 )); then
echo -e "${color_red}Failed to create instance $1 ${color_norm}"
echo -e "${color_red}Failed to create instance template $1 ${color_norm}"
exit 2
fi
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to create node $1. Retrying.${color_norm}"
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to create instance template $1. Retrying.${color_norm}"
attempt=$(($attempt+1))
# Attempt to delete the disk for this node (the disk may have been created even
# if the instance creation failed).
gcloud compute disks delete "$1" --project "${PROJECT}" --zone "${ZONE}" --quiet || true
else
break
else
break
fi
done
}

# Add one metadata entry to an instance, retrying while gcloud keeps failing.
# $1: The name of the instance.
# $2: The metadata key=value pair to add.
# Exits the shell with status 2 once the retries are exhausted.
function add-instance-metadata {
  detect-project
  local failures=0
  until gcloud compute instances add-metadata "$1" \
      --project "${PROJECT}" \
      --zone "${ZONE}" \
      --metadata "$2"; do
    # Six failures have already been reported at this point; the seventh is fatal.
    if (( failures > 5 )); then
      echo -e "${color_red}Failed to add instance metadata in $1 ${color_norm}"
      exit 2
    fi
    failures=$((failures + 1))
    echo -e "${color_yellow}Attempt ${failures} failed to add metadata in $1. Retrying.${color_norm}"
  done
}
Expand Down Expand Up @@ -384,7 +438,7 @@ function kube-up {
echo "mkdir -p /var/cache/kubernetes-install"
echo "cd /var/cache/kubernetes-install"
echo "readonly MASTER_NAME='${MASTER_NAME}'"
echo "readonly NODE_INSTANCE_PREFIX='${INSTANCE_PREFIX}-minion'"
echo "readonly NODE_INSTANCE_PREFIX='${NODE_INSTANCE_PREFIX}'"
echo "readonly SERVER_BINARY_TAR_URL='${SERVER_BINARY_TAR_URL}'"
echo "readonly SALT_TAR_URL='${SALT_TAR_URL}'"
echo "readonly MASTER_HTPASSWD='${htpasswd}'"
Expand Down Expand Up @@ -440,43 +494,51 @@ function kube-up {
# Wait for last batch of jobs.
wait-for-jobs

# Create the routes, 10 at a time.
for (( i=0; i<${#MINION_NAMES[@]}; i++)); do
create-route "${MINION_NAMES[$i]}" "${MINION_IP_RANGES[$i]}" &

if [ $i -ne 0 ] && [ $((i%10)) -eq 0 ]; then
echo Waiting for a batch of routes at $i...
wait-for-jobs
fi

done
# Wait for last batch of jobs.
wait-for-jobs

local -a scope_flags=()
if (( "${#MINION_SCOPES[@]}" > 0 )); then
scope_flags=("--scopes" "${MINION_SCOPES[@]}")
else
scope_flags=("--no-scopes")
fi
# Create the instances, 5 at a time.
for (( i=0; i<${#MINION_NAMES[@]}; i++)); do
(

(
echo "#! /bin/bash"
echo "ZONE='${ZONE}'"
echo "MASTER_NAME='${MASTER_NAME}'"
echo "MINION_IP_RANGE='${MINION_IP_RANGES[$i]}'"
echo "until MINION_IP_RANGE=\$(curl --fail --silent -H 'Metadata-Flavor: Google'\\"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pre-existing, but seeing shell scripts generating other shell scripts always makes me cringe... Why do people think it's OK to do it? If I were to suggest they should use Go (or Python, Javascript, etc.) to generate a code snippet on the fly I'm sure people would agree that is bad... Why not with shell?

echo " http://metadata/computeMetadata/v1/instance/attributes/node-ip-range); do"
echo " echo 'Waiting for metadata MINION_IP_RANGE...'"
echo " sleep 3"
echo "done"
echo "EXTRA_DOCKER_OPTS='${EXTRA_DOCKER_OPTS}'"
echo "ENABLE_DOCKER_REGISTRY_CACHE='${ENABLE_DOCKER_REGISTRY_CACHE:-false}'"
grep -v "^#" "${KUBE_ROOT}/cluster/gce/templates/common.sh"
grep -v "^#" "${KUBE_ROOT}/cluster/gce/templates/salt-minion.sh"
) > "${KUBE_TEMP}/minion-start-${i}.sh"
) > "${KUBE_TEMP}/minion-start.sh"

create-node-template "${NODE_INSTANCE_PREFIX}-template" "${scope_flags[*]}" \
"startup-script=${KUBE_TEMP}/minion-start.sh"

gcloud preview managed-instance-groups --zone "${ZONE}" \
create "${NODE_INSTANCE_PREFIX}-group" \
--project "${PROJECT}" \
--base-instance-name "${NODE_INSTANCE_PREFIX}" \
--size "${NUM_MINIONS}" \
--template "${NODE_INSTANCE_PREFIX}-template" || true;
# TODO: all minions should already be running when the above create
# managed-instance-group command returns, but currently it returns before the
# instances come up due to gcloud's deficiency, so we have to wait explicitly.
wait-for-minions-to-run

local scopes_flag="${scope_flags[@]}"
create-minion "${MINION_NAMES[$i]}" "${scopes_flag}" "startup-script=${KUBE_TEMP}/minion-start-${i}.sh" &
detect-minion-names

# Create the routes and set IP ranges to instance metadata, 5 instances at a time.
for (( i=0; i<${#MINION_NAMES[@]}; i++)); do
create-route "${MINION_NAMES[$i]}" "${MINION_IP_RANGES[$i]}" &
add-instance-metadata "${MINION_NAMES[$i]}" "node-ip-range=${MINION_IP_RANGES[$i]}" &

if [ $i -ne 0 ] && [ $((i%5)) -eq 0 ]; then
echo Waiting for creation of a batch of instances at $i...
echo Waiting for a batch of routes at $i...
wait-for-jobs
fi

Expand Down Expand Up @@ -595,7 +657,7 @@ EOF
#
# Assumed vars:
# MASTER_NAME
# INSTANCE_PREFIX
# NODE_INSTANCE_PREFIX
# ZONE
# This function tears down cluster resources 10 at a time to avoid issuing too many
# API calls and exceeding API quota. It is important to bring down the instances before bringing
Expand All @@ -605,6 +667,16 @@ function kube-down {

echo "Bringing down cluster"

gcloud preview managed-instance-groups --zone "${ZONE}" delete \
--project "${PROJECT}" \
--quiet \
"${NODE_INSTANCE_PREFIX}-group" || true

gcloud compute instance-templates delete \
--project "${PROJECT}" \
--quiet \
"${NODE_INSTANCE_PREFIX}-template" || true

# First delete the master (if it exists).
gcloud compute instances delete \
--project "${PROJECT}" \
Expand All @@ -616,7 +688,7 @@ function kube-down {
local -a minions
minions=( $(gcloud compute instances list \
--project "${PROJECT}" --zone "${ZONE}" \
--regexp "${INSTANCE_PREFIX}-minion-[0-9]+" \
--regexp "${NODE_INSTANCE_PREFIX}-.+" \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also pre-existing, but this assumes ${NODE_INSTANCE_PREFIX} can be used as a regexp and yet doesn't escape the characters in it, so if it had any magic characters such as . or * or [, etc. this might potentially present invalid matches. But this is already in use elsewhere and it's not really easy to fix in shell...

I think my problem here is the amount of shell we let into Kubernetes but right now there's not much we can do in individual PRs to really address that... Sigh.

| awk 'NR >= 2 { print $1 }') )
# If any minions are running, delete them in batches.
while (( "${#minions[@]}" > 0 )); do
Expand Down Expand Up @@ -645,7 +717,7 @@ function kube-down {
# Delete routes.
local -a routes
routes=( $(gcloud compute routes list --project "${PROJECT}" \
--regexp "${INSTANCE_PREFIX}-minion-[0-9]+" | awk 'NR >= 2 { print $1 }') )
--regexp "${NODE_INSTANCE_PREFIX}-.+" | awk 'NR >= 2 { print $1 }') )
while (( "${#routes[@]}" > 0 )); do
echo Deleting routes "${routes[*]::10}"
gcloud compute routes delete \
Expand Down
5 changes: 5 additions & 0 deletions cluster/kube-util.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ function detect-master {
echo "KUBE_MASTER: $KUBE_MASTER"
}

# Get minion names if they are not static.
# This version only prints the current contents of MINION_NAMES.
function detect-minion-names {
  printf 'MINION_NAMES: %s\n' "${MINION_NAMES[*]}"
}

# Get minion IP addresses and store in KUBE_MINION_IP_ADDRESSES[]
function detect-minions {
echo "KUBE_MINION_IP_ADDRESSES=[]"
Expand Down
2 changes: 2 additions & 0 deletions hack/e2e-suite/pd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ function teardown() {
delete_pd_pod
rm -rf ${config}

detect-minion-names

# This should really work immediately after the pod is killed, but
# it doesn't (yet). So let's be resilient to that.
#
Expand Down