AWS: Experiment support for multiple AZs / subnets #16735

Merged
cluster/aws/util.sh: 129 changes (103 additions, 26 deletions)
@@ -16,6 +16,19 @@

# A library of helper functions and constant for the local config.

+# Experimental flags can be removed/renamed at any time.
+# The intent is to allow experimentation/advanced functionality before we
+# are ready to commit to supporting it.
+# Experimental functionality:
+#   KUBE_SHARE_MASTER=true
+#     Detect and reuse an existing master; useful if you want to
+#     create more nodes, perhaps with a different instance type or in
+#     a different subnet/AZ
+#   KUBE_SUBNET_CIDR=172.20.1.0/24
Review comment:
nit: Do we want to make this comment consistent with the default?

Member Author reply:
I was trying to provide a hint as to the typical usage - I imagine most people will not specify the default, but may then need a hint for the second AZ. (Same as KUBE_SHARE_MASTER=true)
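To make the typical usage concrete, a rough sketch (editor's illustration: the zone names and the second CIDR are invented, and it assumes the usual provider variables such as KUBERNETES_PROVIDER and KUBE_AWS_ZONE from config-default.sh):

    # Bring up the initial cluster; the default subnet 172.20.0.0/24 is created in the first AZ
    KUBERNETES_PROVIDER=aws KUBE_AWS_ZONE=us-west-2a ./cluster/kube-up.sh

    # Add a second node group in another AZ, reusing the existing master
    KUBERNETES_PROVIDER=aws KUBE_AWS_ZONE=us-west-2b \
      KUBE_SHARE_MASTER=true KUBE_SUBNET_CIDR=172.20.1.0/24 ./cluster/kube-up.sh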

+#     Override the default subnet CIDR; useful if you want to create
+#     a second subnet. The default subnet is 172.20.0.0/24. The VPC
+#     is created with 172.20.0.0/16; you must pick a sub-CIDR of that.

# Use the config file specified in $KUBE_CONFIG_FILE, or default to
# config-default.sh.
KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
@@ -25,7 +38,9 @@ source "${KUBE_ROOT}/cluster/common.sh"
ALLOCATE_NODE_CIDRS=true

NODE_INSTANCE_PREFIX="${INSTANCE_PREFIX}-minion"
-ASG_NAME="${NODE_INSTANCE_PREFIX}-group"
+
+# The Auto Scaling Group (ASG) name must be unique, so we include the zone
+ASG_NAME="${NODE_INSTANCE_PREFIX}-group-${ZONE}"

# We could allow the master disk volume id to be specified in future
MASTER_DISK_ID=
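Illustratively (assuming the default INSTANCE_PREFIX=kubernetes, so NODE_INSTANCE_PREFIX=kubernetes-minion):

    # ZONE=us-west-2a  ->  ASG_NAME=kubernetes-minion-group-us-west-2a
    # ZONE=us-west-2b  ->  ASG_NAME=kubernetes-minion-group-us-west-2b

so node groups in different zones get distinct Auto Scaling group names.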
@@ -53,9 +68,15 @@ AWS_CMD="aws --output json ec2"
AWS_ELB_CMD="aws --output json elb"
AWS_ASG_CMD="aws --output json autoscaling"

-INTERNAL_IP_BASE=172.20.0
+VPC_CIDR_BASE=172.20
MASTER_IP_SUFFIX=.9
-MASTER_INTERNAL_IP=${INTERNAL_IP_BASE}${MASTER_IP_SUFFIX}
+MASTER_INTERNAL_IP=${VPC_CIDR_BASE}.0${MASTER_IP_SUFFIX}
+VPC_CIDR=${VPC_CIDR_BASE}.0.0/16
+SUBNET_CIDR=${VPC_CIDR_BASE}.0.0/24
+if [[ -n "${KUBE_SUBNET_CIDR:-}" ]]; then
+  echo "Using subnet CIDR override: ${KUBE_SUBNET_CIDR}"
+  SUBNET_CIDR=${KUBE_SUBNET_CIDR}
+fi

MASTER_SG_NAME="kubernetes-master-${CLUSTER_ID}"
MINION_SG_NAME="kubernetes-minion-${CLUSTER_ID}"
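For reference, the values these assignments produce with the defaults (no KUBE_SUBNET_CIDR override set):

    # VPC_CIDR_BASE=172.20          MASTER_IP_SUFFIX=.9
    # MASTER_INTERNAL_IP=172.20.0.9
    # VPC_CIDR=172.20.0.0/16
    # SUBNET_CIDR=172.20.0.0/24     (a second subnet might use e.g. 172.20.1.0/24)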
@@ -82,15 +103,20 @@ function get_vpc_id {
}

function get_subnet_id {
-  python -c "import json,sys; lst = [str(subnet['SubnetId']) for subnet in json.load(sys.stdin)['Subnets'] if subnet['VpcId'] == '$1' and subnet['AvailabilityZone'] == '$2']; print ''.join(lst)"
+  local vpc_id=$1
+  local az=$2
+  $AWS_CMD --output text describe-subnets \
+           --filters Name=tag:KubernetesCluster,Values=${CLUSTER_ID} \
+                     Name=availabilityZone,Values=${az} \
+                     Name=vpc-id,Values=${vpc_id} \
+           --query Subnets[].SubnetId
}

function get_igw_id {
-  python -c "import json,sys; lst = [str(igw['InternetGatewayId']) for igw in json.load(sys.stdin)['InternetGateways'] for attachment in igw['Attachments'] if attachment['VpcId'] == '$1']; print ''.join(lst)"
-}
-
-function get_route_table_id {
-  python -c "import json,sys; lst = [str(route_table['RouteTableId']) for route_table in json.load(sys.stdin)['RouteTables'] if route_table['VpcId'] == '$1']; print ''.join(lst)"
+  local vpc_id=$1
+  $AWS_CMD --output text describe-internet-gateways \
+           --filters Name=attachment.vpc-id,Values=${vpc_id} \
+           --query InternetGateways[].InternetGatewayId
}
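A note on these rewritten helpers: with --output text, an empty result set prints nothing, so callers can test for existence with [[ -z ... ]]. Hypothetical usage (IDs invented for illustration):

    SUBNET_ID=$(get_subnet_id vpc-0abc1234 us-west-2a)   # e.g. "subnet-0def5678", or "" if none yet
    IGW_ID=$(get_igw_id vpc-0abc1234)                    # e.g. "igw-0123abcd", or "" if none attached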

function get_elbs_in_vpc {
@@ -163,6 +189,7 @@ function query-running-minions () {
           --filters Name=instance-state-name,Values=running \
                     Name=vpc-id,Values=${VPC_ID} \
                     Name=tag:KubernetesCluster,Values=${CLUSTER_ID} \
+                    Name=tag:aws:autoscaling:groupName,Values=${ASG_NAME} \
                     Name=tag:Role,Values=${MINION_TAG} \
           --query ${query}
}
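The added aws:autoscaling:groupName filter works because EC2 automatically tags instances launched by an Auto Scaling group with that group's name, so this query now only matches minions from the current per-zone node group. A standalone equivalent, roughly (group name illustrative):

    aws ec2 describe-instances --output text \
        --filters Name=tag:aws:autoscaling:groupName,Values=kubernetes-minion-group-us-west-2a \
        --query 'Reservations[].Instances[].InstanceId'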
@@ -701,7 +728,7 @@ function kube-up {
  fi
  if [[ -z "$VPC_ID" ]]; then
    echo "Creating vpc."
-    VPC_ID=$($AWS_CMD create-vpc --cidr-block $INTERNAL_IP_BASE.0/16 | json_val '["Vpc"]["VpcId"]')
+    VPC_ID=$($AWS_CMD create-vpc --cidr-block ${VPC_CIDR} | json_val '["Vpc"]["VpcId"]')
    $AWS_CMD modify-vpc-attribute --vpc-id $VPC_ID --enable-dns-support '{"Value": true}' > $LOG
    $AWS_CMD modify-vpc-attribute --vpc-id $VPC_ID --enable-dns-hostnames '{"Value": true}' > $LOG
    add-tag $VPC_ID Name kubernetes-vpc
@@ -713,23 +740,26 @@
  create-dhcp-option-set

  if [[ -z "${SUBNET_ID:-}" ]]; then
-    SUBNET_ID=$($AWS_CMD describe-subnets --filters Name=tag:KubernetesCluster,Values=${CLUSTER_ID} | get_subnet_id $VPC_ID $ZONE)
+    SUBNET_ID=$(get_subnet_id $VPC_ID $ZONE)
  fi

  if [[ -z "$SUBNET_ID" ]]; then
    echo "Creating subnet."
-    SUBNET_ID=$($AWS_CMD create-subnet --cidr-block $INTERNAL_IP_BASE.0/24 --vpc-id $VPC_ID --availability-zone ${ZONE} | json_val '["Subnet"]["SubnetId"]')
+    SUBNET_ID=$($AWS_CMD create-subnet --cidr-block ${SUBNET_CIDR} --vpc-id $VPC_ID --availability-zone ${ZONE} | json_val '["Subnet"]["SubnetId"]')
    add-tag $SUBNET_ID KubernetesCluster ${CLUSTER_ID}
  else
    EXISTING_CIDR=$($AWS_CMD describe-subnets --subnet-ids ${SUBNET_ID} --query Subnets[].CidrBlock --output text)
-    echo "Using existing CIDR $EXISTING_CIDR"
-    INTERNAL_IP_BASE=${EXISTING_CIDR%.*}
-    MASTER_INTERNAL_IP=${INTERNAL_IP_BASE}${MASTER_IP_SUFFIX}
+    echo "Using existing subnet with CIDR $EXISTING_CIDR"
+    VPC_CIDR=$($AWS_CMD describe-vpcs --vpc-ids ${VPC_ID} --query Vpcs[].CidrBlock --output text)
+    echo "VPC CIDR is $VPC_CIDR"
+    VPC_CIDR_BASE=${VPC_CIDR%.*.*}
+    MASTER_INTERNAL_IP=${VPC_CIDR_BASE}.0${MASTER_IP_SUFFIX}
+    echo "Assuming MASTER_INTERNAL_IP=${MASTER_INTERNAL_IP}"
  fi

  echo "Using subnet $SUBNET_ID"

-  IGW_ID=$($AWS_CMD describe-internet-gateways | get_igw_id $VPC_ID)
+  IGW_ID=$(get_igw_id $VPC_ID)
  if [[ -z "$IGW_ID" ]]; then
    echo "Creating Internet Gateway."
    IGW_ID=$($AWS_CMD create-internet-gateway | json_val '["InternetGateway"]["InternetGatewayId"]')
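The ${VPC_CIDR%.*.*} expansion above strips the shortest suffix matching '.*.*' (the last two dot-separated components), recovering the two-octet base from the VPC's /16. A worked sketch of the derivation:

    VPC_CIDR=172.20.0.0/16
    VPC_CIDR_BASE=${VPC_CIDR%.*.*}              # "172.20" (".0.0/16" is stripped)
    MASTER_INTERNAL_IP=${VPC_CIDR_BASE}.0.9     # "172.20.0.9", i.e. the master always sits in the first /24

This is why the master's internal IP can be assumed even when kube-up is pointed at a pre-existing subnet in a different AZ.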
@@ -791,6 +821,33 @@ function kube-up {
  # HTTPS to the master is allowed (for API access)
  authorize-security-group-ingress "${MASTER_SG_ID}" "--protocol tcp --port 443 --cidr 0.0.0.0/0"

+  # KUBE_SHARE_MASTER is used to add minions to an existing master
+  if [[ "${KUBE_SHARE_MASTER:-}" == "true" ]]; then
+    # Detect existing master
+    detect-master
+
+    # Start minions
+    start-minions
+  else
+    # Create the master
+    start-master
+
+    # Start minions
+    start-minions
+
+    # Wait for the master to be ready
+    wait-master
+
+    # Build ~/.kube/config
+    build-config
+  fi
+
+  # Check the cluster is OK
+  check-cluster
+}
+
+# Starts the master node
+function start-master() {
  # Get or create master persistent volume
  ensure-master-pd

@@ -952,7 +1009,10 @@ function kube-up {
    attempt=$(($attempt+1))
    sleep 10
  done
+}

+# Creates an ASG for the minion nodes
+function start-minions() {
  echo "Creating minion configuration"
  generate-minion-user-data > "${KUBE_TEMP}/minion-user-data"
  local public_ip_option
@@ -1007,9 +1067,11 @@ function kube-up {
    attempt=$(($attempt+1))
    sleep 10
  done
+}

+# Wait for the master to be started
+function wait-master() {
  detect-master > $LOG
-  detect-minions > $LOG

  # TODO(justinsb): This is really not necessary any more
  # Wait 3 minutes for cluster to come up. We hit it with a "highstate" after that to
@@ -1038,7 +1100,11 @@ function kube-up {
  done

  echo "Kubernetes cluster created."
+}

+# Creates the ~/.kube/config file, getting the information from the master
+# The master must be running and set in KUBE_MASTER_IP
+function build-config() {
  # TODO use token instead of kube_auth
  export KUBE_CERT="/tmp/$RANDOM-kubecfg.crt"
  export KUBE_KEY="/tmp/$RANDOM-kubecfg.key"
@@ -1057,11 +1123,16 @@

    create-kubeconfig
  )
+}

+# Sanity check the cluster and print confirmation messages
+function check-cluster() {
  echo "Sanity checking cluster..."

  sleep 5

+  detect-minions > $LOG

  # Don't bail on errors, we want to be able to print some info.
  set +e

@@ -1127,20 +1198,26 @@ function kube-down {
    done
  fi

-  if [[ -n $(${AWS_ASG_CMD} --output text describe-auto-scaling-groups --auto-scaling-group-names ${ASG_NAME} --query AutoScalingGroups[].AutoScalingGroupName) ]]; then
-    echo "Deleting auto-scaling group: ${ASG_NAME}"
-    ${AWS_ASG_CMD} delete-auto-scaling-group --force-delete --auto-scaling-group-name ${ASG_NAME}
-  fi
-  if [[ -n $(${AWS_ASG_CMD} --output text describe-launch-configurations --launch-configuration-names ${ASG_NAME} --query LaunchConfigurations[].LaunchConfigurationName) ]]; then
-    echo "Deleting auto-scaling launch configuration: ${ASG_NAME}"
-    ${AWS_ASG_CMD} delete-launch-configuration --launch-configuration-name ${ASG_NAME}
-  fi

  echo "Deleting instances in VPC: ${vpc_id}"
  instance_ids=$($AWS_CMD --output text describe-instances \
                          --filters Name=vpc-id,Values=${vpc_id} \
                                    Name=tag:KubernetesCluster,Values=${CLUSTER_ID} \
                          --query Reservations[].Instances[].InstanceId)

+  asg_groups=$($AWS_CMD --output text describe-instances \
+                        --query 'Reservations[].Instances[].Tags[?Key==`aws:autoscaling:groupName`].Value[]' \
+                        --instance-ids ${instance_ids})
+  for asg_group in ${asg_groups}; do
+    if [[ -n $(${AWS_ASG_CMD} --output text describe-auto-scaling-groups --auto-scaling-group-names ${asg_group} --query AutoScalingGroups[].AutoScalingGroupName) ]]; then
+      echo "Deleting auto-scaling group: ${asg_group}"
+      ${AWS_ASG_CMD} delete-auto-scaling-group --force-delete --auto-scaling-group-name ${asg_group}
+    fi
+    if [[ -n $(${AWS_ASG_CMD} --output text describe-launch-configurations --launch-configuration-names ${asg_group} --query LaunchConfigurations[].LaunchConfigurationName) ]]; then
+      echo "Deleting auto-scaling launch configuration: ${asg_group}"
+      ${AWS_ASG_CMD} delete-launch-configuration --launch-configuration-name ${asg_group}
+    fi
+  done

  if [[ -n "${instance_ids}" ]]; then
    $AWS_CMD terminate-instances --instance-ids ${instance_ids} > $LOG
    echo "Waiting for instances to be deleted"