Skip to content

Commit

Permalink
Try and simplify the logging and checks while waiting for ASG/cluster…
Browse files Browse the repository at this point in the history
… health by avoiding nesting and boolean variable sttae checks
  • Loading branch information
chadlwilson committed Aug 31, 2020
1 parent 036f5d3 commit 29e9e29
Showing 1 changed file with 22 additions and 31 deletions.
53 changes: 22 additions & 31 deletions eksrollup/cli.py
Expand Up @@ -14,52 +14,43 @@
def validate_cluster_health(asg_name, new_desired_asg_capacity, cluster_name, predictive, health_check_type="regular",):
cluster_health_retry = app_config['CLUSTER_HEALTH_RETRY']
cluster_health_wait = app_config['CLUSTER_HEALTH_WAIT']
cluster_healthy = False
retry_count = 0

while retry_count < cluster_health_retry:
retry_count += 1
node_count_mismatch = False
if health_check_type == "asg":
logger.info(f'Waiting for {cluster_health_wait} seconds for ASG to scale before validating cluster health...')
else:
logger.info(f'Waiting for {cluster_health_wait} seconds before validating cluster health...')

time.sleep(cluster_health_wait)
desired_k8s_node_count = count_all_cluster_instances(cluster_name, predictive=predictive)

# check if asg has enough nodes first before checking instance health
if is_asg_scaled(asg_name, new_desired_asg_capacity):
# if asg is healthy start draining and terminating instances
if is_asg_healthy(asg_name):
# check if k8s nodes are all online and with the matches the desired amount of instances
if k8s_nodes_count(desired_k8s_node_count):
# check k8s nodes are healthy
if k8s_nodes_ready():
cluster_healthy = True
break
else:
node_count_mismatch = True
else:
logger.info('Validation failed for asg {}. Not enough instances online.'.format(asg_name))
if not is_asg_scaled(asg_name, new_desired_asg_capacity):
logger.info(f'Validation failed for asg {asg_name}. Not enough instances online.')
continue

# wait and check for instances in ASG to become healthy
if not is_asg_healthy(asg_name):
logger.info(f'Validation failed for asg {asg_name}. Some instances not yet healthy.')
continue

# wait and check for desired amount of k8s nodes to come online within the cluster
desired_k8s_node_count = count_all_cluster_instances(cluster_name, predictive=predictive)
if not k8s_nodes_count(desired_k8s_node_count):
logger.info(f'Validation failed for cluster {cluster_name}. Didn\'t reach expected node count {desired_k8s_node_count}.')
continue

# Wait and check for nodes to become ready
if not k8s_nodes_ready():
logger.info('Validation failed for cluster. Expected node count reached but nodes are not ready.')
continue

if cluster_healthy:
logger.info('Cluster validation passed. Proceeding with node draining and termination...')
elif node_count_mismatch:
nodes = get_k8s_nodes()
logger.info('Validation failed for cluster. Current node count {} Expected node count {}.'.format(
len(nodes),
desired_k8s_node_count))
elif not cluster_healthy:
logger.info('Validation failed for cluster. Expected node count reached but nodes are not healthy.')
else:
logger.info(
'Validation failed for asg {}.'
'Instances not healthy'.format(asg_name))
return

if not cluster_healthy:
logger.info('Exiting since ASG healthcheck failed')
raise Exception('ASG healthcheck failed')
logger.info(f'Exiting since ASG healthcheck failed after {cluster_health_retry} attempts')
raise Exception('ASG healthcheck failed')


def scale_up_asg(cluster_name, asg, count):
Expand Down

0 comments on commit 29e9e29

Please sign in to comment.