Try and simplify the logging and checks while waiting for ASG/cluster…

… health by avoiding nesting and boolean variable sttae checks
hellofresh · Aug 31, 2020 · 29e9e29 · 29e9e29
1 parent 036f5d3
commit 29e9e29
Showing 1 changed file with 22 additions and 31 deletions.
diff --git a/eksrollup/cli.py b/eksrollup/cli.py
@@ -14,52 +14,43 @@
 def validate_cluster_health(asg_name, new_desired_asg_capacity, cluster_name, predictive, health_check_type="regular",):
     cluster_health_retry = app_config['CLUSTER_HEALTH_RETRY']
     cluster_health_wait = app_config['CLUSTER_HEALTH_WAIT']
-    cluster_healthy = False
     retry_count = 0
 
     while retry_count < cluster_health_retry:
         retry_count += 1
-        node_count_mismatch = False
         if health_check_type == "asg":
             logger.info(f'Waiting for {cluster_health_wait} seconds for ASG to scale before validating cluster health...')
         else:
             logger.info(f'Waiting for {cluster_health_wait} seconds before validating cluster health...')
 
         time.sleep(cluster_health_wait)
-        desired_k8s_node_count = count_all_cluster_instances(cluster_name, predictive=predictive)
 
         # check if asg has enough nodes first before checking instance health
-        if is_asg_scaled(asg_name, new_desired_asg_capacity):
-            # if asg is healthy start draining and terminating instances
-            if is_asg_healthy(asg_name):
-                # check if k8s nodes are all online and with the matches the desired amount of instances
-                if k8s_nodes_count(desired_k8s_node_count):
-                    # check k8s nodes are healthy
-                    if k8s_nodes_ready():
-                        cluster_healthy = True
-                        break
-                else:
-                    node_count_mismatch = True
-        else:
-            logger.info('Validation failed for asg {}. Not enough instances online.'.format(asg_name))
+        if not is_asg_scaled(asg_name, new_desired_asg_capacity):
+            logger.info(f'Validation failed for asg {asg_name}. Not enough instances online.')
+            continue
+
+        # wait and check for instances in ASG to become healthy
+        if not is_asg_healthy(asg_name):
+            logger.info(f'Validation failed for asg {asg_name}. Some instances not yet healthy.')
+            continue
+
+        # wait and check for desired amount of k8s nodes to come online within the cluster
+        desired_k8s_node_count = count_all_cluster_instances(cluster_name, predictive=predictive)
+        if not k8s_nodes_count(desired_k8s_node_count):
+            logger.info(f'Validation failed for cluster {cluster_name}. Didn\'t reach expected node count {desired_k8s_node_count}.')
+            continue
+
+        # Wait and check for nodes to become ready
+        if not k8s_nodes_ready():
+            logger.info('Validation failed for cluster. Expected node count reached but nodes are not ready.')
+            continue
 
-    if cluster_healthy:
         logger.info('Cluster validation passed. Proceeding with node draining and termination...')
-    elif node_count_mismatch:
-        nodes = get_k8s_nodes()
-        logger.info('Validation failed for cluster. Current node count {} Expected node count {}.'.format(
-            len(nodes),
-            desired_k8s_node_count))
-    elif not cluster_healthy:
-        logger.info('Validation failed for cluster. Expected node count reached but nodes are not healthy.')
-    else:
-        logger.info(
-            'Validation failed for asg {}.'
-            'Instances not healthy'.format(asg_name))
+        return
 
-    if not cluster_healthy:
-        logger.info('Exiting since ASG healthcheck failed')
-        raise Exception('ASG healthcheck failed')
+    logger.info(f'Exiting since ASG healthcheck failed after {cluster_health_retry} attempts')
+    raise Exception('ASG healthcheck failed')
 
 
 def scale_up_asg(cluster_name, asg, count):