Add node level chaos scenarios for bastion node

Signed-off-by: Pravin Dsilva <pravin.d-silva@ibm.com>
krkn-chaos · Feb 4, 2021 · ed73d0a · ed73d0a
1 parent ca44f53
commit ed73d0a
Show file tree

Hide file tree

Showing 6 changed files with 99 additions and 0 deletions.
diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md
@@ -10,6 +10,7 @@ Following node chaos scenarios are supported:
 6. **stop_kubelet_scenario**: scenario to stop the kubelet of the node instance.
 7. **stop_start_kubelet_scenario**: scenario to stop and start the kubelet of the node instance.
 8. **node_crash_scenario**: scenario to crash the node instance.
+9. **stop_start_helper_node_scenario**: scenario to stop and start the helper node and check service status.
 
 **NOTE**: If the node doesn't recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
 
@@ -34,6 +35,10 @@ After creating the service account you'll need to enable the account using the f
 
 The supported node level chaos scenarios on an OPENSTACK cloud are `node_stop_start_scenario`, `stop_start_kubelet_scenario` and `node_reboot_scenario`.
 
+**NOTE**: For `stop_start_helper_node_scenario`, see the [ocp4-helpernode project](https://github.com/RedHatOfficial/ocp4-helpernode) to learn more about the helper node and its usage.
+
+Kraken connects to the helper node over ssh as the `root` user, using the private key at `/root/.ssh/id_rsa` on the host running Kraken. Ensure passwordless ssh with this key is configured on the helper node to avoid connection errors.
+
 **NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenarios are supported independently of the cloud platform.
 
 Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the currently supported cloud types.

diff --git a/kraken/node_actions/abstract_node_scenarios.py b/kraken/node_actions/abstract_node_scenarios.py
@@ -21,6 +21,12 @@ def node_stop_start_scenario(self, instance_kill_count, node, timeout):
         self.node_start_scenario(instance_kill_count, node, timeout)
         logging.info("node_stop_start_scenario has been successfully injected!")
 
+    def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
+        """Stop the helper node and then start it again.
+
+        Calls helper_node_stop_scenario followed by helper_node_start_scenario,
+        forwarding the same three arguments to both, and logs before and after.
+        """
+        scenario_args = (instance_kill_count, node, timeout)
+        logging.info("Starting helper_node_stop_start_scenario injection")
+        self.helper_node_stop_scenario(*scenario_args)
+        self.helper_node_start_scenario(*scenario_args)
+        logging.info("helper_node_stop_start_scenario has been successfully injected!")
+
     # Node scenario to terminate the node
     def node_termination_scenario(self, instance_kill_count, node, timeout):
         pass
@@ -66,3 +72,7 @@ def node_crash_scenario(self, instance_kill_count, node, timeout):
                               "Test Failed" % (e))
                 logging.error("node_crash_scenario injection failed!")
                 sys.exit(1)
+
+    # Node scenario to check service status on helper node
+    def node_service_status(self, node, service, timeout):
+        # Placeholder that does nothing in this class.
+        # NOTE(review): run_kraken.py invokes helper_node_service_status() on the
+        # scenario object, not node_service_status() - confirm which name is intended.
+        pass
diff --git a/kraken/node_actions/common_node_functions.py b/kraken/node_actions/common_node_functions.py
@@ -1,6 +1,7 @@
 import time
 import random
 import logging
+import paramiko
 import kraken.kubernetes.client as kubecli
 import kraken.invoke.command as runcommand
 
@@ -39,3 +40,28 @@ def wait_for_unknown_status(node, timeout):
 # Get the ip of the cluster node
 def get_node_ip(node):
     """Return the result of a kubectl jsonpath query for the node's InternalIP address."""
     command_template = (
         "kubectl get node %s -o "
         "jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'"
     )
     return runcommand.invoke(command_template % (node))
+
+def check_service_status(node, service, timeout):
+    """Check over ssh that the given systemd services are active on a node.
+
+    Connects as root with the key at /root/.ssh/id_rsa, retrying after a
+    one-second pause until the connection succeeds or the retry budget is
+    used up. For each service the state reported by `systemctl status` is
+    logged; a state other than "active" is logged as an error, not raised.
+
+    Args:
+        node: Hostname or IP address to connect to.
+        service: Iterable of service names. A single name passed as a string
+            is treated as a one-element list; None or "" means no services.
+        timeout: Retry budget in seconds. Only the one-second pauses between
+            attempts are counted, so slow connect attempts can exceed it.
+
+    Raises:
+        RuntimeError: If no ssh connection could be established.
+    """
+    # Normalise the service argument: a missing value means "nothing to check"
+    # and a bare string must not be iterated character by character.
+    if not service:
+        service = []
+    elif isinstance(service, str):
+        service = [service]
+
+    ssh = paramiko.SSHClient()
+    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    try:
+        connected = False
+        last_error = None
+        elapsed = 0
+        sleeper = 1
+        while elapsed <= timeout:
+            time.sleep(sleeper)
+            elapsed += sleeper
+            logging.info("Trying to ssh to instance: %s" % (node))
+            try:
+                ssh.connect(node, username='root', key_filename='/root/.ssh/id_rsa', timeout=800, banner_timeout=400)
+            except Exception as e:
+                # The node may still be coming up; remember the error and retry
+                last_error = e
+                continue
+            # connect() returns None on success and raises on failure
+            connected = True
+            break
+        if not connected:
+            # Fail with a clear message instead of running commands on a dead client
+            raise RuntimeError("Unable to ssh to instance %s within %s seconds: %s" % (node, timeout, last_error))
+
+        for service_name in service:
+            logging.info("Checking status of Service: %s" % (service_name))
+            stdin, stdout, stderr = ssh.exec_command("systemctl status %s  | grep '^   Active' |  awk '{print $2}'" % (service_name))
+            output = stdout.readlines()
+            # The pipeline prints nothing when there is no "Active" line to match
+            service_status = output[0].strip() if output else "unknown"
+            logging.info("Status of service %s is %s \n" % (service_name, service_status))
+            if service_status != "active":
+                logging.error("Expected service %s to be in Active state but is %s" % (service_name, service_status))
+    finally:
+        # Release the ssh client even when connecting or a command fails
+        ssh.close()
+
diff --git a/kraken/node_actions/openstack_node_scenarios.py b/kraken/node_actions/openstack_node_scenarios.py
@@ -131,3 +131,47 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
                 logging.error("node_reboot_scenario injection failed!")
                 sys.exit(1)
 
+    # Node scenario to start the helper node
+    def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout):
+        """Start the OpenStack instance that backs the helper node.
+
+        Repeats instance_kill_count times: resolves the OpenStack node name
+        from node_ip, starts the instance and waits until it is running.
+        On any exception the failure is logged and the process exits with 1.
+
+        Args:
+            instance_kill_count: Number of times to run the start sequence.
+            node_ip: IP address of the helper node; surrounding whitespace is stripped.
+            timeout: Accepted for signature parity with other scenarios; not used here.
+        """
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting helper_node_start_scenario injection")
+                openstack_node_name = self.openstackcloud.get_openstack_nodename(node_ip.strip())
+                logging.info("Starting the helper node %s" % (openstack_node_name))
+                self.openstackcloud.start_instances(openstack_node_name)
+                self.openstackcloud.wait_until_running(openstack_node_name)
+                logging.info("Helper node with IP: %s is in running state" % (node_ip))
+                # Report this scenario's own name so logs can be told apart
+                # from the regular node_start_scenario
+                logging.info("helper_node_start_scenario has been successfully injected!")
+            except Exception as e:
+                logging.error("Failed to start node instance. Encountered following "
+                              "exception: %s. Test Failed" % (e))
+                logging.error("helper_node_start_scenario injection failed!")
+                sys.exit(1)
+
+    # Node scenario to stop the helper node
+    def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout):
+        """Stop the OpenStack instance that backs the helper node.
+
+        Runs instance_kill_count times: looks up the OpenStack node name for
+        node_ip, stops the instance and waits until it is stopped. Any
+        exception is logged and ends the process with exit code 1.
+        The timeout argument is not used by this method.
+        """
+        for _attempt in range(instance_kill_count):
+            try:
+                logging.info("Starting helper_node_stop_scenario injection")
+                helper_ip = node_ip.strip()
+                instance_name = self.openstackcloud.get_openstack_nodename(helper_ip)
+                logging.info("Stopping the helper node %s " % (instance_name))
+                self.openstackcloud.stop_instances(instance_name)
+                self.openstackcloud.wait_until_stopped(instance_name)
+                logging.info("Helper node with IP: %s is in stopped state" % (node_ip))
+            except Exception as err:
+                failure = "Failed to stop node instance. Encountered following exception: %s. Test Failed" % (err)
+                logging.error(failure)
+                logging.error("helper_node_stop_scenario injection failed!")
+                sys.exit(1)
+
+    def helper_node_service_status(self, node_ip, service, timeout):
+        """Check the given services on the helper node.
+
+        Passes the stripped node_ip, service and timeout to
+        nodeaction.check_service_status. Any exception is logged and ends
+        the process with exit code 1.
+        """
+        try:
+            logging.info("Checking service status on the helper node")
+            helper_ip = node_ip.strip()
+            nodeaction.check_service_status(helper_ip, service, timeout)
+            logging.info("Service status checked on %s" % (node_ip))
+            logging.info("Check service status is successfuly injected!")
+        except Exception as err:
+            failure = "Failed to check service status. Encountered following exception: %s. Test Failed" % (err)
+            logging.error(failure)
+            logging.error("helper_node_service_status injection failed!")
+            sys.exit(1)
diff --git a/run_kraken.py b/run_kraken.py
@@ -49,6 +49,7 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
     node_name = node_scenario.get("node_name", "")
     label_selector = node_scenario.get("label_selector", "")
     timeout = node_scenario.get("timeout", 120)
+    service = node_scenario.get("service", "")
     # Get the node to apply the scenario
     node = nodeaction.get_node(node_name, label_selector)
 
@@ -71,6 +72,13 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
             node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout)
         elif action == "node_crash_scenario":
             node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout)
+        elif action == "stop_start_helper_node_scenario":
+            if not node_scenario['helper_node_ip']:
+                logging.error("Helper node IP address is not provided")
+                sys.exit(1)
+            node_scenario_object.helper_node_stop_start_scenario(instance_kill_count, node_scenario['helper_node_ip'], timeout)
+            node_scenario_object.helper_node_service_status(node_scenario['helper_node_ip'], service, timeout)
+
 
 
 # Get cerberus status

diff --git a/scenarios/node_scenarios_example.yml b/scenarios/node_scenarios_example.yml
@@ -3,11 +3,17 @@ node_scenarios:
     - node_stop_start_scenario
     - stop_start_kubelet_scenario
     - node_crash_scenario
+   #- stop_start_helper_node_scenario
     node_name:                                                      # node on which scenario has to be injected
     label_selector: node-role.kubernetes.io/worker                  # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
     instance_kill_count: 1                                          # number of times to inject each scenario under actions
     timeout: 120                                                    # duration to wait for completion of node scenario injection
     cloud_type: aws                                                 # cloud type on which Kubernetes/OpenShift runs
+    helper_node_ip:                                            # ip address of the helper node
+    service:                                                        # check status of the services on the helper node
+      - haproxy
+      - dhcpd
+      - named
   - actions:
     - node_reboot_scenario
     node_name: