Skip to content

Commit

Permalink
Add node level chaos scenarios for bastion node
Browse files Browse the repository at this point in the history
Signed-off-by: Pravin Dsilva <pravin.d-silva@ibm.com>
  • Loading branch information
Pravin Dsilva committed Feb 4, 2021
1 parent ca44f53 commit ed73d0a
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 0 deletions.
5 changes: 5 additions & 0 deletions docs/node_scenarios.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Following node chaos scenarios are supported:
6. **stop_kubelet_scenario**: scenario to stop the kubelet of the node instance.
7. **stop_start_kubelet_scenario**: scenario to stop and start the kubelet of the node instance.
8. **node_crash_scenario**: scenario to crash the node instance.
9. **stop_start_helper_node_scenario**: scenario to stop and start the helper node and check service status.

**NOTE**: If the node doesn't recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.

Expand All @@ -34,6 +35,10 @@ After creating the service account you'll need to enable the account using the f

The supported node level chaos scenarios on an OPENSTACK cloud are `node_stop_start_scenario`, `stop_start_kubelet_scenario` and `node_reboot_scenario`.

**NOTE**: For `stop_start_helper_node_scenario`, visit [here](https://github.com/RedHatOfficial/ocp4-helpernode) to learn more about the helper node and its usage.

On the host running Kraken, the private key at /root/.ssh/id_rsa will be used for ssh connection to the helper node. Ensure passwordless ssh is configured on the helper node to avoid connection errors.

**NOTE**: The `node_crash_scenario` and `stop_kubelet_scenario` scenarios are supported independent of the cloud platform.

Use 'generic' or do not add the 'cloud_type' key to your scenario if your cluster is not set up using one of the currently supported cloud types.
Expand Down
10 changes: 10 additions & 0 deletions kraken/node_actions/abstract_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ def node_stop_start_scenario(self, instance_kill_count, node, timeout):
self.node_start_scenario(instance_kill_count, node, timeout)
logging.info("node_stop_start_scenario has been successfully injected!")

def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
    """Stop the helper node and then start it again.

    All three arguments are forwarded unchanged, first to
    ``self.helper_node_stop_scenario`` and then to
    ``self.helper_node_start_scenario``.

    Args:
        instance_kill_count: Forwarded to both scenarios.
        node: Forwarded to both scenarios.
        timeout: Forwarded to both scenarios.
    """
    scenario_args = (instance_kill_count, node, timeout)
    logging.info("Starting helper_node_stop_start_scenario injection")
    # Order matters: the node has to be stopped before it is started again.
    self.helper_node_stop_scenario(*scenario_args)
    self.helper_node_start_scenario(*scenario_args)
    logging.info("helper_node_stop_start_scenario has been successfully injected!")

# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
    """Terminate the node (placeholder).

    This implementation performs no action and returns None.
    NOTE(review): presumably cloud-specific scenario classes override this
    hook - confirm against the subclasses.
    """
    return None
Expand Down Expand Up @@ -66,3 +72,7 @@ def node_crash_scenario(self, instance_kill_count, node, timeout):
"Test Failed" % (e))
logging.error("node_crash_scenario injection failed!")
sys.exit(1)

# Node scenario to check service status on helper node
def node_service_status(self, node, service, timeout):
    """Check service status on the helper node (placeholder).

    This implementation performs no action and returns None.
    NOTE(review): the OpenStack implementation and run_kraken.py use the
    name ``helper_node_service_status`` for this check - confirm whether
    this hook is meant to carry the same name.
    """
    return None
26 changes: 26 additions & 0 deletions kraken/node_actions/common_node_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import time
import random
import logging
import paramiko
import kraken.kubernetes.client as kubecli
import kraken.invoke.command as runcommand

Expand Down Expand Up @@ -39,3 +40,28 @@ def wait_for_unknown_status(node, timeout):
# Get the ip of the cluster node
def get_node_ip(node):
    """Return the result of asking kubectl for the node's InternalIP address.

    The kubectl command is run through ``runcommand.invoke`` and whatever
    that call returns is handed back unchanged.
    """
    command_template = "kubectl get node %s -o jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'"
    kubectl_command = command_template % (node)
    return runcommand.invoke(kubectl_command)

def check_service_status(node, service, timeout):
    """Wait for ssh access to a node, then log the state of systemd services.

    The connection is made as ``root`` with the private key at
    /root/.ssh/id_rsa. A service that is not active is reported with an
    error log entry; it does not raise.

    Args:
        node: Hostname or IP address to ssh into.
        service: A service name or an iterable of service names. An empty
            value or None means there is nothing to check.
        timeout: Retry budget for the ssh connection; one attempt is made
            per second of budget. Each attempt has its own (longer) connect
            timeout, so the wall-clock wait can exceed this value.

    Raises:
        RuntimeError: If services were requested but no ssh connection could
            be established within the retry budget.
    """
    # A bare string would otherwise be iterated character by character, and
    # an empty/None value (e.g. an unset YAML key) means nothing to check.
    if isinstance(service, str):
        service = [service]
    services = [name for name in (service or []) if name]

    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    try:
        connected = False
        last_error = None
        elapsed = 0
        retry_interval = 1
        while elapsed <= timeout:
            time.sleep(retry_interval)
            elapsed += retry_interval
            logging.info("Trying to ssh to instance: %s" % (node))
            try:
                # connect() returns None; failure is signalled by an exception.
                ssh.connect(node, username='root', key_filename='/root/.ssh/id_rsa', timeout=800, banner_timeout=400)
            except Exception as e:
                # The node may still be booting: remember the error and retry.
                last_error = e
                logging.info("ssh to %s failed: %s" % (node, e))
            else:
                connected = True
                break

        if not connected:
            message = "Unable to ssh to %s after waiting %s seconds. Last error: %s" % (node, timeout, last_error)
            if services:
                # Running commands on an unconnected client would only fail
                # later with a far less helpful error.
                raise RuntimeError(message)
            logging.warning(message)
            return

        for service_name in services:
            logging.info("Checking status of Service: %s" % (service_name))
            # `systemctl is-active` prints the unit state directly, so there
            # is no need to parse the human-readable `systemctl status` text.
            stdin, stdout, stderr = ssh.exec_command("systemctl is-active %s" % (service_name))
            output = stdout.readlines()
            service_status = output[0].strip() if output else "unknown"
            logging.info("Status of service %s is %s \n" % (service_name, service_status))
            if service_status != "active":
                logging.error("Expected service %s to be in Active state but is %s" % (service_name, service_status))
    finally:
        # Always release the client, even when a check above raised.
        ssh.close()

44 changes: 44 additions & 0 deletions kraken/node_actions/openstack_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,47 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
logging.error("node_reboot_scenario injection failed!")
sys.exit(1)

# Node scenario to start the helper node
def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout):
    """Start the OpenStack instance that backs the helper node.

    For each of ``instance_kill_count`` rounds the instance name is looked up
    from the stripped ``node_ip`` via ``self.openstackcloud``, the instance is
    started, and the call waits until it is reported as running.

    Args:
        instance_kill_count: Number of times to run the start sequence.
        node_ip: IP address of the helper node; surrounding whitespace is
            stripped before the lookup.
        timeout: Accepted for signature parity with the other scenarios; not
            used by this implementation.

    Any exception is logged and the process exits with status 1.
    """
    for _ in range(instance_kill_count):
        try:
            logging.info("Starting helper_node_start_scenario injection")
            openstack_node_name = self.openstackcloud.get_openstack_nodename(node_ip.strip())
            logging.info("Starting the helper node %s" % (openstack_node_name))
            self.openstackcloud.start_instances(openstack_node_name)
            self.openstackcloud.wait_until_running(openstack_node_name)
            logging.info("Helper node with IP: %s is in running state" % (node_ip))
            # Report the scenario that actually ran; the message previously
            # named node_start_scenario, which is a different scenario.
            logging.info("helper_node_start_scenario has been successfully injected!")
        except Exception as e:
            logging.error("Failed to start node instance. Encountered following "
                          "exception: %s. Test Failed" % (e))
            logging.error("helper_node_start_scenario injection failed!")
            sys.exit(1)

# Node scenario to stop the helper node
def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout):
    """Stop the OpenStack instance that backs the helper node.

    For each of ``instance_kill_count`` rounds the instance name is looked up
    from the stripped ``node_ip`` via ``self.openstackcloud``, the instance is
    stopped, and the call waits until it is reported as stopped. ``timeout``
    is accepted but not used here. Any exception is logged and the process
    exits with status 1.
    """
    remaining_rounds = instance_kill_count
    while remaining_rounds > 0:
        remaining_rounds -= 1
        try:
            logging.info("Starting helper_node_stop_scenario injection")
            helper_address = node_ip.strip()
            instance_name = self.openstackcloud.get_openstack_nodename(helper_address)
            logging.info("Stopping the helper node %s " % instance_name)
            cloud = self.openstackcloud
            cloud.stop_instances(instance_name)
            self.openstackcloud.wait_until_stopped(instance_name)
            logging.info("Helper node with IP: %s is in stopped state" % node_ip)
        except Exception as err:
            failure = "Failed to stop node instance. Encountered following exception: %s. Test Failed" % err
            logging.error(failure)
            logging.error("helper_node_stop_scenario injection failed!")
            sys.exit(1)

def helper_node_service_status(self, node_ip, service, timeout):
    """Check the status of services on the helper node.

    Delegates to ``nodeaction.check_service_status`` with the stripped
    ``node_ip`` and the unchanged ``service`` and ``timeout`` arguments.
    Any exception is logged and the process exits with status 1.
    """
    try:
        logging.info("Checking service status on the helper node")
        helper_address = node_ip.strip()
        nodeaction.check_service_status(helper_address, service, timeout)
        logging.info("Service status checked on %s" % node_ip)
        logging.info("Check service status is successfuly injected!")
    except Exception as err:
        failure = "Failed to check service status. Encountered following exception: %s. Test Failed" % err
        logging.error(failure)
        logging.error("helper_node_service_status injection failed!")
        sys.exit(1)
8 changes: 8 additions & 0 deletions run_kraken.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
node_name = node_scenario.get("node_name", "")
label_selector = node_scenario.get("label_selector", "")
timeout = node_scenario.get("timeout", 120)
service = node_scenario.get("service", "")
# Get the node to apply the scenario
node = nodeaction.get_node(node_name, label_selector)

Expand All @@ -71,6 +72,13 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
node_scenario_object.stop_kubelet_scenario(instance_kill_count, node, timeout)
elif action == "node_crash_scenario":
node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout)
elif action == "stop_start_helper_node_scenario":
if not node_scenario['helper_node_ip']:
logging.error("Helper node IP address is not provided")
sys.exit(1)
node_scenario_object.helper_node_stop_start_scenario(instance_kill_count, node_scenario['helper_node_ip'], timeout)
node_scenario_object.helper_node_service_status(node_scenario['helper_node_ip'], service, timeout)



# Get cerberus status
Expand Down
6 changes: 6 additions & 0 deletions scenarios/node_scenarios_example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,17 @@ node_scenarios:
- node_stop_start_scenario
- stop_start_kubelet_scenario
- node_crash_scenario
#- stop_start_helper_node_scenario
node_name: # node on which scenario has to be injected
label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
instance_kill_count: 1 # number of times to inject each scenario under actions
timeout: 120 # duration to wait for completion of node scenario injection
cloud_type: aws # cloud type on which Kubernetes/OpenShift runs
helper_node_ip: # ip address of the helper node
service: # check status of the services on the helper node
- haproxy
- dhcpd
- named
- actions:
- node_reboot_scenario
node_name:
Expand Down

0 comments on commit ed73d0a

Please sign in to comment.