From c71be0b10872f4db6600f6526a5ad0ee1cb5214c Mon Sep 17 00:00:00 2001 From: prubenda Date: Thu, 8 Oct 2020 16:41:01 -0400 Subject: [PATCH] adding time scenario --- README.md | 3 +- config/config.yaml | 3 +- docs/time_skew.rd | 29 ++++ kraken/kubernetes/client.py | 7 +- kraken/time_actions/__init__.py | 0 kraken/time_actions/common_time_functions.py | 138 +++++++++++++++++++ run_kraken.py | 18 ++- scenarios/time_scenarios_example.yml | 7 + 8 files changed, 200 insertions(+), 5 deletions(-) create mode 100644 docs/time_skew.rd create mode 100644 kraken/time_actions/__init__.py create mode 100644 kraken/time_actions/common_time_functions.py create mode 100644 scenarios/time_scenarios_example.yml diff --git a/README.md b/README.md index 44ff2e77..489fa363 100644 --- a/README.md +++ b/README.md @@ -16,12 +16,13 @@ Instructions on how to setup the config and the options supported can be found a ### Kubernetes/OpenShift chaos scenarios supported -Kraken supports pod and node based scenarios. +Kraken supports pod, node and time/date based scenarios. - [Pod Scenarios](docs/pod_scenarios.md) - [Node Scenarios](docs/node_scenarios.md) +- [Time Scenarios](docs/time_skew.rd) ### Kraken scenario pass/fail criteria and report It's important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. 
Kraken does this by: diff --git a/config/config.yaml b/config/config.yaml index 1e1f58af..75a4c0e9 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -9,7 +9,8 @@ kraken: - scenarios/post_action_regex.py node_scenarios: # List of chaos node scenarios to load - scenarios/node_scenarios_example.yml - + time_scenarios: # List of chaos time scenarios to load + - scenarios/time_scenarios_example.yml cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal diff --git a/docs/time_skew.rd b/docs/time_skew.rd new file mode 100644 index 00000000..4f1344d3 --- /dev/null +++ b/docs/time_skew.rd @@ -0,0 +1,29 @@ +### Time/Date Skew Scenarios + +Using this type of scenario configuration, one is able to change the time and/or date of the system for pods or nodes + +Configuration Options: + +**action:** skew_time or skew_date + +**object_type:** pod or node + +**namespace:** namespace of the pods you want to skew, needs to be set if setting a specific pod name + +**label_selector:** label on the nodes or pods you want to skew + +**object_name:** list of the names of pods or nodes you want to skew + +Refer to [time_scenarios_example](https://github.com/openshift-scale/kraken/blob/master/scenarios/time_scenarios_example.yml) config file. 
+ +``` +time_scenarios: + - action: skew_time + object_type: pod + object_name: + - apiserver-868595fcbb-6qnsc + - apiserver-868595fcbb-mb9j5 + namespace: openshift-apiserver + - action: skew_date + object_type: node + label_selector: node-role.kubernetes.io/worker``` \ No newline at end of file diff --git a/kraken/kubernetes/client.py b/kraken/kubernetes/client.py index 7aece66f..25e22a13 100644 --- a/kraken/kubernetes/client.py +++ b/kraken/kubernetes/client.py @@ -61,9 +61,12 @@ def list_pods(namespace): return pods -def get_all_pods(): +def get_all_pods(label_selector=None): pods = [] - ret = cli.list_pod_for_all_namespaces(pretty=True) + if label_selector: + ret = cli.list_pod_for_all_namespaces(pretty=True, label_selector=label_selector) + else: + ret = cli.list_pod_for_all_namespaces(pretty=True) for pod in ret.items: pods.append([pod.metadata.name, pod.metadata.namespace]) return pods diff --git a/kraken/time_actions/__init__.py b/kraken/time_actions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/kraken/time_actions/common_time_functions.py b/kraken/time_actions/common_time_functions.py new file mode 100644 index 00000000..67b0366b --- /dev/null +++ b/kraken/time_actions/common_time_functions.py @@ -0,0 +1,138 @@ +import datetime +import time +import logging +import kraken.invoke.command as runcommand +import kraken.kubernetes.client as kubecli +import re +import sys + + +def pod_exec(pod_name, command, namespace): + i = 0 + for i in range(5): + response = runcommand.invoke('kubectl exec %s -n %s -- %s' % (pod_name, namespace, command)) + if "unauthorized" in response.lower() or "authorization" in response.lower(): + continue + else: + break + return response + + +def node_debug(node_name, command): + + response = runcommand.invoke("oc debug node/" + node_name + ' -- chroot /host ' + command) + return response + + +def skew_time(scenario): + skew_command = "date -s " + if scenario['action'] == "skew_date": + skewed_date = "00-01-01" 
+ skew_command += skewed_date + elif scenario['action'] == "skew_time": + skewed_time = "01:01:01" + skew_command += skewed_time + if "node" in scenario["object_type"]: + node_names = [] + if "object_name" in scenario.keys() and scenario['object_name']: + node_names = scenario['object_name'] + elif "label_selector" in scenario.keys() and scenario['label_selector']: + node_names = kubecli.list_nodes(scenario['label_selector']) + + for node in node_names: + node_debug(node, skew_command) + logging.info("Reset date/time on node " + str(node)) + return "node", node_names + + elif "pod" in scenario['object_type']: + pod_names = [] + if "object_name" in scenario.keys() and scenario['object_name']: + for name in scenario['object_name']: + if "namespace" not in scenario.keys(): + logging.error("Need to set namespace when using pod name") + sys.exit(1) + pod_names.append([name, scenario['namespace']]) + elif "label_selector" in scenario.keys() and scenario['label_selector']: + pod_names = kubecli.get_all_pods(scenario['label_selector']) + elif "namespace" in scenario.keys() and scenario['namespace']: + pod_names = kubecli.list_pods(scenario['namespace']) + counter = 0 + for pod_name in pod_names: + pod_names[counter] = [pod_name, scenario['namespace']] + counter += 1 + + for pod in pod_names: + if len(pod) > 1: + pod_exec(pod[0], skew_command, pod[1]) + else: + pod_exec(pod, skew_command, scenario['namespace']) + logging.info("Reset date/time on pod " + str(pod[0])) + return "pod", pod_names + + +# From kubectl/oc command get time output +def parse_string_date(obj_datetime): + try: + date_line = re.search(r'[a-zA-Z0-9_() .]*\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \w{3} ' + r'\d{4}\W*', obj_datetime) + return date_line.group().strip() + except Exception: + return "" + + +# Get date and time from string returned from OC +def string_to_date(obj_datetime): + obj_datetime = parse_string_date(obj_datetime) + try: + date_time_obj = datetime.datetime.strptime(obj_datetime, '%a %b %d 
%H:%M:%S %Z %Y') + return date_time_obj + except Exception: + return datetime.datetime(datetime.MINYEAR, 1, 1) + + +def check_date_time(object_type, names): + skew_command = "date" + not_reset = [] + max_retries = 30 + if object_type == "node": + for node_name in names: + first_date_time = datetime.datetime.utcnow() + node_datetime_string = node_debug(node_name, skew_command) + node_datetime = string_to_date(node_datetime_string) + counter = 0 + while not first_date_time < node_datetime < datetime.datetime.utcnow(): + time.sleep(5) + logging.info("Date/time on node %s still not reset, waiting 5 seconds and retrying" + % node_name) + node_datetime_string = node_debug(node_name, skew_command) + node_datetime = string_to_date(node_datetime_string) + counter += 1 + if counter > max_retries: + logging.error("Date and time in node %s didn't reset properly" + % node_name) + not_reset.append(node_name) + break + if counter < max_retries: + logging.info("Date in node " + str(node_name) + " reset properly") + elif object_type == "pod": + for pod_name in names: + first_date_time = datetime.datetime.utcnow() + counter = 0 + pod_datetime_string = pod_exec(pod_name[0], skew_command, pod_name[1]) + pod_datetime = string_to_date(pod_datetime_string) + while not first_date_time < pod_datetime < datetime.datetime.utcnow(): + time.sleep(5) + logging.info("Date/time on pod %s still not reset, waiting 5 seconds and retrying" + % pod_name[0]) + first_date_time = datetime.datetime.utcnow() + pod_datetime = pod_exec(pod_name[0], skew_command, pod_name[1]) + pod_datetime = string_to_date(pod_datetime) + counter += 1 + if counter > max_retries: + logging.error("Date and time in pod %s didn't reset properly" + % pod_name[0]) + not_reset.append(pod_name[0]) + break + if counter < max_retries: + logging.info("Date in pod " + str(pod_name[0]) + " reset properly") + return not_reset diff --git a/run_kraken.py b/run_kraken.py index 54c6a59d..2888f517 100644 --- a/run_kraken.py +++ b/run_kraken.py 
@@ -12,6 +12,7 @@ import kraken.invoke.command as runcommand import kraken.node_actions.common_node_functions as nodeaction from kraken.node_actions.aws_node_scenarios import aws_node_scenarios +import kraken.time_actions.common_time_functions as time_actions # Get the node scenarios object of specfied cloud type @@ -83,7 +84,6 @@ def publish_kraken_status(config, failed_post_scenarios): logging.info("Cerberus status is not healthy and post action scenarios " "are still failing") else: - if failed_post_scenarios: if config['kraken']['exit_on_failure']: logging.info("Cerberus status is healthy but post action scenarios " @@ -165,6 +165,7 @@ def main(cfg): kubeconfig_path = config["kraken"].get("kubeconfig_path", "") scenarios = config["kraken"].get("scenarios", []) node_scenarios = config["kraken"].get("node_scenarios", []) + time_scenarios = config['kraken'].get("time_scenarios", []) wait_duration = config["tunings"].get("wait_duration", 60) iterations = config["tunings"].get("iterations", 1) daemon_mode = config["tunings"].get("daemon_mode", False) @@ -247,6 +248,21 @@ def main(cfg): cerberus_integration(config) logging.info("") + # Inject time skew chaos scenarios specified in the config + if time_scenarios: + for time_scenario_config in time_scenarios: + with open(time_scenario_config, 'r') as f: + scenario_config = yaml.full_load(f) + for time_scenario in scenario_config['time_scenarios']: + object_type, object_names = time_actions.skew_time(time_scenario) + not_reset = time_actions.check_date_time(object_type, object_names) + if len(not_reset) > 0: + logging.info('Object times were not reset') + logging.info("Waiting for the specified duration: %s" + % wait_duration) + time.sleep(wait_duration) + publish_kraken_status(config, not_reset) + iteration += 1 logging.info("") if failed_post_scenarios: diff --git a/scenarios/time_scenarios_example.yml b/scenarios/time_scenarios_example.yml new file mode 100644 index 00000000..9b190b65 --- /dev/null +++ 
b/scenarios/time_scenarios_example.yml @@ -0,0 +1,7 @@ +time_scenarios: + - action: skew_time + object_type: pod + label_selector: app=multus + - action: skew_date + object_type: node + label_selector: node-role.kubernetes.io/worker