From 9254ec2a3839d846a5ef7b0e8f33687b0a3dfe78 Mon Sep 17 00:00:00 2001 From: prubenda Date: Wed, 7 Oct 2020 16:02:34 -0400 Subject: [PATCH] Adding litmus scenario options --- config/config.yaml | 4 ++ docs/litmus_scenarios.md | 18 ++++++ kraken/litmus/common_litmus.py | 103 +++++++++++++++++++++++++++++++++ run_kraken.py | 75 ++++++++++++++++++++++-- scenarios/node_hog_engine.yaml | 26 +++++++++ scenarios/node_hog_rbac.yaml | 37 ++++++++++++ 6 files changed, 259 insertions(+), 4 deletions(-) create mode 100644 docs/litmus_scenarios.md create mode 100644 kraken/litmus/common_litmus.py create mode 100644 scenarios/node_hog_engine.yaml create mode 100644 scenarios/node_hog_rbac.yaml diff --git a/config/config.yaml b/config/config.yaml index 92609841..7a868afe 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,6 +1,7 @@ kraken: kubeconfig_path: /root/.kube/config # Path to kubeconfig exit_on_failure: False # Exit when a post action scenario fails + litmus_version: v1.9.1 chaos_scenarios: # List of policies/chaos scenarios to load - pod_scenarios: # List of chaos pod scenarios to load - - scenarios/etcd.yml @@ -13,6 +14,9 @@ kraken: - - scenarios/openshift-kube-apiserver.yml - time_scenarios: # List of chaos time scenarios to load - scenarios/time_scenarios_example.yml + - litmus_scenarios: + - - https://hub.litmuschaos.io/api/chaos/1.9.1?file=charts/generic/node-cpu-hog/rbac.yaml + - scenarios/node_hog_engine.yaml cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed diff --git a/docs/litmus_scenarios.md b/docs/litmus_scenarios.md new file mode 100644 index 00000000..5bbc5596 --- /dev/null +++ b/docs/litmus_scenarios.md @@ -0,0 +1,18 @@ +### Litmus Scenarios +Kraken consumes [Litmus](https://github.com/litmuschaos/litmus) under the hood for some infrastructure, pod, and node scenarios + +#### Litmus chaos scenarios +There are 3 custom resources that are created during each litmus scenario. 
# Helpers to install Litmus, deploy its experiments and inspect chaos results.
# NOTE: these functions rely on the file-level imports of this module
# (runcommand, logging, time, sys).

# Polling budget shared by the engine and result checks: 30 x 10s = 5 minutes,
# which is the duration quoted in the timeout log messages below.
_MAX_TRIES = 30
_POLL_INTERVAL = 10


# Install litmus and wait until pod is running
def install_litmus(version):
    """Install the Litmus operator and wait for its deployment to be available.

    Args:
        version: Litmus release tag used to build the operator manifest URL
            (for example "v1.9.1").
    """
    runcommand.invoke("kubectl apply -f "
                      "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)

    # JSON patch that appends ANALYTICS=FALSE to the operator container's env
    runcommand.invoke("oc patch -n litmus deployment.apps/chaos-operator-ce --type=json --patch ' "
                      "[ { \"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/env/-\", "
                      "\"value\": { \"name\": \"ANALYTICS\", \"value\": \"FALSE\" } } ]'")

    runcommand.invoke("oc wait deploy -n litmus chaos-operator-ce --for=condition=Available")


def deploy_all_experiments(version_string):
    """Apply the generic Litmus experiment chart for the given version.

    Args:
        version_string: Litmus version prefixed with 'v' (for example "v1.9.1").
            The leading 'v' is stripped to build the chaos hub URL.

    Exits the process with status 1 when the version does not start with 'v'.
    """
    if not version_string.startswith("v"):
        logging.error("Incorrect version string for litmus, needs to start with 'v' "
                      "followed by a number")
        sys.exit(1)
    version = version_string[1:]

    runcommand.invoke("kubectl apply -f "
                      "https://hub.litmuschaos.io/api/chaos/%s?file=charts/generic/experiments.yaml"
                      % version)


def delete_experiments():
    """Delete all ChaosEngine resources (no namespace flag is passed)."""
    # NOTE(review): despite the function name this removes chaosengine objects,
    # not chaosexperiment objects - confirm which resource was intended.
    runcommand.invoke("kubectl delete chaosengine --all")


def _poll_status(command, is_pending, waiting_msg, timeout_msg, missing_msg):
    """Run `command` repeatedly until its output stops being pending.

    Args:
        command: shell command whose stripped output is the status string.
        is_pending: callable taking the status and returning True while
            another poll is needed.
        waiting_msg: info message logged before each sleep.
        timeout_msg: error message logged when the retry budget is used up.
        missing_msg: info message logged when a re-poll reports "notfound".

    Returns:
        The final status string, or None on timeout / resource not found.
    """
    status = runcommand.invoke(command).strip()
    tries = 0
    while is_pending(status):
        # Check the budget before sleeping so the status obtained by the last
        # poll is honoured and the total wait matches the advertised timeout.
        if tries >= _MAX_TRIES:
            logging.error(timeout_msg)
            return None
        logging.info(waiting_msg)
        time.sleep(_POLL_INTERVAL)
        status = runcommand.invoke(command).strip()
        tries += 1
        # need to see if error in run
        if "notfound" in status.lower():
            logging.info(missing_msg)
            return None
    return status


# Check status of experiment
def check_experiment(engine_name, experiment_name, namespace):
    """Wait for a chaos engine to run and report whether its experiment passed.

    Args:
        engine_name: name of the ChaosEngine resource.
        experiment_name: name of the experiment; the ChaosResult queried is
            "<engine_name>-<experiment_name>".
        namespace: namespace holding the engine and result.

    Returns:
        True when the ChaosResult verdict is "Pass"; False on any other
        verdict, when a resource is reported as not found, or on timeout.
    """
    engine_query = ("kubectl get chaosengines/%s -n %s -o jsonpath="
                    "'{.status.engineStatus}'" % (engine_name, namespace))
    engine_status = _poll_status(
        engine_query,
        lambda status: status.lower() not in ("running", "completed"),
        "Waiting for engine to start running.",
        "Chaos engine took longer than 5 minutes to be running or complete",
        "Chaos engine was not found")
    if engine_status is None:
        return False

    verdict_query = ("kubectl get chaosresult %s"
                     "-%s -n %s -o "
                     "jsonpath='{.status.experimentstatus.verdict}'"
                     % (engine_name, experiment_name, namespace))
    # An empty verdict means the field is not populated yet, so it is treated
    # like "Awaited" instead of being reported as a failed experiment.
    verdict = _poll_status(
        verdict_query,
        lambda status: status in ("", "Awaited"),
        "Waiting for chaos result to finish, sleeping 10 seconds",
        "Chaos results took longer than 5 minutes to get a final result",
        "Chaos result was not found")
    if verdict is None:
        return False
    if verdict == "Pass":
        return True

    # Anything else is a failure: surface the step that failed, if recorded
    fail_step = runcommand.invoke("kubectl get chaosresult %s"
                                  "-%s -n %s -o jsonpath="
                                  "'{.status.experimentstatus.failStep}'" %
                                  (engine_name, experiment_name, namespace))
    logging.info("Chaos result failed information: " + str(fail_step))
    return False


# Delete all chaos engines in a given namespace
def delete_chaos(namespace):
    """Delete every chaosengine, chaosexperiment and chaosresult in `namespace`."""
    for resource in ("chaosengine", "chaosexperiment", "chaosresult"):
        runcommand.invoke("kubectl delete %s --all -n " % resource + str(namespace))


# Uninstall litmus operator
def uninstall_litmus(version):
    """Delete the Litmus operator manifest that install_litmus applied.

    Args:
        version: Litmus release tag used to build the operator manifest URL.
    """
    runcommand.invoke("kubectl delete -f "
                      "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)
def _load_yaml_documents(item):
    """Return the mapping documents of a local or remote (http/https) manifest."""
    # Only treat real URLs as remote; a local path merely containing "http"
    # must still be opened from disk.
    if item.startswith(("http://", "https://")):
        response = requests.get(item, timeout=60)
        # Fail loudly instead of parsing an error page as YAML
        response.raise_for_status()
        documents = list(yaml.safe_load_all(response.content))
    else:
        with open(item, "r") as f:
            logging.info("opened yaml" + str(item))
            # Materialize inside the with-block: safe_load_all is lazy
            documents = list(yaml.safe_load_all(f))
    # Empty documents load as None; skip anything that is not a mapping
    return [doc for doc in documents if isinstance(doc, dict)]


def litmus_scenarios(scenarios_list, config, litmus_namespaces, litmus_uninstall):
    """Apply each litmus scenario's manifests and check its chaos experiments.

    Args:
        scenarios_list: list of scenarios; each scenario is a list of manifest
            paths or URLs that are applied with kubectl in order.
        config: kraken configuration, passed on to cerberus_integration.
        litmus_namespaces: list collecting the namespaces of the ChaosEngines
            that were run; it is updated in place.
        litmus_uninstall: when true, the scenario's manifests are deleted once
            all of its experiments have been checked.

    Returns:
        The `litmus_namespaces` list that was passed in.
    """
    # Loop to run the scenarios starts here
    for l_scenario in scenarios_list:
        try:
            for item in l_scenario:
                runcommand.invoke("kubectl apply -f %s" % item)
                # Inspect every document so a ChaosEngine is found even when it
                # is not the first one in a multi-document manifest.
                for yaml_item in _load_yaml_documents(item):
                    if yaml_item.get('kind') != "ChaosEngine":
                        continue
                    engine_name = yaml_item['metadata']['name']
                    namespace = yaml_item['metadata']['namespace']
                    # Avoid duplicates so the final cleanup runs once per namespace
                    if namespace not in litmus_namespaces:
                        litmus_namespaces.append(namespace)
                    for expr in yaml_item['spec']['experiments']:
                        experiment_result = common_litmus.check_experiment(engine_name,
                                                                           expr['name'],
                                                                           namespace)
                        if experiment_result:
                            logging.info("Scenario: %s has been successfully injected!"
                                         % item)
                        else:
                            logging.info("Scenario: %s was not successfully injected!"
                                         % item)
            # Delete the manifests exactly once, after every experiment has been
            # checked; deleting on the first failure removed the engine while
            # its remaining experiments were still being inspected.
            if litmus_uninstall:
                for item in l_scenario:
                    logging.info('item ' + str(item))
                    runcommand.invoke("kubectl delete -f %s" % item)
            cerberus_integration(config)
            logging.info("Waiting for the specified duration: %s" % wait_duration)
            time.sleep(wait_duration)
        except Exception as e:
            # Report the scenario itself: a per-manifest loop variable may be
            # unbound here when the failure happens before the first manifest.
            logging.error("Failed to run litmus scenario: %s. Encountered "
                          "the following exception: %s" % (l_scenario, e))
    return litmus_namespaces
name: nginx-chaos + namespace: default +spec: + # It can be true/false + annotationCheck: 'false' + # It can be active/stop + engineState: 'active' + chaosServiceAccount: node-cpu-hog-sa + monitoring: false + # It can be delete/retain + jobCleanUpPolicy: 'delete' + experiments: + - name: node-cpu-hog + spec: + components: + env: + # set chaos duration (in sec) as desired + - name: TOTAL_CHAOS_DURATION + value: '60' + + # ENTER THE NAME OF THE APPLICATION NODE + - name: APP_NODE + value: 'ip-10-0-146-53.us-east-2.compute.internal' diff --git a/scenarios/node_hog_rbac.yaml b/scenarios/node_hog_rbac.yaml new file mode 100644 index 00000000..19b2bb17 --- /dev/null +++ b/scenarios/node_hog_rbac.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-cpu-hog-sa + namespace: default + labels: + name: node-cpu-hog-sa +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: node-cpu-hog-sa + labels: + name: node-cpu-hog-sa +rules: +- apiGroups: ["","litmuschaos.io","batch","apps"] + resources: ["pods","jobs","events","chaosengines","pods/log","chaosexperiments","chaosresults"] + verbs: ["create","list","get","patch","update","delete"] +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get","list"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: node-cpu-hog-sa + labels: + name: node-cpu-hog-sa +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-cpu-hog-sa +subjects: +- kind: ServiceAccount + name: node-cpu-hog-sa + namespace: default