Adding litmus scenario options

krkn-chaos · Nov 12, 2020 · 9254ec2 · 9254ec2
1 parent 709badd
commit 9254ec2
Show file tree

Hide file tree

Showing 6 changed files with 259 additions and 4 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -1,6 +1,7 @@
 kraken:
     kubeconfig_path: /root/.kube/config                    # Path to kubeconfig
     exit_on_failure: False                                 # Exit when a post action scenario fails
+    litmus_version: v1.9.1
     chaos_scenarios:                                         # List of policies/chaos scenarios to load
         -   pod_scenarios:                                 # List of chaos pod scenarios to load
             - -    scenarios/etcd.yml
@@ -13,6 +14,9 @@ kraken:
             - -    scenarios/openshift-kube-apiserver.yml
         -   time_scenarios:                                # List of chaos time scenarios to load
             - scenarios/time_scenarios_example.yml
+        -   litmus_scenarios:
+            - - https://hub.litmuschaos.io/api/chaos/1.9.1?file=charts/generic/node-cpu-hog/rbac.yaml
+              - scenarios/node_hog_engine.yaml
 
 cerberus:
     cerberus_enabled: False                                # Enable it when cerberus is previously installed

diff --git a/docs/litmus_scenarios.md b/docs/litmus_scenarios.md
@@ -0,0 +1,18 @@
+### Litmus Scenarios
+Kraken consumes [Litmus](https://github.com/litmuschaos/litmus) under the hood for some infrastructure, pod, and node scenarios.
+
+#### Litmus chaos scenarios
+There are 3 custom resources that are created during each litmus scenario. Below is a description of the resources:
+* ChaosEngine: A resource to link a Kubernetes application or Kubernetes node to a ChaosExperiment. ChaosEngine is watched by Litmus' Chaos-Operator, which then invokes Chaos-Experiments.
+* ChaosExperiment: A resource to group the configuration parameters of a chaos experiment. ChaosExperiment CRs are created by the operator when experiments are invoked by ChaosEngine.
+* ChaosResult: A resource to hold the results of a chaos-experiment. The Chaos-exporter reads the results and exports the metrics into a configured Prometheus server.
+
+The following are the scenarios for which a chaos scenario config exists today. Adding a new Litmus-based scenario is as simple as adding 3 new yaml files under the scenarios directory and defining them in the config.
+
+Component                | Description                                                                                        | Working                   |
+------------------------ | ---------------------------------------------------------------------------------------------------| ------------------------- |
+Node CPU Hog             | Chaos scenario that hogs up the CPU on a defined node for a specific amount of time                | :heavy_check_mark:        |
+
+**NOTE**: More scenarios can be found on the [Litmus ChaosHub](https://hub.litmuschaos.io/).
+
+
diff --git a/kraken/litmus/common_litmus.py b/kraken/litmus/common_litmus.py
@@ -0,0 +1,103 @@
+import kraken.invoke.command as runcommand
+import logging
+import time
+import sys
+
+
+# Install litmus and wait until pod is running
+def install_litmus(version):
+    runcommand.invoke("kubectl apply -f "
+                      "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)
+
+    runcommand.invoke("oc patch -n litmus deployment.apps/chaos-operator-ce --type=json --patch ' "
+                      "[ { \"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/env/-\", "
+                      "\"value\": { \"name\": \"ANALYTICS\", \"value\": \"FALSE\" } } ]'")
+
+    runcommand.invoke("oc wait deploy -n litmus chaos-operator-ce --for=condition=Available")
+
+
+def deploy_all_experiments(version_string):
+
+    if not version_string.startswith("v"):
+        logging.error("Incorrect version string for litmus, needs to start with 'v' "
+                      "followed by a number")
+        sys.exit(1)
+    version = version_string[1:]
+
+    runcommand.invoke("kubectl apply -f "
+                      "https://hub.litmuschaos.io/api/chaos/%s?file=charts/generic/experiments.yaml"
+                      % version)
+
+
+def delete_experiments():
+    runcommand.invoke("kubectl delete chaosengine --all")
+
+
+# Check status of experiment
+def check_experiment(engine_name, experiment_name, namespace):
+    chaos_engine = runcommand.invoke("kubectl get chaosengines/%s -n %s -o jsonpath="
+                                     "'{.status.engineStatus}'" % (engine_name, namespace))
+    engine_status = chaos_engine.strip()
+    max_tries = 30
+    engine_counter = 0
+    while engine_status.lower() != "running" and engine_status.lower() != "completed":
+        time.sleep(10)
+        logging.info("Waiting for engine to start running.")
+        chaos_engine = runcommand.invoke("kubectl get chaosengines/%s -n %s -o jsonpath="
+                                         "'{.status.engineStatus}'" % (engine_name, namespace))
+        engine_status = chaos_engine.strip()
+        if engine_counter >= max_tries:
+            logging.error("Chaos engine took longer than 5 minutes to be running or complete")
+            return False
+        engine_counter += 1
+        # need to see if error in run
+        if "notfound" in engine_status.lower():
+            logging.info("Chaos engine was not found")
+            return False
+
+    if not chaos_engine:
+        return False
+    chaos_result = runcommand.invoke("kubectl get chaosresult %s"
+                                     "-%s -n %s -o "
+                                     "jsonpath='{.status.experimentstatus.verdict}'"
+                                     % (engine_name, experiment_name, namespace))
+    result_counter = 0
+    status = chaos_result.strip()
+    while status == "Awaited":
+        logging.info("Waiting for chaos result to finish, sleeping 10 seconds")
+        time.sleep(10)
+        chaos_result = runcommand.invoke("kubectl get chaosresult %s"
+                                         "-%s -n %s -o "
+                                         "jsonpath='{.status.experimentstatus.verdict}'"
+                                         % (engine_name, experiment_name, namespace))
+        status = chaos_result.strip()
+        if result_counter >= max_tries:
+            logging.error("Chaos results took longer than 5 minutes to get a final result")
+            return False
+        result_counter += 1
+        if "notfound" in status.lower():
+            logging.info("Chaos result was not found")
+            return False
+
+    if status == "Pass":
+        return True
+    else:
+        chaos_result = runcommand.invoke("kubectl get chaosresult %s"
+                                         "-%s -n %s -o jsonpath="
+                                         "'{.status.experimentstatus.failStep}'" %
+                                         (engine_name, experiment_name, namespace))
+        logging.info("Chaos result failed information: " + str(chaos_result))
+        return False
+
+
+# Delete all chaos engines in a given namespace
+def delete_chaos(namespace):
+    runcommand.invoke("kubectl delete chaosengine --all -n " + str(namespace))
+    runcommand.invoke("kubectl delete chaosexperiment --all -n " + str(namespace))
+    runcommand.invoke("kubectl delete chaosresult --all -n " + str(namespace))
+
+
+# Uninstall litmus operator
+def uninstall_litmus(version):
+    runcommand.invoke("kubectl delete -f "
+                      "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)
diff --git a/run_kraken.py b/run_kraken.py
@@ -10,6 +10,7 @@
 import pyfiglet
 import kraken.kubernetes.client as kubecli
 import kraken.invoke.command as runcommand
+import kraken.litmus.common_litmus as common_litmus
 import kraken.node_actions.common_node_functions as nodeaction
 from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
 import kraken.time_actions.common_time_functions as time_actions
@@ -160,10 +161,11 @@ def pod_scenarios(scenarios_list, config, failed_post_scenarios):
                 pre_action_output = run_post_action(kubeconfig_path, pod_scenario[1])
             else:
                 pre_action_output = ''
-            scenario_logs = runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill"
-                              " --policy-file %s --kubeconfig %s --no-cloud"
-                              " --inventory-kubernetes --headless"
-                              % (pod_scenario[0], kubeconfig_path))
+            scenario_logs = runcommand.invoke("powerfulseal autonomous "
+                                              "--use-pod-delete-instead-of-ssh-kill"
+                                              " --policy-file %s --kubeconfig %s --no-cloud"
+                                              " --inventory-kubernetes --headless"
+                                              % (pod_scenario[0], kubeconfig_path))
 
             # Display pod scenario logs/actions
             print(scenario_logs)
@@ -210,6 +212,53 @@ def time_scenarios(scenarios_list, config):
                 publish_kraken_status(config, not_reset)
 
 
+def litmus_scenarios(scenarios_list, config, litmus_namespaces, litmus_uninstall):
+    # Loop to run the scenarios starts here
+    for l_scenario in scenarios_list:
+        try:
+            for item in l_scenario:
+                runcommand.invoke("kubectl apply -f %s" % item)
+                if "http" in item:
+                    f = requests.get(item)
+                    yaml_item = list(yaml.safe_load_all(f.content))[0]
+                else:
+                    with open(item, "r") as f:
+                        logging.info("opened yaml" + str(item))
+                        yaml_item = list(yaml.safe_load_all(f))[0]
+
+                if yaml_item['kind'] == "ChaosEngine":
+                    engine_name = yaml_item['metadata']['name']
+                    namespace = yaml_item['metadata']['namespace']
+                    litmus_namespaces.append(namespace)
+                    experiment_names = yaml_item['spec']['experiments']
+                    for expr in experiment_names:
+                        expr_name = expr['name']
+                        experiment_result = common_litmus.check_experiment(engine_name,
+                                                                           expr_name,
+                                                                           namespace)
+                        if experiment_result:
+                            logging.info("Scenario: %s has been successfully injected!"
+                                         % item)
+                        else:
+                            logging.info("Scenario: %s was not successfully injected!"
+                                         % item)
+                            if litmus_uninstall:
+                                for l_item in l_scenario:
+                                    logging.info('item ' + str(l_item))
+                                    runcommand.invoke("kubectl delete -f %s" % l_item)
+            if litmus_uninstall:
+                for item in l_scenario:
+                    logging.info('item ' + str(item))
+                    runcommand.invoke("kubectl delete -f %s" % item)
+            cerberus_integration(config)
+            logging.info("Waiting for the specified duration: %s" % wait_duration)
+            time.sleep(wait_duration)
+        except Exception as e:
+            logging.error("Failed to run litmus scenario: %s. Encountered "
+                          "the following exception: %s" % (item, e))
+    return litmus_namespaces
+
+
 # Main function
 def main(cfg):
     # Start kraken
@@ -223,6 +272,8 @@ def main(cfg):
         global kubeconfig_path, wait_duration
         kubeconfig_path = config["kraken"].get("kubeconfig_path", "")
         chaos_scenarios = config["kraken"].get("chaos_scenarios", [])
+        litmus_version = config['kraken'].get("litmus_version", 'v1.9.1')
+        litmus_uninstall = config['kraken'].get("litmus_uninstall", False)
         wait_duration = config["tunings"].get("wait_duration", 60)
         iterations = config["tunings"].get("iterations", 1)
         daemon_mode = config["tunings"].get("daemon_mode", False)
@@ -258,6 +309,8 @@ def main(cfg):
             iterations = int(iterations)
 
         failed_post_scenarios = []
+        litmus_namespaces = []
+        litmus_installed = False
         # Loop to run the chaos starts here
         while (int(iteration) < iterations):
             # Inject chaos scenarios specified in the config
@@ -279,9 +332,23 @@ def main(cfg):
                         # Inject time skew chaos scenarios specified in the config
                         elif scenario_type == "time_scenarios":
                             time_scenarios(scenarios_list, config)
+                        elif scenario_type == "litmus_scenarios":
+                            if not litmus_installed:
+                                common_litmus.install_litmus(litmus_version)
+                                common_litmus.deploy_all_experiments(litmus_version)
+                                litmus_installed = True
+                            litmus_namespaces = litmus_scenarios(scenarios_list, config,
+                                                                 litmus_namespaces,
+                                                                 litmus_uninstall)
 
             iteration += 1
             logging.info("")
+        if litmus_uninstall and litmus_installed:
+            for namespace in litmus_namespaces:
+                common_litmus.delete_chaos(namespace)
+            common_litmus.delete_experiments()
+            common_litmus.uninstall_litmus(litmus_version)
+
         if failed_post_scenarios:
             logging.error("Post scenarios are still failing at the end of all iterations")
             sys.exit(1)

diff --git a/scenarios/node_hog_engine.yaml b/scenarios/node_hog_engine.yaml
@@ -0,0 +1,26 @@
+apiVersion: litmuschaos.io/v1alpha1
+kind: ChaosEngine
+metadata:
+  name: nginx-chaos
+  namespace: default
+spec:
+  # It can be true/false
+  annotationCheck: 'false'
+  # It can be active/stop
+  engineState: 'active'
+  chaosServiceAccount: node-cpu-hog-sa
+  monitoring: false
+  # It can be delete/retain
+  jobCleanUpPolicy: 'delete'
+  experiments:
+    - name: node-cpu-hog
+      spec:
+        components:
+          env:
+            # set chaos duration (in sec) as desired
+            - name: TOTAL_CHAOS_DURATION
+              value: '60'
+
+             # ENTER THE NAME OF THE APPLICATION NODE
+            - name: APP_NODE
+              value: 'ip-10-0-146-53.us-east-2.compute.internal'
diff --git a/scenarios/node_hog_rbac.yaml b/scenarios/node_hog_rbac.yaml
@@ -0,0 +1,37 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: node-cpu-hog-sa
+  namespace: default
+  labels:
+    name: node-cpu-hog-sa
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: node-cpu-hog-sa
+  labels:
+    name: node-cpu-hog-sa
+rules:
+- apiGroups: ["","litmuschaos.io","batch","apps"]
+  resources: ["pods","jobs","events","chaosengines","pods/log","chaosexperiments","chaosresults"]
+  verbs: ["create","list","get","patch","update","delete"]
+- apiGroups: [""]
+  resources: ["nodes"]
+  verbs: ["get","list"]
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: node-cpu-hog-sa
+  labels:
+    name: node-cpu-hog-sa
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: node-cpu-hog-sa
+subjects:
+- kind: ServiceAccount
+  name: node-cpu-hog-sa
+  namespace: default