Added support for ingress traffic shaping #299

Merged · 9 commits · Sep 2, 2022
1 change: 0 additions & 1 deletion README.md
@@ -106,7 +106,6 @@ In addition to checking the recovery and health of the cluster and components un
### Roadmap
Following is a list of enhancements that we are planning to add to Kraken. Of course, any help/contributions are greatly appreciated.
- [Ability to visualize the metrics that are being captured by Kraken and stored in Elasticsearch](https://github.com/chaos-kubox/krkn/issues/124)
- Ability to shape the ingress network similar to how Kraken supports [egress traffic shaping](https://github.com/chaos-kubox/krkn/blob/main/docs/network_chaos.md) today.
- Continue to improve [Chaos Testing Guide](https://cloud-bulldozer.github.io/kraken/) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well as the applications running on top of it, are resilient and performant under chaotic conditions.
- Support for running Kraken on Kubernetes distribution - see https://github.com/chaos-kubox/krkn/issues/185, https://github.com/chaos-kubox/krkn/issues/186
- Sweet logo for Kraken - see https://github.com/chaos-kubox/krkn/issues/195
1 change: 1 addition & 0 deletions config/config.yaml
@@ -16,6 +16,7 @@ kraken:
- scenarios/openshift/etcd.yml
- scenarios/openshift/regex_openshift_pod_kill.yml
- scenarios/openshift/vmware_node_scenarios.yml
- scenarios/openshift/network_chaos_ingress.yml
- node_scenarios: # List of chaos node scenarios to load
- scenarios/openshift/node_scenarios_example.yml
- plugin_scenarios:
25 changes: 24 additions & 1 deletion docs/network_chaos.md
@@ -1,7 +1,7 @@
### Network chaos
Scenario to introduce network latency, packet loss, and bandwidth restriction in the Node's host network interface. The purpose of this scenario is to observe faults caused by random variations in the network.

##### Sample scenario config
##### Sample scenario config for egress traffic shaping
```
network_chaos: # Scenario to create an outage by simulating random variations in the network.
duration: 300 # In seconds - duration network chaos will be applied.
@@ -17,6 +17,29 @@ network_chaos: # Scenario to create an outage
bandwidth: 100mbit
```

##### Sample scenario config for ingress traffic shaping (using a plugin)
```
- id: network_chaos
config:
node_interface_name: # Dictionary with key as node name(s) and value as a list of its interfaces to test
ip-10-0-128-153.us-west-2.compute.internal:
- ens5
- genev_sys_6081
    label_selector: node-role.kubernetes.io/master # When node_interface_name is not specified, nodes with a matching label_selector are selected for the node chaos scenario injection
instance_count: 1 # Number of nodes to perform action/select that match the label selector
kubeconfig_path: /root/.kube/config # Path to kubernetes config file. If not specified, it defaults to ~/.kube/config
    execution_type: parallel # Execute all of the ingress options as a single scenario (parallel) or each as a separate scenario (serial).
network_params:
latency: 50ms
loss: '0.02'
bandwidth: 100mbit
wait_duration: 120
test_duration: 60
```

Note: For ingress traffic shaping, ensure that your node doesn't have any [IFB](https://wiki.linuxfoundation.org/networking/ifb) interfaces already present. The scenario relies on creating IFBs to do the shaping, and they are deleted at the end of the scenario.
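
To illustrate the underlying technique: ingress shaping with IFB works by redirecting inbound packets from the test interface to an IFB device and then applying the usual egress qdiscs (netem) on that device. Below is a minimal sketch of that approach driven from Python; the helper names, the `ifb0` device, and the exact `tc`/`ip` invocations are illustrative assumptions and may not match the plugin's actual implementation.

```
import subprocess


def shape_ingress(interface, latency="50ms", loss="0.02", bandwidth="100mbit"):
    """Illustrative only: redirect ingress of `interface` to ifb0 and shape it there."""
    commands = [
        "modprobe ifb numifbs=1",                              # load the IFB module (creates ifb0)
        "ip link set dev ifb0 up",                             # bring the IFB device up
        f"tc qdisc add dev {interface} handle ffff: ingress",  # attach an ingress qdisc to the real interface
        # redirect all inbound IP traffic on the interface to ifb0
        f"tc filter add dev {interface} parent ffff: protocol ip u32 match u32 0 0 "
        "action mirred egress redirect dev ifb0",
        # shape the redirected traffic on ifb0's egress path
        f"tc qdisc add dev ifb0 root netem delay {latency} loss {loss}% rate {bandwidth}",
    ]
    for cmd in commands:
        subprocess.run(cmd, shell=True, check=True)


def cleanup_ingress(interface):
    """Illustrative only: remove the ingress qdisc and tear down ifb0."""
    subprocess.run(f"tc qdisc del dev {interface} ingress", shell=True, check=True)
    subprocess.run("ip link delete ifb0", shell=True, check=True)
```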


##### Steps
- Pick the nodes on which to introduce the network anomaly, either from node_name or label_selector.
- Verify the interface list on one of the nodes, or use the interface with a default route as the test interface if no interface is specified by the user.
7 changes: 7 additions & 0 deletions kraken/plugins/__init__.py
@@ -8,6 +8,7 @@
import kraken.plugins.vmware.vmware_plugin as vmware_plugin
from kraken.plugins.pod_plugin import kill_pods, wait_for_pods
from kraken.plugins.run_python_plugin import run_python_file
from kraken.plugins.network.ingress_shaping import network_chaos


@dataclasses.dataclass
@@ -177,6 +178,12 @@ def json_schema(self):
[
"error"
]
),
PluginStep(
network_chaos,
[
"error"
]
)
]
)
141 changes: 141 additions & 0 deletions kraken/plugins/network/cerberus.py
@@ -0,0 +1,141 @@
import logging
import requests
import sys
import json


def get_status(config, start_time, end_time):
"""
Function to get Cerberus status

Args:
config
- Kraken config dictionary

start_time
- The time when chaos is injected

end_time
- The time when chaos is removed

Returns:
Cerberus status
"""

cerberus_status = True
check_application_routes = False
application_routes_status = True
if config["cerberus"]["cerberus_enabled"]:
cerberus_url = config["cerberus"]["cerberus_url"]
check_application_routes = config["cerberus"]["check_applicaton_routes"]
if not cerberus_url:
logging.error("url where Cerberus publishes True/False signal is not provided.")
sys.exit(1)
cerberus_status = requests.get(cerberus_url, timeout=60).content
cerberus_status = True if cerberus_status == b"True" else False

# Fail if the application routes monitored by cerberus experience downtime during the chaos
if check_application_routes:
application_routes_status, unavailable_routes = application_status(cerberus_url, start_time, end_time)
if not application_routes_status:
logging.error(
"Application routes: %s monitored by cerberus encountered downtime during the run, failing"
% unavailable_routes
)
else:
logging.info("Application routes being monitored didn't encounter any downtime during the run!")

if not cerberus_status:
logging.error(
"Received a no-go signal from Cerberus, looks like "
"the cluster is unhealthy. Please check the Cerberus "
"report for more details. Test failed."
)

if not application_routes_status or not cerberus_status:
sys.exit(1)
else:
logging.info("Received a go signal from Ceberus, the cluster is healthy. " "Test passed.")
return cerberus_status


def publish_kraken_status(config, failed_post_scenarios, start_time, end_time):
"""
Function to publish Kraken status to Cerberus

Args:
config
- Kraken config dictionary

failed_post_scenarios
- String containing the failed post scenarios

start_time
- The time when chaos is injected

end_time
- The time when chaos is removed
"""

cerberus_status = get_status(config, start_time, end_time)
if not cerberus_status:
if failed_post_scenarios:
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is not healthy and post action scenarios " "are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info("Cerberus status is not healthy and post action scenarios " "are still failing")
else:
if failed_post_scenarios:
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is healthy but post action scenarios " "are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info("Cerberus status is healthy but post action scenarios " "are still failing")


def application_status(cerberus_url, start_time, end_time):
"""
Function to check application availability

Args:
cerberus_url
- url where Cerberus publishes True/False signal

start_time
- The time when chaos is injected

end_time
- The time when chaos is removed

Returns:
Application status and failed routes
"""

if not cerberus_url:
logging.error("url where Cerberus publishes True/False signal is not provided.")
sys.exit(1)
else:
duration = (end_time - start_time) / 60
url = cerberus_url + "/" + "history" + "?" + "loopback=" + str(duration)
logging.info("Scraping the metrics for the test duration from cerberus url: %s" % url)
try:
failed_routes = []
status = True
metrics = requests.get(url, timeout=60).content
metrics_json = json.loads(metrics)
for entry in metrics_json["history"]["failures"]:
if entry["component"] == "route":
name = entry["name"]
failed_routes.append(name)
status = False
else:
continue
except Exception as e:
logging.error("Failed to scrape metrics from cerberus API at %s: %s" % (url, e))
sys.exit(1)
return status, set(failed_routes)
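
For context on how these helpers are typically driven: a scenario records timestamps around the chaos window and passes them, together with the parsed Kraken config, to `publish_kraken_status`. The sketch below is illustrative only; the config values and surrounding scenario code are assumptions, not taken from this PR.

```
import time

from kraken.plugins.network import cerberus

# Illustrative config fragment; in Kraken this comes from the parsed config.yaml.
config = {
    "cerberus": {
        "cerberus_enabled": True,
        "cerberus_url": "http://0.0.0.0:8080",
        "check_applicaton_routes": False,  # key spelling matches what get_status() reads
    },
    "kraken": {"exit_on_failure": False},
}

start_time = int(time.time())  # chaos injected
# ... run the ingress shaping scenario here ...
end_time = int(time.time())    # chaos removed

# Exits the process if Cerberus reports a no-go signal or monitored routes saw downtime.
cerberus.publish_kraken_status(config, failed_post_scenarios=None, start_time=start_time, end_time=end_time)
```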