Skip to content

Commit

Permalink
Added cluster shut down scenario
Browse files Browse the repository at this point in the history
This commit adds a scenario to shut down all the nodes including
the masters and restarts them after a specified duration.
  • Loading branch information
yashashreesuresh committed Aug 28, 2020
1 parent 31f06b8 commit a7cb346
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 1 deletion.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ kraken:
- scenarios/openshift-apiserver.yml
node_scenarios: # List of chaos node scenarios to load
- scenarios/node_scenarios_example.yml
cluster_shut_down_scenario:
- scenarios/cluster_shut_down_scenario.yml
tunings:
wait_duration: 60 # Duration to wait between each chaos scenario
Expand Down Expand Up @@ -102,6 +104,9 @@ node_scenarios:
cloud_type: aws
```

#### Kubernetes/OpenShift cluster shut down scenario
Scenario to shut down all the nodes, including the masters, and restart them after a specified duration. The cluster shut down scenario can be injected by listing its config file under the cluster_shut_down_scenario option in the kraken config. Refer to the [cluster_shut_down_scenario](https://github.com/openshift-scale/kraken/blob/master/scenarios/cluster_shut_down_scenario.yml) config file for an example.

#### Pod chaos scenarios
Following are the components of Kubernetes/OpenShift for which a basic chaos scenario config exists today. Adding a new pod based scenario is as simple as adding a new config under scenarios directory and defining it in the config.

Expand Down
2 changes: 2 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ kraken:
- scenarios/post_action_regex.py
node_scenarios: # List of chaos node scenarios to load
- scenarios/node_scenarios_example.yml
cluster_shut_down_scenario:
- scenarios/cluster_shut_down_scenario.yml

cerberus:
cerberus_enabled: False # Enable it when cerberus is previously installed
Expand Down
54 changes: 53 additions & 1 deletion run_kraken.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import kraken.kubernetes.client as kubecli
import kraken.invoke.command as runcommand
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
from kraken.node_actions.aws_node_scenarios import AWS, aws_node_scenarios


# Get the node scenarios object of the specified cloud type
Expand Down Expand Up @@ -152,6 +152,47 @@ def post_actions(kubeconfig_path, scenario, failed_post_scenarios, pre_action_ou
return failed_post_scenarios


# Inject the cluster shut down scenario
def cluster_shut_down(shut_down_config, config):
    """Stop every node of the cluster, wait, then start the nodes again.

    Args:
        shut_down_config: Mapping providing the keys ``runs`` (number of
            times to repeat the scenario), ``shut_down_duration`` (seconds
            to keep the cluster down) and ``cloud_type`` (only ``"aws"`` is
            handled here).
        config: Kraken configuration; it is passed unchanged to
            ``cerberus_integration`` after each run.

    Raises:
        ValueError: If ``cloud_type`` is anything other than ``"aws"``.
    """
    runs = shut_down_config["runs"]
    shut_down_duration = shut_down_config["shut_down_duration"]
    cloud_type = shut_down_config["cloud_type"]

    # Fail fast with a clear message, before any node is touched, instead
    # of hitting an unbound ``cloud_object`` further down.
    if cloud_type != "aws":
        message = (
            "Cloud type %s is not supported for the cluster_shut_down scenario"
            % cloud_type
        )
        logging.error(message)
        raise ValueError(message)
    cloud_object = AWS()

    # Resolve the instance ids while the cluster is still reachable; the
    # node list cannot be queried once every node has been stopped.
    nodes = set(kubecli.list_nodes())
    node_id = {node: cloud_object.get_instance_id(node) for node in nodes}

    for _ in range(runs):
        logging.info("Starting cluster_shut_down scenario injection")
        for node in nodes:
            cloud_object.stop_instances(node_id[node])
        logging.info("Waiting for 250s to shut down all the nodes")
        time.sleep(250)
        logging.info(
            "Shutting down the cluster for the specified duration: %s",
            shut_down_duration,
        )
        time.sleep(shut_down_duration)
        logging.info("Restarting the nodes")

        # Keep retrying until every node has been started. A failed start is
        # logged (not silently dropped) and retried on the next pass.
        stopped_nodes = set(nodes)
        while stopped_nodes:
            for node in list(stopped_nodes):
                try:
                    cloud_object.start_instances(node_id[node])
                except Exception as exc:
                    logging.warning(
                        "Failed to start node %s, retrying: %s", node, exc
                    )
                    time.sleep(10)
                else:
                    stopped_nodes.discard(node)

        logging.info("Waiting for 250s to allow cluster component initilization")
        time.sleep(250)
        logging.info("Successfully injected cluster_shut_down scenario!")
        cerberus_integration(config)
        logging.info("")


# Main function
def main(cfg):
# Start kraken
Expand All @@ -165,6 +206,7 @@ def main(cfg):
kubeconfig_path = config["kraken"].get("kubeconfig_path", "")
scenarios = config["kraken"].get("scenarios", [])
node_scenarios = config["kraken"].get("node_scenarios", [])
cluster_shut_down_scenario = config["kraken"].get("cluster_shut_down_scenario", [])
wait_duration = config["tunings"].get("wait_duration", 60)
iterations = config["tunings"].get("iterations", 1)
daemon_mode = config["tunings"].get("daemon_mode", False)
Expand Down Expand Up @@ -242,6 +284,16 @@ def main(cfg):
cerberus_integration(config)
logging.info("")

# Inject cluster shut down scenario specified in the config
if cluster_shut_down_scenario:
for shut_down_config in cluster_shut_down_scenario:
with open(shut_down_config, 'r') as f:
shut_down_config = yaml.full_load(f)
shut_down_config = shut_down_config["cluster_shut_down_scenario"]
cluster_shut_down(shut_down_config, config)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)

iteration += 1
logging.info("")
if failed_post_scenarios:
Expand Down
4 changes: 4 additions & 0 deletions scenarios/cluster_shut_down_scenario.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
cluster_shut_down_scenario: # Scenario to stop all the nodes for specified duration and restart the nodes
runs: 1 # Number of times to execute the cluster_shut_down scenario
shut_down_duration: 120 # duration in seconds to shut down the cluster
cloud_type: aws # cloud type on which Kubernetes/OpenShift runs

0 comments on commit a7cb346

Please sign in to comment.