diff --git a/README.md b/README.md index c6cabb45..88cf4155 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,19 @@ See the [getting started doc](docs/getting_started.md) on support on how to get After installation, refer back to the below sections for supported scenarios and how to tweak the kraken config to load them on your cluster +### Setting up infrastructure dependencies +Kraken indexes the metrics specified in the profile into Elasticsearch in addition to leveraging Cerberus for understanding the health of the kubernetes/OpenShift cluster under test. More information on the features is documented below. The infrastructure pieces can be easily installed and uninstalled by running: + +``` +$ cd kraken +$ podman-compose up or $ docker-compose up # Spins up the containers specified in the docker-compose.yml file present in the run directory +$ podman-compose down or $ docker-compose down # Delete the containers installed +``` +This will manage the Cerberus and Elasticsearch containers on the host on which you are running Kraken. + +**NOTE**: Make sure to have enough resources ( memory and disk ) on the machine on top of which the containers are running as Elasticsearch is resource intensive. Cerberus monitors the system components by default, the [config](config/cerberus.yaml) can be tweaked to add application namespaces, routes and other components to monitor as well. Also the command will keep running until killed as detached mode is not supported as of now. + + ### Config Instructions on how to setup the config and the options supported can be found at [Config](docs/config.md). 
@@ -31,10 +44,12 @@ Kraken supports pod, node, time/date and [litmus](https://github.com/litmuschaos - [Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md) + ### Kraken scenario pass/fail criteria and report It's important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by: - Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks. - Leveraging [Cerberus](https://github.com/openshift-scale/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail. It is highly recommended to turn on the Cerberus health check feature avaliable in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. +- Leveraging [kube-burner](docs/alerts.md) alerting feature to fail the runs in case of critical alerts. 
### Performance monitoring diff --git a/config/cerberus.yaml b/config/cerberus.yaml new file mode 100644 index 00000000..0c3a4b69 --- /dev/null +++ b/config/cerberus.yaml @@ -0,0 +1,54 @@ +cerberus: + distribution: openshift # Distribution can be kubernetes or openshift + kubeconfig_path: ~/.kube/config # Path to kubeconfig + port: 8080 # http server port where cerberus status is published + watch_nodes: True # Set to True for the cerberus to monitor the cluster nodes + watch_cluster_operators: True # Set to True for cerberus to monitor cluster operators + watch_url_routes: # Route url's you want to monitor, this is a double array with the url and optional authorization parameter + watch_master_schedulable: # When enabled checks for the schedulable master nodes with given label. + enabled: True + label: node-role.kubernetes.io/master + watch_namespaces: # List of namespaces to be monitored + - openshift-etcd + - openshift-apiserver + - openshift-kube-apiserver + - openshift-monitoring + - openshift-kube-controller-manager + - openshift-machine-api + - openshift-kube-scheduler + - openshift-ingress + - openshift-sdn # When enabled, it will check for the cluster sdn and monitor that namespace + cerberus_publish_status: True # When enabled, cerberus starts a light weight http server and publishes the status + inspect_components: False # Enable it only when OpenShift client is supported to run + # When enabled, cerberus collects logs, events and metrics of failed components + + prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. + prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. + # This enables Cerberus to query prometheus and alert on observing high Kube API Server latencies. 
+ + slack_integration: False # When enabled, cerberus reports the failed iterations in the slack channel + # The following env vars need to be set: SLACK_API_TOKEN ( Bot User OAuth Access Token ) and SLACK_CHANNEL ( channel to send notifications in case of failures ) + # When slack_integration is enabled, a watcher can be assigned for each day. The watcher of the day is tagged while reporting failures in the slack channel. Values are slack member IDs. + watcher_slack_ID: # (NOTE: Defining the watcher IDs is optional and when the watcher slack IDs are not defined, the slack_team_alias tag is used if it is set else no tag is used while reporting failures in the slack channel.) + Monday: + Tuesday: + Wednesday: + Thursday: + Friday: + Saturday: + Sunday: + slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned + + custom_checks: # Relative paths of files containing additional user defined checks + +tunings: + timeout: 60 # Number of seconds before requests fail + iterations: 5 # Iterations to loop before stopping the watch, it will be replaced with infinity when the daemon mode is enabled + sleep_time: 5 # Sleep duration between each iteration + kube_api_request_chunk_size: 250 # Large requests will be broken into the specified chunk size to reduce the load on API server and improve responsiveness. 
+ daemon_mode: True # Iterations are set to infinity which means that the cerberus will monitor the resources forever + cores_usage_percentage: 0.5 # Set the fraction of cores to be used for multiprocessing + +database: + database_path: /tmp/cerberus.db # Path where cerberus database needs to be stored + reuse_database: False # When enabled, the database is reused to store the failures diff --git a/config/config_performance.yaml b/config/config_performance.yaml new file mode 100644 index 00000000..bd2bca31 --- /dev/null +++ b/config/config_performance.yaml @@ -0,0 +1,45 @@ +kraken: + distribution: openshift # Distribution can be kubernetes or openshift + kubeconfig_path: /root/.kube/config # Path to kubeconfig + exit_on_failure: False # Exit when a post action scenario fails + litmus_version: v1.10.0 # Litmus version to install + litmus_uninstall: False # If you want to uninstall litmus if failure + chaos_scenarios: # List of policies/chaos scenarios to load + - pod_scenarios: # List of chaos pod scenarios to load + - - scenarios/etcd.yml + - - scenarios/regex_openshift_pod_kill.yml + - scenarios/post_action_regex.py + - node_scenarios: # List of chaos node scenarios to load + - scenarios/node_scenarios_example.yml + - pod_scenarios: + - - scenarios/openshift-apiserver.yml + - - scenarios/openshift-kube-apiserver.yml + - time_scenarios: # List of chaos time scenarios to load + - scenarios/time_scenarios_example.yml + - litmus_scenarios: # List of litmus scenarios to load + - - https://hub.litmuschaos.io/api/chaos/1.10.0?file=charts/generic/node-cpu-hog/rbac.yaml + - scenarios/node_hog_engine.yaml + - cluster_shut_down_scenarios: + - - scenarios/cluster_shut_down_scenario.yml + - scenarios/post_action_shut_down.py +cerberus: + cerberus_enabled: True # Enable it when cerberus is previously installed + cerberus_url: http://0.0.0.0:8080 # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal + +performance_monitoring: + 
deploy_dashboards: True # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift + repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" + kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" + capture_metrics: True + config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config + metrics_profile_path: config/metrics-aggregated.yaml + prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. + prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. + uuid: # uuid for the run is generated by default if not set + enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error + alert_profile: config/alerts # Path to alert profile with the prometheus queries + +tunings: + wait_duration: 60 # Duration to wait between each chaos scenario + iterations: 1 # Number of times to execute the scenarios + daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever diff --git a/config/kube_burner.yaml b/config/kube_burner.yaml index 6a02ef2d..dbea38d9 100644 --- a/config/kube_burner.yaml +++ b/config/kube_burner.yaml @@ -5,11 +5,11 @@ global: metricsDirectory: collected-metrics measurements: - name: podLatency - esIndex: kube-burner + esIndex: kraken indexerConfig: enabled: true - esServers: [https://elastic.example.com:9200] + esServers: [http://0.0.0.0:9200] # Please change this to the respective Elasticsearch in use if you haven't run the podman-compose command to setup the infrastructure containers insecureSkipVerify: true defaultIndex: kraken type: elastic diff --git 
a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..66c928b2 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,35 @@ +version: "3" +services: + elastic: + image: docker.elastic.co/elasticsearch/elasticsearch:7.13.2 + deploy: + replicas: 1 + restart_policy: + condition: on-failure + ports: + - "9200:9200" + - "9300:9300" + environment: + discovery.type: single-node + kibana: + image: docker.elastic.co/kibana/kibana:7.13.2 + deploy: + replicas: 1 + restart_policy: + condition: on-failure + ports: + - "5601:5601" + environment: + ELASTICSEARCH_HOSTS: "http://elastic:9200" # Reach Elasticsearch through its compose service name; 0.0.0.0 would point at the kibana container itself + cerberus: + image: quay.io/openshift-scale/cerberus:latest + privileged: true + deploy: + replicas: 1 + restart_policy: + condition: on-failure + ports: + - "8080:8080" + volumes: + - ./config/cerberus.yaml:/root/cerberus/config/cerberus.yaml:Z # Modify the config in case of the need to monitor additional components + - /root/.kube/config:/root/.kube/config:Z diff --git a/docs/config.md b/docs/config.md index 0cff093c..3bceabf2 100644 --- a/docs/config.md +++ b/docs/config.md @@ -31,3 +31,5 @@ tunings: iterations: 1 # Number of times to execute the scenarios daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever ``` + +**NOTE**: [config](https://github.com/cloud-bulldozer/kraken/tree/master/config/config_performance.yaml) can be used if leveraging the automated way to install the infrastructure pieces. diff --git a/requirements.txt b/requirements.txt index a97d8f55..502ca1e9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,5 @@ paramiko setuptools openshift-client python-ipmi +podman-compose +docker-compose