From 5755e2869e28e59b3b15c1d9fcb1bf8b6fc9dbf9 Mon Sep 17 00:00:00 2001 From: Karthik Satchitanand Date: Wed, 6 Nov 2019 14:23:41 +0530 Subject: [PATCH 01/21] (chore)cep: add cep template (#917) Signed-off-by: ksatchit --- ceps/0001-cep-template.md | 173 ++++++++++++++++++++++++++++++++++++++ ceps/README.md | 50 +++++++++++ 2 files changed, 223 insertions(+) create mode 100644 ceps/0001-cep-template.md create mode 100644 ceps/README.md diff --git a/ceps/0001-cep-template.md b/ceps/0001-cep-template.md new file mode 100644 index 00000000000..6b10588fb1e --- /dev/null +++ b/ceps/0001-cep-template.md @@ -0,0 +1,173 @@ +--- +cep-number: 0 +title: My CEP +authors: +- "@ksatchit" +owners: + +- TBD + +- "@ksatchit" +editor: TBD +creation-date: yyyy-mm-dd +last-updated: yyyy-mm-dd +status: provisional/implementable/implemented/deferred/rejected/withdrawn/replaced +see-also: + +- CEP-1 + +- CEP-2 + +replaces: + +- CEP-3 + +superseded-by: + +- CEP-100 +--- + +# Title + +This is the title of the Chaos Enhancement Proposal (CEP). +Keep it simple and descriptive. +A good title can help communicate what the CEP is and should be considered as part of any review. + +The title should be lowercased and spaces/punctuation should be replaced with `-`. + +To get started with this template: +1. **Make a copy of this template.** + Name it `YYYYMMDD-my-title.md`. +1. **Fill out the "overview" sections.** + This includes the Summary and Motivation sections. +1. **Create a PR.** + Name it `[CEP NUMBER] Title`, e.g. `[CEP 20191014] Initial work on Chaos Operator`. + Assign it to owner(s) that are working on these features. +1. **Merge early.** + Avoid getting hung up on specific details and instead aim to get the goal of the CEP merged quickly. + The best way to do this is to just start with the "Overview" sections and fill out details incrementally in follow on PRs. + View anything marked as a `provisional` as a working document and subject to change. + Aim for single topic PRs to keep discussions focused. + If you disagree with what is already in a document, open a new PR with suggested changes. + +The canonical place for the latest set of instructions (and the likely source of this file) is [here](/ceps/0001-cep-template.md). + +The `Metadata` section above is intended to support the creation of tooling around the CEP process. + +## Table of Contents + +A table of contents is helpful for quickly jumping to sections of a CEP and for highlighting any additional information provided beyond the standard CEP template. +[Tools for generating](https://github.com/ekalinin/github-markdown-toc) a table of contents from markdown are available. 
+ +- [Table of Contents](#table-of-contents) + +- [Summary](#summary) + +- [Motivation](#motivation) + + - [Goals](#goals) + - [Non-Goals](#non-goals) + +- [Proposal](#proposal) + + - [User Stories](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Implementation Details/Notes/Constraints](#implementation-detailsnotesconstraints) + - [Risks and Mitigations](#risks-and-mitigations) + +- [Graduation Criteria](#graduation-criteria) + +- [Implementation History](#implementation-history) + +- [Drawbacks](#drawbacks) + +- [Alternatives](#alternatives) + +- [Infrastructure Needed [optional]](#infrastructure-needed) + +## Summary + +The `Summary` section is incredibly important for producing high quality user focused documentation such as release notes +or a development road map.It should be possible to collect this information before implementation begins in order to avoid +requiring implementors to split their attention between writing release notes and implementing the feature itself. +CEP editors should help to ensure that the tone and content of the `Summary` section is useful for a wide audience. + +A good summary is probably at least a paragraph in length. + +## Motivation + +This section is for explicitly listing the motivation, goals and non-goals of this CEP. +Describe why the change is important and the benefits to users. +The motivation section can optionally provide links to [experience reports](https://github.com/golang/go/wiki/ExperienceReports) to demonstrate the interest in a CEP +within the wider Litmus community. + +### Goals + +List the specific goals of the CEP. +How will we know that this has succeeded? + +### Non-Goals + +What is out of scope for his CEP? +Listing non-goals helps to focus discussion and make progress. + +## Proposal + +This is where we get down to the nitty gritty of what the proposal actually is. + +### User Stories (optional) + +Detail the things that people will be able to do if this CEP is implemented. +Include as much detail as possible so that people can understand the "how" of the system. +The goal here is to make this feel real for users without getting bogged down. + +#### Story 1 + +#### Story 2 + +### Implementation Details/Notes/Constraints (optional) + +What are the caveats to the implementation? +What are some important details that didn't come across above. +Go in to as much detail as necessary here. +This might be a good place to talk about core concepts and how they releate. + +### Risks and Mitigations + +What are the risks of this proposal and how do we mitigate. +Think broadly. +For example, consider both security and how this will impact the larger kubernetes ecosystem. + +## Graduation Criteria + +How will we know that this has succeeded? +Gathering user feedback is crucial for building high quality experiences and owners have the important responsibility +of setting milestones for stability and completeness. + +## Implementation History + +Major milestones in the life cycle of a CEP should be tracked in `Implementation History. +Major milestones might include the following. 
+ +- the `Summary` and `Motivation` sections being merged signaling owner acceptance +- the `Proposal` section being merged signaling agreement on a proposed design +- the date implementation started +- the first Litmus release where an initial version of the CEP was available +- the version of Litmus where the CEP graduated to general availability +- when the CEP was retired or superseded + +## Drawbacks (optional) + +Why should this CEP _not_ be implemented. + +## Alternatives (optional) + +Similar to the `Drawbacks` section the `Alternatives` section is used to highlight and record other possible approaches +to delivering the value proposed by a CEP. + +## Infrastructure Needed (optional) + +Use this section if you need things from the project/owner. +Examples include a new subproject, repos requested, github details. +Listing these here allows a owner to get the process for these resources started right away. diff --git a/ceps/README.md b/ceps/README.md new file mode 100644 index 00000000000..1fed4d37ceb --- /dev/null +++ b/ceps/README.md @@ -0,0 +1,50 @@ +# Chaos Enhancement Proposals (CEPs) + +A Chaos Enhancement Proposal (CEP) is a way to propose, communicate and coordinate on new efforts for the LitmusChaos project. +You can read the full details of the project in [CEP-1](0001-chaos-enhancement-proposal-process.md). + +This process is still in _alpha_ state and is mandatory for all major feature beginning release 0.9. + +## Quick start for the CEP process + +- Socialize an idea with the Litmus contributors.Make sure that others think the work is worth taking up and will help review the CEP and any code changes required. +- Follow the process outlined in the [CEP template](YYYYMMDD-cep-template.md) + +## FAQs + +### Do I have to use the CEP process + +No... but we hope that you will. +Over time having a rich set of CEPs in one place will make it easier for people to track what is going in the community +and find a structured historic record. + +CEPs are required when the changes are wide-ranging & are feature-level items. +These changes are usually coordinated through Litmus maintainers. + +### Why would I want to use the CEP process + +Our aim with CEPs is to clearly communicate new efforts to the Litmus Chaos contributor community. +As such, we want to build a well curated set of clear proposals in a common format with useful metadata. + +We are inspired by KEPs, i.e., [Kubernetes Enhancement Proposals](https://github.com/kubernetes/enhancements/tree/master/keps) + +### Do I put my CEP in the root CEP directory or a SIG subdirectory + +If the CEP is mainly restricted to one SIG's purview then it should be in a CEP directory for that SIG. +If the CEP is widely impacting much of Litmus, it should be put at the root of this directory. + +### What will it take for CEPs to "graduate" out of "beta" + +Things we'd like to see happen to consider CEPs well on their way. + +- A set of CEPs that show healthy process around describing an effort and recording decisions in a reasonable amount of time. +- CEPs exposed on a searchable and indexable web site. +- Presubmit checks for CEPs around metadata format and markdown validity. + +Even so, the process can evolve. As we find new techniques we can improve our processes. + +### My FAQ isn't answered here + +The CEP process is still evolving! +If something is missing or not answered here feel free to reach out to [LitmusChaos Community](https://kubernetes.slack.com/messages/CNXNB0ZTN). 
+If you want to propose a change to the CEP process you can open a PR on [CEP-1](0001-cep-template.md) with your proposal. From a6417f7b7f0db26d19d55bf9dd03760ad1e6d901 Mon Sep 17 00:00:00 2001 From: Shubham Chaudhary Date: Thu, 7 Nov 2019 13:07:41 +0530 Subject: [PATCH 02/21] (feat): Add openebs-pool-pod-failure experiment (#907) * (feat): Add openebs-pool-pod-failure experiment Signed-off-by: shubhamchaudhary --- chaoslib/litmus/kill_random_pod.yml | 31 ++-- chaoslib/litmus/pod_failure_by_litmus.yml | 14 +- .../container_kill/container_kill_k8s_job.yml | 2 +- .../pod_delete/pod_delete_ansible_logic.yml | 3 +- .../openebs_pool_kill_ansible_logic.yml | 7 +- .../openebs_pool_kill_k8s_job.yml | 15 +- .../openebs-pool-pod-failure/README.md | 121 ++++++++++++++ ...stor_delete_and_verify_pool_deployment.yml | 61 +++++++ .../cstor_pool_delete.yml | 6 + .../cstor_pool_health_check.yml | 16 ++ .../cstor_verify_pool_provisioning.yml | 28 ++++ .../data_persistence.j2 | 5 + ...openebs_pool_pod_failure_ansible_logic.yml | 157 ++++++++++++++++++ .../openebs_pool_pod_failure_k8s_job.yml | 71 ++++++++ .../apps/openebs/fetch_cvr_count_from_pv.yml | 20 +++ .../openebs/fetch_replica_count_from_sc.yml | 28 ++++ 16 files changed, 557 insertions(+), 28 deletions(-) create mode 100644 experiments/openebs/openebs-pool-pod-failure/README.md create mode 100644 experiments/openebs/openebs-pool-pod-failure/cstor_delete_and_verify_pool_deployment.yml create mode 100644 experiments/openebs/openebs-pool-pod-failure/cstor_pool_delete.yml create mode 100644 experiments/openebs/openebs-pool-pod-failure/cstor_pool_health_check.yml create mode 100644 experiments/openebs/openebs-pool-pod-failure/cstor_verify_pool_provisioning.yml create mode 100644 experiments/openebs/openebs-pool-pod-failure/data_persistence.j2 create mode 100644 experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_ansible_logic.yml create mode 100644 experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml create mode 100644 utils/apps/openebs/fetch_cvr_count_from_pv.yml create mode 100644 utils/apps/openebs/fetch_replica_count_from_sc.yml diff --git a/chaoslib/litmus/kill_random_pod.yml b/chaoslib/litmus/kill_random_pod.yml index 56cf73977e2..4c17a423da9 100644 --- a/chaoslib/litmus/kill_random_pod.yml +++ b/chaoslib/litmus/kill_random_pod.yml @@ -1,21 +1,24 @@ -- name: Get a list of all pods from given namespace - k8s_facts: - kind: Pod - namespace: "{{ a_ns }}" - label_selectors: - - "{{a_label}}" - register: pod_list +- block: + - name: Get a list of all pods from given namespace + k8s_facts: + kind: Pod + namespace: "{{ app_ns }}" + label_selectors: + - "{{ app_label }}" + register: pod_list -- name: Select a random pod to kill - set_fact: - a_pod_to_kill: "{{ pod_list.resources | random | json_query('metadata.name') }}" + - name: Select a random pod to kill + set_fact: + app_pod_name: "{{ pod_list.resources | random | json_query('metadata.name') }}" + + when: app_pod_name is undefined - debug: - msg: "Killing pod {{a_pod_to_kill}}" + msg: "Killing pod {{ app_pod_name }}" - name: Force Kill application pod shell: | - kubectl delete pod -n {{ a_ns }} --force --grace-period=0 --wait=false {{a_pod_to_kill}} + kubectl delete pod -n {{ app_ns }} --force --grace-period=0 --wait=false {{ app_pod_name }} args: executable: /bin/bash register: result @@ -23,7 +26,7 @@ - name: Kill application pod shell: | - kubectl delete pod -n {{ a_ns }} --grace-period=0 --wait=false {{a_pod_to_kill}} + kubectl delete pod -n {{ app_ns 
}} --grace-period=0 --wait=false {{ app_pod_name }} args: executable: /bin/bash register: result @@ -31,4 +34,4 @@ - name: Wait for the interval timer pause: - seconds: "{{c_interval}}" \ No newline at end of file + seconds: "{{ c_interval }}" \ No newline at end of file diff --git a/chaoslib/litmus/pod_failure_by_litmus.yml b/chaoslib/litmus/pod_failure_by_litmus.yml index 95715966d3a..17e6296d597 100644 --- a/chaoslib/litmus/pod_failure_by_litmus.yml +++ b/chaoslib/litmus/pod_failure_by_litmus.yml @@ -1,13 +1,19 @@ - name: Derive chaos iterations set_fact: - chaos_iterations: "{{ (c_duration|int / c_interval|int)|int }}" + c_iterations: "{{ (c_duration|int / c_interval|int)|int }}" + when: c_iterations is undefined + +- name: Derive chaos interval + set_fact: + c_interval: "{{ (c_duration|int / c_iterations|int)|int }}" + when: c_interval is undefined - name: Set min chaos count to 1 if interval > duration set_fact: - chaos_iterations: 1 - when: "chaos_iterations == '0'" + c_iterations: 1 + when: "c_iterations == '0'" - name: Kill random pod include: kill_random_pod.yml - with_sequence: start=1 end={{ chaos_iterations }} + with_sequence: start=1 end={{ c_iterations }} \ No newline at end of file diff --git a/experiments/generic/container_kill/container_kill_k8s_job.yml b/experiments/generic/container_kill/container_kill_k8s_job.yml index 5507fa32bb7..e47ee6fb2d6 100644 --- a/experiments/generic/container_kill/container_kill_k8s_job.yml +++ b/experiments/generic/container_kill/container_kill_k8s_job.yml @@ -41,7 +41,7 @@ spec: # provide application labels - name: APP_LABEL - value: ''>>>>>>> master + value: '' # provide target container - name: TARGET_CONTAINER diff --git a/experiments/generic/pod_delete/pod_delete_ansible_logic.yml b/experiments/generic/pod_delete/pod_delete_ansible_logic.yml index 0ac273739bc..f320bd2eaf0 100644 --- a/experiments/generic/pod_delete/pod_delete_ansible_logic.yml +++ b/experiments/generic/pod_delete/pod_delete_ansible_logic.yml @@ -48,7 +48,8 @@ - include_tasks: "{{ c_util }}" vars: - c_svc_acc: "{{ lookup('env','CHAOS_SERVICE_ACCOUNT') }}" + app_ns: "{{ a_ns }}" + app_label: "{{ a_label }}" ## POST-CHAOS APPLICATION LIVENESS CHECK diff --git a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_ansible_logic.yml b/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_ansible_logic.yml index 2cf5450ce05..32ce62c69ea 100644 --- a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_ansible_logic.yml +++ b/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_ansible_logic.yml @@ -11,6 +11,7 @@ chaos_duration: 600 chaos_iterations: "{{ lookup('env','CHAOS_ITERATIONS') }}" data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + lib_image: "{{ lookup('env','LIB_IMAGE') }}" liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" openebs_ns: "{{ lookup('env','OPENEBS_NS') }}" @@ -92,7 +93,7 @@ vars: status: 'LOAD' ns: "{{ a_ns }}" - app_pod_name: "{{ app_pod_name.stdout }}" + pod_name: "{{ app_pod_name.stdout }}" when: data_persistence != '' ## STORAGE FAULT INJECTION @@ -113,7 +114,7 @@ vars: status: 'VERIFY' ns: "{{ a_ns }}" - app_pod_name: "{{ app_pod_name.stdout }}" + pod_name: "{{ app_pod_name.stdout }}" when: data_persistence != '' - name: Get application pod name @@ -129,7 +130,7 @@ vars: status: 'DELETE' ns: "{{ a_ns }}" - app_pod_name: "{{ new_app_pod.stdout }}" + pod_name: "{{ new_app_pod.stdout }}" when: 
data_persistence != '' # Check application liveness post chaos diff --git a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml b/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml index 2ed4546fe87..e0706cd5cf5 100644 --- a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml +++ b/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: data-persistence-configmap + name: pool-container-kill data: parameters.yml: | @@ -24,11 +24,12 @@ spec: image: litmuschaos/ansible-runner:ci imagePullPolicy: Always env: - - name: OPENEBS_NS - value: openebs - - name: ANSIBLE_STDOUT_CALLBACK value: 'default' + + #provide openebs namespace + - name: OPENEBS_NS + value: 'openebs' # provide application namespace - name: APP_NAMESPACE @@ -42,6 +43,10 @@ spec: - name: APP_PVC value: '' + # provide lib image + - name: LIB_IMAGE + value: 'gaiaadm/pumba:0.4.8' + - name: LIVENESS_APP_LABEL value: '' @@ -66,4 +71,4 @@ spec: volumes: - name: parameters configMap: - name: data-persistence-configmap + name: pool-container-kill diff --git a/experiments/openebs/openebs-pool-pod-failure/README.md b/experiments/openebs/openebs-pool-pod-failure/README.md new file mode 100644 index 00000000000..5df32a2a921 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/README.md @@ -0,0 +1,121 @@ +## Experiment Metadata + + + + + + + + + + + + + + +
| Type | Description | Storage | K8s Platform |
| ---- | ----------- | ------- | ------------ |
| Chaos | Kill the pool pod and check if it gets scheduled again | OPENEBS | Any |
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods are healthy +- Data written prior to chaos is successfully retrieved/read +- Database consistency is maintained as per db integrity check utils +- Storage target pods are healthy + +## Notes + +- Typically used as a disruptive test, to cause loss of access to storage pool by killing it. +- The pool pod should start again and it should be healthy. + +## Associated Utils + +- [cstor_pool_delete.yml](/experiments/openebs/openebs-pool-container-failure/cstor_pool_delete.yml) +- [cstor_pool_health_check.yml](/experiments/openebs/openebs-pool-container-failure/cstor_pool_health_check.yml) +- [cstor_verify_pool_provisioning.yml](/experiments/openebs/openebs-pool-container-failure/cstor_verify_pool_provisioning.yml) +- [cstor_delete_and_verify_pool_deployment.yml](/experiments/openebs/openebs-pool-container-failure/cstor_delete_and_verify_pool_deployment.yml) + +### Procedure + +This scenario validates the behaviour of application and OpenEBS persistent volumes in the amidst of chaos induced on storage pool. The litmus experiment fails the specified pool and thereby losing the access to volumes being created on it. + +After injecting the chaos into the component specified via environmental variable, litmus experiment observes the behaviour of corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env DATA_PERSISTENCE, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. Along with specifying env in the litmus experiment, user needs to pass name for configmap and the data consistency specific parameters required via configmap in the format as follows: + + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validation data consistency in busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + + parameters.yml: | + dbuser: root + dbpassword: k8sDem0 + dbname: tdb + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. + +## Litmusbook Environment Variables + +### Application + + + + + + + + + + + + + + + + + +
| Parameter | Description |
| --------- | ----------- |
| APP_NAMESPACE | Namespace in which application pods are deployed |
| APP_LABEL | Unique labels in `key=value` format of application deployment |
| APP_PVC | Name of persistent volume claim used for app's volume mounts |
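These parameters are injected as environment variables into the experiment job. A minimal sketch of the corresponding `env` entries in the job's container spec (the values shown are placeholders; the actual job manifest added later in this patch leaves them empty for the user to fill in):

```yml
        env:
          # namespace of the application under test (placeholder value)
          - name: APP_NAMESPACE
            value: 'default'
          # label selector of the application deployment (placeholder value)
          - name: APP_LABEL
            value: 'app=percona'
          # PVC backing the application's volume mounts (placeholder value)
          - name: APP_PVC
            value: 'percona-vol-claim'
```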
+ ### Chaos
| Parameter | Description |
| --------- | ----------- |
| CHAOS_ITERATIONS | The number of chaos iterations |
+ ### Health Checks
| Parameter | Description |
| --------- | ----------- |
| LIVENESS_APP_NAMESPACE | Namespace in which external liveness pods are deployed, if any |
| LIVENESS_APP_LABEL | Unique labels in `key=value` format for external liveness pod, if any |
| DATA_PERSISTENCE | Data accessibility & integrity verification post recovery. To check against busybox set value: "busybox" and for percona, set value: "mysql" |
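The DATA_PERSISTENCE parameters described in the Procedure section are passed through a ConfigMap that is mounted into the experiment pod. A minimal sketch for the busybox case, using the `pool-pod-delete` ConfigMap name referenced by the job manifest added later in this patch (the parameter values are the illustrative ones from the Procedure section):

```yml
apiVersion: v1
kind: ConfigMap
metadata:
  # name referenced by the experiment job's configMap volume
  name: pool-pod-delete
data:
  parameters.yml: |
    # data-consistency parameters for the busybox check
    blocksize: 4k
    blockcount: 1024
    testfile: difiletest
```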
\ No newline at end of file diff --git a/experiments/openebs/openebs-pool-pod-failure/cstor_delete_and_verify_pool_deployment.yml b/experiments/openebs/openebs-pool-pod-failure/cstor_delete_and_verify_pool_deployment.yml new file mode 100644 index 00000000000..f1c79df5c0c --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/cstor_delete_and_verify_pool_deployment.yml @@ -0,0 +1,61 @@ +- name: Randomly select the pool deployment from cvr + shell: > + kubectl get cvr -n {{ openebs_ns }} + -l openebs.io/persistent-volume={{ pv.stdout }} --no-headers + -o=jsonpath='{range .items[*]}{.metadata.labels.cstorpool\.openebs\.io\/name}{"\n"}{end}' | + shuf -n1 | awk '{print $1}' + args: + executable: /bin/bash + register: pool_deployment + +- name: Get the resourceVersion of pool deployment + shell: > + kubectl get deployment {{ pool_deployment.stdout }} + -n {{ openebs_ns }} -o=jsonpath='{.metadata.resourceVersion}' + args: + executable: /bin/bash + register: pool_deployment_revisionSource_before + +- name: Get the pod of pool deployment + shell: > + kubectl get pods -n {{ openebs_ns }} | + grep {{ pool_deployment.stdout }} | grep -w "Running" | awk '{print $1}' + args: + executable: /bin/bash + register: cstor_pool_pod + +# including chaoslib kill-random-pod +- name: Delete the cstor pool pod for reschedule + include_tasks: /chaoslib/litmus/kill_random_pod.yml + vars: + app_ns: "{{ openebs_ns }}" + app_pod_name: "{{ cstor_pool_pod.stdout }}" + +- name: Check for pool pod in running state + shell: > + kubectl get pods -n {{ openebs_ns }} | + grep {{ pool_deployment.stdout }} | grep -v {{ cstor_pool_pod.stdout }} | + grep -w "Running" | wc -l + args: + executable: /bin/bash + register: cstor_pool_pod_cnt + until: "cstor_pool_pod_cnt.stdout == \"1\"" + delay: 30 + retries: 10 + +- name: Get resourceVersion after pod delete + shell: > + kubectl get deployment {{ pool_deployment.stdout }} + -n {{ openebs_ns }} -o=jsonpath='{.metadata.resourceVersion}' + args: + executable: /bin/bash + register: pool_deployment_revisionSource_after + +- name: Compare resourceVersions + debug: + msg: + - "Verified pool pods were restarted by fault injection" + - "Before: {{ pool_deployment_revisionSource_before.stdout }}" + - "After: {{ pool_deployment_revisionSource_after.stdout }}" + failed_when: "pool_deployment_revisionSource_before.stdout|int == pool_deployment_revisionSource_after.stdout|int" + diff --git a/experiments/openebs/openebs-pool-pod-failure/cstor_pool_delete.yml b/experiments/openebs/openebs-pool-pod-failure/cstor_pool_delete.yml new file mode 100644 index 00000000000..2941195bcb9 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/cstor_pool_delete.yml @@ -0,0 +1,6 @@ +- include: cstor_verify_pool_provisioning.yml + +- include: cstor_delete_and_verify_pool_deployment.yml + loop: "{{ range(0, c_iterations|int, 1)|list }}" + +- include: cstor_pool_health_check.yml diff --git a/experiments/openebs/openebs-pool-pod-failure/cstor_pool_health_check.yml b/experiments/openebs/openebs-pool-pod-failure/cstor_pool_health_check.yml new file mode 100644 index 00000000000..c3885ca8598 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/cstor_pool_health_check.yml @@ -0,0 +1,16 @@ +- name: Wait (soak) for I/O on pools + wait_for: + timeout: "{{ post_chaos_soak_time }}" + +- name: Fetch the pool pod name from cvr + include_tasks: "/utils/apps/openebs/fetch_podname_from_cvr.yaml" + +- name: Verify logs of pool pods for error strings + shell: > + kubectl logs {{ item }} -n {{ 
openebs_ns }} + -c cstor-pool | egrep '{{ error_messages }}' + args: + executable: /bin/bash + register: result + with_items: "{{ pool_pod_named_list }}" + failed_when: result.rc == 0 diff --git a/experiments/openebs/openebs-pool-pod-failure/cstor_verify_pool_provisioning.yml b/experiments/openebs/openebs-pool-pod-failure/cstor_verify_pool_provisioning.yml new file mode 100644 index 00000000000..4a953408659 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/cstor_verify_pool_provisioning.yml @@ -0,0 +1,28 @@ + +- name: Fetch the replica count from storage class + include_tasks: "/utils/apps/openebs/fetch_replica_count_from_sc.yml" + +- name: Fetch the CVR count from pv + include_tasks: "/utils/apps/openebs/fetch_cvr_count_from_pv.yml" + +- name: Compare ReplicaCount and cvr_count to verify provisioning + debug: + msg: + - "replicacnt: {{ replicacnt }}" + - "cvr_count: {{ cvr_count| int }}" + failed_when: "replicacnt|int != cvr_count|int" + +- name: Get CVR status list from pv + shell: > + kubectl get cvr -n {{ openebs_ns }} + -l openebs.io/persistent-volume={{ pv.stdout }} --no-headers + -o jsonpath='{range .items[*]}{.status.phase}{"\n"}{end}' + args: + executable: /bin/bash + register: cvr_status_phase + +- name: Check status of cvr + command: echo "{{ item }}" + failed_when: "item != \"Offline\" and item != \"Degraded\" and item != \"Rebuilding\" and item != \"Healthy\"" + with_items: + - "{{ cvr_status_phase.stdout_lines }}" diff --git a/experiments/openebs/openebs-pool-pod-failure/data_persistence.j2 b/experiments/openebs/openebs-pool-pod-failure/data_persistence.j2 new file mode 100644 index 00000000000..405497dde21 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/data_persistence.j2 @@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: /utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} \ No newline at end of file diff --git a/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_ansible_logic.yml b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_ansible_logic.yml new file mode 100644 index 00000000000..8fe0d926e2a --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_ansible_logic.yml @@ -0,0 +1,157 @@ +--- +- hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_duration: 600 + c_interval: 5 + c_engine: "{{ lookup('env','CHAOSENGINE') }}" + c_experiment: openebs-pool-pod-failure + c_force: "{{ lookup('env','FORCE') }}" + c_iterations: "{{ lookup('env','CHAOS_ITERATIONS') }}" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + openebs_ns: openebs + pool_debug_msg: 'uncorrectable I/O failure|suspended|ERROR ZFS event' + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - name: Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml + + - include_vars: + file: 
data_persistence.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "/experiments/openebs/openebs-pool-pod-failure/cstor_pool_delete.yml" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_result: "{{ c_engine }}-{{ c_experiment }}" + + when: c_engine != '' + + ## RECORD START-OF-TEST IN LITMUS RESULT CR + + - include_tasks: /utils/runtime/create_testname.yml + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod_name + + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + error_messages: "{{ pool_debug_msg }}" + post_chaos_soak_time : "{{ c_duration }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify AUT liveness post fault-injection + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + - name: Verify application data persistence + include: "{{ data_consistency_util_path }}" + vars: + status: 'VERIFY' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: new_app_pod + + - name: Verify successful database delete + include: "{{ data_consistency_util_path }}" + vars: + status: 'DELETE' + ns: "{{ a_ns }}" + pod_name: "{{ new_app_pod.stdout }}" + when: data_persistence != '' + + # Check application liveness post chaos + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - set_fact: + flag: "Pass" + + rescue: + - set_fact: + flag: "Fail" + + always: + ## RECORD END-OF-TEST IN LITMUS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" diff --git a/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml new file mode 100644 index 00000000000..60edeb84b61 --- /dev/null +++ b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml @@ -0,0 +1,71 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: pool-pod-delete +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: openebs-pool-pod-failure- +spec: + 
template: + metadata: + labels: + name: openebs-pool-pod-failure + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + #provide openebs namespace + - name: OPENEBS_NS + value: 'openebs' + + #provide application namespace + - name: APP_NAMESPACE + value: '' + + #provide application labels + - name: APP_LABEL + value: '' + + #provide application pvc + - name: APP_PVC + value: '' + + - name: FORCE + value: 'true' + + - name: LIVENESS_APP_LABEL + value: '' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + - name: CHAOS_ITERATIONS + value: '2' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: pool-pod-delete diff --git a/utils/apps/openebs/fetch_cvr_count_from_pv.yml b/utils/apps/openebs/fetch_cvr_count_from_pv.yml new file mode 100644 index 00000000000..30b3f49bff7 --- /dev/null +++ b/utils/apps/openebs/fetch_cvr_count_from_pv.yml @@ -0,0 +1,20 @@ +--- +- name: Derive PV from application PVC + shell: > + kubectl get pvc {{ a_pvc }} + -o custom-columns=:spec.volumeName -n {{ a_ns }} + --no-headers + args: + executable: /bin/bash + register: pv + +- name: Get CVR count from pv + shell: > + kubectl get cvr -n {{ openebs_ns }} + -l openebs.io/persistent-volume={{ pv.stdout }} --no-headers | wc -l + args: + executable: /bin/bash + register: cvr_count + +- set_fact: + cvr_count: "{{ cvr_count.stdout }}" diff --git a/utils/apps/openebs/fetch_replica_count_from_sc.yml b/utils/apps/openebs/fetch_replica_count_from_sc.yml new file mode 100644 index 00000000000..c6a1c607c07 --- /dev/null +++ b/utils/apps/openebs/fetch_replica_count_from_sc.yml @@ -0,0 +1,28 @@ +--- +- name: Derive SC from application PVC + shell: > + kubectl get pvc {{ a_pvc }} + -o custom-columns=:spec.storageClassName -n {{ a_ns }} + --no-headers + args: + executable: /bin/bash + register: sc + +- name: Derive ReplicaCount from SC + shell: > + kubectl get sc {{ sc.stdout }} -n {{ openebs_ns }} --no-headers + -o jsonpath="{.metadata.annotations.cas\\.openebs\\.io\/config}" + | grep -A1 "ReplicaCount" | grep -i value | awk '{print $2}' | tr -d '"' + args: + executable: /bin/bash + register: replicacount + +- name: Set default value for replicacount if it is empty + set_fact: + replicacnt: "3" + when: "replicacount.stdout == \"\"" + +- name: Set default value for replicacount if it is non-empty + set_fact: + replicacnt: "{{ replicacount.stdout }}" + when: "replicacount.stdout != \"\"" From 29ee5cb23931f72f6051e08335bc920f0608291a Mon Sep 17 00:00:00 2001 From: Shubham Chaudhary Date: Thu, 7 Nov 2019 13:36:13 +0530 Subject: [PATCH 03/21] (feat): Add openebs-target-network-loss experiment (#913) * (feat): Add openebs-target-network-loss experiment Signed-off-by: shubhamchaudhary --- .../pumba/network_chaos/induce_latency.yml | 2 +- .../network_chaos/induce_packet_loss.yml | 2 +- .../pumba/network_chaos/network_chaos.yml | 34 ++-- .../openebs_pool_kill_k8s_job.yml | 4 +- .../openebs-target-network-loss/README.md | 128 +++++++++++++++ .../openebs-target-network-loss/chaosutil.j2 | 7 + .../cstor_target_network_delay.yaml | 19 +++ .../data_persistence.j2 | 5 + 
.../jiva_controller_network_delay.yaml | 70 ++++++++ ...nebs_target_network_loss_ansible_logic.yml | 153 ++++++++++++++++++ .../openebs_target_network_loss_k8s_job.yml | 78 +++++++++ .../test_prerequisites.yml | 39 +++++ experiments/openebs/openebs_components.yml | 7 + .../apps/openebs/fetch_sc_and_provisioner.yml | 24 +++ 14 files changed, 555 insertions(+), 17 deletions(-) create mode 100644 experiments/openebs/openebs-target-network-loss/README.md create mode 100644 experiments/openebs/openebs-target-network-loss/chaosutil.j2 create mode 100644 experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml create mode 100644 experiments/openebs/openebs-target-network-loss/data_persistence.j2 create mode 100644 experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml create mode 100644 experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_ansible_logic.yml create mode 100644 experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml create mode 100644 experiments/openebs/openebs-target-network-loss/test_prerequisites.yml create mode 100644 utils/apps/openebs/fetch_sc_and_provisioner.yml diff --git a/chaoslib/pumba/network_chaos/induce_latency.yml b/chaoslib/pumba/network_chaos/induce_latency.yml index 8a86a3f2dfd..5498c837eb0 100644 --- a/chaoslib/pumba/network_chaos/induce_latency.yml +++ b/chaoslib/pumba/network_chaos/induce_latency.yml @@ -2,6 +2,6 @@ shell: > kubectl exec {{ pumba_pod.stdout }} -n {{ a_ns }} -- pumba netem --interface {{ n_interface }} --duration {{ c_duration }}ms delay - --time {{ n_latency }} re2:k8s_{{ c_container }}_{{ app_pod.stdout }} + --time {{ n_latency }} re2:k8s_{{ c_container }}_{{ app_pod }} args: executable: /bin/bash diff --git a/chaoslib/pumba/network_chaos/induce_packet_loss.yml b/chaoslib/pumba/network_chaos/induce_packet_loss.yml index 5fa8c979c26..5367ace8ec3 100644 --- a/chaoslib/pumba/network_chaos/induce_packet_loss.yml +++ b/chaoslib/pumba/network_chaos/induce_packet_loss.yml @@ -2,6 +2,6 @@ shell: > kubectl exec {{ pumba_pod.stdout }} -n {{ a_ns }} -- pumba netem --interface {{ n_interface }} --duration {{ c_duration }}ms - loss --percent {{ n_packet_loss }} re2:k8s_{{ c_container }}_{{ app_pod.stdout }} + loss --percent {{ n_packet_loss }} re2:k8s_{{ c_container }}_{{ app_pod }} args: executable: /bin/bash diff --git a/chaoslib/pumba/network_chaos/network_chaos.yml b/chaoslib/pumba/network_chaos/network_chaos.yml index f4f4831322f..f8762025435 100644 --- a/chaoslib/pumba/network_chaos/network_chaos.yml +++ b/chaoslib/pumba/network_chaos/network_chaos.yml @@ -27,18 +27,28 @@ retries: 60 ignore_errors: true - - name: Select the app pod - shell: > - kubectl get pod -l {{ a_label }} -n {{ a_ns }} - -o=custom-columns=:metadata.name --no-headers - | shuf | head -1 - args: - executable: /bin/bash - register: app_pod + - block: + - name: Select the app pod + shell: > + kubectl get pod -l {{ a_label }} -n {{ a_ns }} + -o=custom-columns=:metadata.name --no-headers + | shuf | head -1 + args: + executable: /bin/bash + register: app_pod_name + + - name: Record app pod name + set_fact: + app_pod: "{{ app_pod_name.stdout }}" + when: "app_pod is undefined" + + # here app_ns is the namespace of pod on which we are performing network loss/delay + # in genric experiments app_ns is same as a_ns + # in openebs experiments app_ns is the namespace where openebs is installed i.e, openebs - name: Identify the application node shell: > - kubectl get pod {{ app_pod.stdout }} 
-n {{ a_ns }} + kubectl get pod {{ app_pod }} -n {{ app_ns }} --no-headers -o custom-columns=:spec.nodeName args: executable: /bin/bash @@ -60,7 +70,7 @@ - name: Tear down pumba infrastructure shell: > - kubectl delete -f /chaoslib/pumba/pumba_kube.yaml -n {{ a_ns }} + kubectl delete -f /chaoslib/pumba/pumba_kube.yml -n {{ a_ns }} args: executable: /bin/bash @@ -94,6 +104,4 @@ delay: 20 retries: 15 - when: "pumb_deploy_result.rc == 0" - - + when: "pumb_deploy_result.rc == 0" \ No newline at end of file diff --git a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml b/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml index e0706cd5cf5..1df5a1690eb 100644 --- a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml +++ b/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml @@ -43,9 +43,9 @@ spec: - name: APP_PVC value: '' - # provide lib image + # provide lib image - name: LIB_IMAGE - value: 'gaiaadm/pumba:0.4.8' + value: 'gaiaadm/pumba:0.4.8' - name: LIVENESS_APP_LABEL value: '' diff --git a/experiments/openebs/openebs-target-network-loss/README.md b/experiments/openebs/openebs-target-network-loss/README.md new file mode 100644 index 00000000000..bccde806998 --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/README.md @@ -0,0 +1,128 @@ +## Experiment Metadata + + + + + + + + + + + + + + + + +
| Type | Description | Storage | Application | K8s Platform |
| ---- | ----------- | ------- | ----------- | ------------ |
| Chaos | Inject n/w packet loss on storage target/controller | OPENEBS | Percona MySQL | Any |
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods should not be in running state +- Storage target pods are healthy + +## Notes + +- Typically used as a disruptive test, to cause loss of access to storage by injecting prolonged network delay +- Tests Recovery workflows for the PV & data integrity post recovery + +## Associated Utils + +- [cstor_target_network_delay.yaml](/experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml) +- [jiva_controller_network_delay.yaml](/experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml) +- [fetch_sc_and_provisioner.yml](/utils/apps/openebs/fetch_sc_and_provisioner.yml) + +## Litmus experiment Environment Variables + +### Application + + + + + + + + + + + + + + + + + +
| Parameter | Description |
| --------- | ----------- |
| APP_NAMESPACE | Namespace in which application pods are deployed |
| APP_LABEL | Unique labels in `key=value` format of application deployment |
| APP_PVC | Name of persistent volume claim used for app's volume mounts |
+ ### Chaos
| Parameter | Description |
| --------- | ----------- |
| NETWORK_PACKET_LOSS_PERCENTAGE | Egress packet loss (in percent) induced on the target pod |
| CHAOS_DURATION | Period (in msec) for which the induced packet loss is maintained |
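These tunables map to env vars in the experiment job added later in this patch (`openebs_target_network_loss_k8s_job.yml`). As a reference sketch, the defaults set there are:

```yml
        env:
          # percentage of egress packets to drop on the target pod
          - name: NETWORK_PACKET_LOSS_PERCENTAGE
            value: '100'    # in percentage
          # how long the packet loss is maintained
          - name: CHAOS_DURATION
            value: '240000' # in milliseconds
```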
+ ### Health Checks
| Parameter | Description |
| --------- | ----------- |
| LIVENESS_APP_NAMESPACE | Namespace in which external liveness pods are deployed, if any |
| LIVENESS_APP_LABEL | Unique labels in `key=value` format for external liveness pod, if any |
| DATA_PERSISTENCE | Data accessibility & integrity verification post recovery (enabled, disabled) |
+ +### Procedure + +This scenario validates the behaviour of application and OpenEBS persistent volumes in the amidst of chaos induced on OpenEBS data plane and control plane components. + +After injecting the chaos into the component specified via environmental variable, litmus experiment observes the behaviour of corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env DATA_PERSISTENCE, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. Along with specifying env in the litmus experiment, user needs to pass name for configmap and the data consistency specific parameters required via configmap in the format as follows: + +```yml + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest +``` + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validation data consistency in busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + +```yml + parameters.yml: | + dbuser: root + dbpassword: k8sDem0 + dbname: tdb +``` + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. diff --git a/experiments/openebs/openebs-target-network-loss/chaosutil.j2 b/experiments/openebs/openebs-target-network-loss/chaosutil.j2 new file mode 100644 index 00000000000..8ca7686917f --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'cstor' %} + chaosutil: /experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml + {% else %} + chaosutil: /experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml + {% endif %} +{% endif %} diff --git a/experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml b/experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml new file mode 100644 index 00000000000..13979d1d2ca --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/cstor_target_network_delay.yaml @@ -0,0 +1,19 @@ +--- +- name: Pick a cStor target pod belonging to the PV + shell: > + kubectl get pods -l {{ cstor_target_pod_label }} + -n {{ openebs_ns }} -o jsonpath='{.items[?(@.metadata.labels.openebs\.io/persistent-volume=="{{ pv_name }}")].metadata.name}' + args: + executable: /bin/bash + register: cstor_target_pod + +# including pumba lib -> network_chaos +- name: Inject egress delay of {{ network_delay }}ms on cstor target for {{ chaos_duration }}ms + include_tasks: /chaoslib/pumba/network_chaos/network_chaos.yml + vars: + n_interface: "eth0" + n_packet_loss: "{{ packet_loss_perc }}" + c_container: "cstor-istgt" + app_pod: "{{ cstor_target_pod.stdout }}" + app_ns: "{{ openebs_ns }}" + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/data_persistence.j2 b/experiments/openebs/openebs-target-network-loss/data_persistence.j2 new file mode 100644 index 00000000000..8b0e7e500cc --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/data_persistence.j2 @@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: 
/utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} diff --git a/experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml b/experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml new file mode 100644 index 00000000000..733a5facf3a --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/jiva_controller_network_delay.yaml @@ -0,0 +1,70 @@ +--- +- name: Identify the jiva controller pod belonging to the PV + shell: > + kubectl get pods -l {{ jiva_controller_pod_label }} + -n {{ a_ns }} -o jsonpath='{.items[?(@.metadata.labels.openebs\.io/persistent-volume=="{{ pv_name }}")].metadata.name}' + args: + executable: /bin/bash + register: jiva_controller_pod + +- name: Record the jiva controller pod and container name + set_fact: + jiva_controller_container_name: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}-{{ jiva_controller_container_suffix }}" + jiva_controller_pod_name: "{{ jiva_controller_pod.stdout }}" + +- name: Get controller svc + shell: > + kubectl get svc -l {{ jiva_controller_svc_label }} + -n {{ a_ns }} -o=jsonpath='{.items[0].spec.clusterIP}' + args: + executable: /bin/bash + register: controller_svc + failed_when: controller_svc.stdout == "" + +- name: Install jq package inside a controller container + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} -c {{ jiva_controller_container_name }} + -- bash -c "apt-get update && apt-get install -y jq && apt-get install -y iproute2" + args: + executable: /bin/bash + +- name: Getting the ReplicaCount before injecting delay + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: rcount_before + +# including pumba lib -> network_chaos +- name: Inject egress delay of {{ n_delay }}ms on jiva controller for {{ c_duration }}ms + include_tasks: /chaoslib/pumba/network_chaos/network_chaos.yml + vars: + n_interface: "eth0" + n_packet_loss: "{{ packet_loss_perc }}" + c_container: "{{ jiva_controller_container_name }}" + app_pod: "{{ jiva_controller_pod_name }}" + app_ns: "{{ a_ns }}" + +- name: Verifying the Replica getting disconnected + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: resp + until: resp.stdout != rcount_before.stdout + retries: 10 + delay: 15 + +- name: Verifying the replicas post network recovery + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: replica + until: replica.stdout == rcount_before.stdout + retries: 10 + delay: 15 + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_ansible_logic.yml b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_ansible_logic.yml new file mode 100644 index 00000000000..3edb78bfe3b --- /dev/null +++ 
b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_ansible_logic.yml @@ -0,0 +1,153 @@ +--- +- hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_duration: "{{ lookup('env','CHAOS_DURATION') }}" + c_experiment: "openebs-target-network-loss" + c_force: "{{ lookup('env','FORCE') }}" + c_interval: "5" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + lib_image: "{{ lookup('env','LIB_IMAGE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + packet_loss_perc: "{{ lookup('env','NETWORK_PACKET_LOSS_PERCENTAGE') }}" + openebs_ns: "{{ lookup('env','OPENEBS_NAMESPACE') }}" + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + ## DERIVE THE APP STORAGE CLASS AND CHAOS UTIL TO USE + + - include: test_prerequisites.yml + + - include_vars: + file: data_persistence.yml + + - include_vars: + file: chaosutil.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "{{ chaosutil }}" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + ## RECORD START-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/create_testname.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + - "StorageClass : {{ sc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 5 + retries: 60 + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + # including chaoslib kill-random-pod + - name: Kill the application pod + include_tasks: /chaoslib/litmus/kill_random_pod.yml + vars: + app_ns: "{{ a_ns }}" + app_pod_name: "{{ app_pod.stdout }}" + + - name: Verify if the application pod is deleted + shell: > + kubectl get pods -n {{ a_ns }} + args: + executable: /bin/bash + register: podstatus + until: '"{{ app_pod.stdout }}" not in podstatus.stdout' + retries: 2 + delay: 150 + + - name: Obtain the newly created pod name for application + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} -o jsonpath='{.items[].metadata.name}' + args: + executable: /bin/bash + register: newpod_name + + - name: Checking application 
pod is not in running state + shell: kubectl get pods -n {{ a_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ newpod_name.stdout }}")].status.containerStatuses[*].state.waiting.reason}' + register: result + until: "((result.stdout.split()|unique)|length) == 1 and 'Running' not in result.stdout" + delay: 2 + retries: 150 + + - set_fact: + flag: "Pass" + + rescue: + - set_fact: + flag: "Fail" + + always: + + ## RECORD END-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml new file mode 100644 index 00000000000..c691bf37765 --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml @@ -0,0 +1,78 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: target-network-loss +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: openebs-target-network-loss- +spec: + template: + metadata: + labels: + name: openebs-target-network-loss + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide openebs namespace + - name: OPENEBS_NAMESPACE + value: 'openebs' + + # provide application namespace + - name: APP_NAMESPACE + value: '' + + # provide application label + - name: APP_LABEL + value: '' + + - name: FORCE + value: 'true' + + # provide application pvc + - name: APP_PVC + value: '' + + # provide lib image + - name: LIB_IMAGE + value: 'gaiaadm/pumba:0.4.8' + + - name: NETWORK_PACKET_LOSS_PERCENTAGE + value: '100' # in percentage + + - name: CHAOS_DURATION + value: '240000' # in milliseconds + + - name: LIVENESS_APP_LABEL + value: '' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: target-network-loss + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/test_prerequisites.yml b/experiments/openebs/openebs-target-network-loss/test_prerequisites.yml new file mode 100644 index 00000000000..589ebafdd41 --- /dev/null +++ b/experiments/openebs/openebs-target-network-loss/test_prerequisites.yml @@ -0,0 +1,39 @@ +--- +- name: Fetch sc and provisioner + include_tasks: /utils/apps/openebs/fetch_sc_and_provisioner.yml + +- block: + - name: Derive PV name from PVC to query storage engine type (openebs) + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.volumeName + args: + executable: /bin/bash + register: pv + + - name: Record pv name + set_fact: + pv_name: "{{ pv.stdout }}" + + - name: Check for presence & value of cas type annotation + shell: > + kubectl get pv {{ pv_name }} --no-headers + -o jsonpath="{.metadata.annotations.openebs\\.io/cas-type}" + args: + executable: /bin/bash + register: openebs_stg_engine + + - name: Record the storage engine name + set_fact: + stg_engine: "{{ 
openebs_stg_engine.stdout }}" + when: stg_prov == "openebs.io/provisioner-iscsi" + +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- name: Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml \ No newline at end of file diff --git a/experiments/openebs/openebs_components.yml b/experiments/openebs/openebs_components.yml index 143aca30ac2..ea42453e20f 100644 --- a/experiments/openebs/openebs_components.yml +++ b/experiments/openebs/openebs_components.yml @@ -1 +1,8 @@ pool_label: "app=cstor-pool" +cstor_target_pod_suffix: "target" +cstor_target_pod_label: "openebs.io/target=cstor-target" +jiva_controller_pod_suffix: "ctrl" +jiva_controller_container_suffix: "con" +jiva_controller_pod_label: "openebs.io/controller=jiva-controller" +jiva_controller_svc_label: "openebs.io/controller-service=jiva-controller-svc" + diff --git a/utils/apps/openebs/fetch_sc_and_provisioner.yml b/utils/apps/openebs/fetch_sc_and_provisioner.yml new file mode 100644 index 00000000000..10338429b0b --- /dev/null +++ b/utils/apps/openebs/fetch_sc_and_provisioner.yml @@ -0,0 +1,24 @@ +--- +- name: Identify the storage class used by the PVC + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.storageClassName + args: + executable: /bin/bash + register: storage_class + +- name: Identify the storage provisioner used by the SC + shell: > + kubectl get sc {{ storage_class.stdout }} + --no-headers -o custom-columns=:provisioner + args: + executable: /bin/bash + register: provisioner + +- name: Record the storage class name + set_fact: + sc: "{{ storage_class.stdout }}" + +- name: Record the storage provisioner name + set_fact: + stg_prov: "{{ provisioner.stdout }}" From 87d27ba23efc0e9b5646ccb9e488eea961f7d56f Mon Sep 17 00:00:00 2001 From: Rahul M Chheda <53308066+rahulchheda@users.noreply.github.com> Date: Thu, 7 Nov 2019 14:10:31 +0530 Subject: [PATCH 04/21] Diskfill FIX (#918) * Added DiskFill Experiment Signed-off-by: Rahul M Chheda --- chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml b/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml index 200f3adab3f..890641e2fd3 100644 --- a/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml +++ b/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml @@ -35,11 +35,11 @@ - name: Fetch the value of Request ephemeral-storage Unit in KB set_fact: - req_storage_KB: "{{ req_value_storage.stdout }}" + req_storage_KB: "{{ lookup('vars', req_value_storage.stdout) }}" - name: Fetch the value of Limit ephemeral-storage Unit in KB set_fact: - limit_storage_KB: "{{ limit_value_storage.stdout }}" + limit_storage_KB: "{{ lookup('vars', limit_value_storage.stdout) }}" - include_tasks: /chaoslib/litmus/disk_fill/convert_fill_percentage.yml From 02bb045d170e2b574fa24f61f2e269d9a0277222 Mon Sep 17 00:00:00 2001 From: Shubham Chaudhary Date: Thu, 7 Nov 2019 14:31:31 +0530 Subject: [PATCH 05/21] (feat): Add openebs-target-network-delay experiment (#910) * (feat): Add openebs-target-network-failure experiment Signed-off-by: shubhamchaudhary --- .../openebs_pool_kill_k8s_job.yml | 5 +- .../openebs-target-network-delay/README.md | 128 ++++++++++++++++ .../openebs-target-network-delay/chaosutil.j2 | 7 + .../cstor_target_network_delay.yaml | 18 +++ .../data_persistence.j2 | 5 + .../jiva_controller_network_delay.yaml | 69 +++++++++ 
...ebs_target_network_delay_ansible_logic.yml | 137 ++++++++++++++++++ .../openebs_target_network_delay_k8s_job.yml | 76 ++++++++++ .../test_prerequisites.yml | 39 +++++ .../openebs_target_network_loss_k8s_job.yml | 3 +- experiments/openebs/openebs_components.yml | 1 - .../apps/openebs/fetch_sc_and_provisioner.yml | 2 +- 12 files changed, 485 insertions(+), 5 deletions(-) create mode 100644 experiments/openebs/openebs-target-network-delay/README.md create mode 100644 experiments/openebs/openebs-target-network-delay/chaosutil.j2 create mode 100644 experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml create mode 100644 experiments/openebs/openebs-target-network-delay/data_persistence.j2 create mode 100644 experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml create mode 100644 experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_ansible_logic.yml create mode 100644 experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml create mode 100644 experiments/openebs/openebs-target-network-delay/test_prerequisites.yml diff --git a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml b/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml index 1df5a1690eb..62a2b6eeb1a 100644 --- a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml +++ b/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml @@ -43,8 +43,9 @@ spec: - name: APP_PVC value: '' - # provide lib image - - name: LIB_IMAGE + # only pumba supported + # For pumba image use : gaiaadm/pumba:0.4.8 + - name: LIB_IMAGE value: 'gaiaadm/pumba:0.4.8' - name: LIVENESS_APP_LABEL diff --git a/experiments/openebs/openebs-target-network-delay/README.md b/experiments/openebs/openebs-target-network-delay/README.md new file mode 100644 index 00000000000..b359ad3de42 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/README.md @@ -0,0 +1,128 @@ +## Experiment Metadata + + + + + + + + + + + + + + +
+| Type  | Description                                                             | Storage | K8s Platform |
+| ----- | ----------------------------------------------------------------------- | ------- | ------------ |
+| Chaos | Inject delay in storage target and verify the application availability  | OPENEBS | Any          |
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods are healthy +- Data written prior to chaos is successfully retrieved/read +- Database consistency is maintained as per db integrity check utils +- Storage target pods are healthy + +## Notes + +- Typically used as a disruptive test, to cause loss of access to storage target by injecting network delay using pumba. +- The application pod should be healthy once it gets recovered. + +## Associated Utils + +- [cstor_target_network_delay.yaml](/experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml) +- [jiva_controller_network_delay.yaml](/experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml) +- [fetch_sc_and_provisioner.yml](/utils/apps/openebs/fetch_sc_and_provisioner.yml) + +## Litmusbook Environment Variables + +### Application + + + + + + + + + + + + + + + + + +
+| Parameter     | Description                                                    |
+| ------------- | -------------------------------------------------------------- |
+| APP_NAMESPACE | Namespace in which application pods are deployed               |
+| APP_LABEL     | Unique Labels in `key=value` format of application deployment  |
+| APP_PVC       | Name of persistent volume claim used for app's volume mounts   |
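+
+For reference, these values are supplied as env entries in the litmusbook job; the namespace, label and PVC names shown below are purely illustrative placeholders and must be replaced with those of the application under test:
+
+```yml
+  - name: APP_NAMESPACE
+    value: 'default'              # hypothetical namespace
+  - name: APP_LABEL
+    value: 'app=percona'          # hypothetical key=value label
+  - name: APP_PVC
+    value: 'percona-vol-claim'    # hypothetical PVC name
+```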
+
+### Chaos
+
+| Parameter      | Description                                                   |
+| -------------- | ------------------------------------------------------------- |
+| NETWORK_DELAY  | Egress delay injected on the storage target, in milliseconds  |
+| CHAOS_DURATION | Total time for which the delay is applied, in milliseconds    |
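+
+Both values are interpreted in milliseconds. As a point of reference, the experiment job included later in this patch injects a 60s delay for a 60s duration:
+
+```yml
+  - name: NETWORK_DELAY
+    value: '60000' # in milliseconds
+  - name: CHAOS_DURATION
+    value: '60000' # in milliseconds
+```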
+
+### Health Checks
+
+| Parameter              | Description                                                                                                          |
+| ---------------------- | -------------------------------------------------------------------------------------------------------------------- |
+| LIVENESS_APP_NAMESPACE | Namespace in which external liveness pods are deployed, if any                                                       |
+| LIVENESS_APP_LABEL     | Unique Labels in `key=value` format for external liveness pod, if any                                                |
+| DATA_PERSISTENCE       | Application for which data accessibility & integrity is verified post recovery (supported: `busybox`, `mysql`; leave empty to skip) |
+ +### Procedure +​ +This scenario validates the behaviour of application and OpenEBS persistent volumes in the amidst of chaos induced on OpenEBS data plane and control plane components. +​ +After injecting the chaos into the component specified via environmental variable, litmus experiment observes the behaviour of corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env DATA_PERSISTENCE, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. Along with specifying env in the litmus experiment, user needs to pass name for configmap and the data consistency specific parameters required via configmap in the format as follows: + +```yml + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest +``` + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validation data consistency in busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + +```yml + parameters.yml: | + dbuser: root + dbpassword: k8sDem0 + dbname: tdb +``` + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-delay/chaosutil.j2 b/experiments/openebs/openebs-target-network-delay/chaosutil.j2 new file mode 100644 index 00000000000..5637a89b8b6 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'cstor' %} + chaosutil: /experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml + {% else %} + chaosutil: /experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml + {% endif %} +{% endif %} diff --git a/experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml b/experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml new file mode 100644 index 00000000000..098d3285a85 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/cstor_target_network_delay.yaml @@ -0,0 +1,18 @@ +--- +- name: Pick a cStor target pod belonging to the PV + shell: > + kubectl get pods -l {{ cstor_target_pod_label }} + -n {{ openebs_ns }} -o jsonpath='{.items[?(@.metadata.labels.openebs\.io/persistent-volume=="{{ pv_name }}")].metadata.name}' + args: + executable: /bin/bash + register: cstor_target_pod + +# including pumba lib -> network_chaos +- name: Inject egress delay of {{ network_delay }}ms on cstor target for {{ chaos_duration }}ms + include_tasks: /chaoslib/pumba/network_chaos/network_chaos.yml + vars: + n_interface: "eth0" + n_latency: "{{ n_delay }}" + c_container: "cstor-istgt" + app_pod: "{{ cstor_target_pod.stdout }}" + app_ns: "{{ openebs_ns }}" diff --git a/experiments/openebs/openebs-target-network-delay/data_persistence.j2 b/experiments/openebs/openebs-target-network-delay/data_persistence.j2 new file mode 100644 index 00000000000..8b0e7e500cc --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/data_persistence.j2 @@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: 
/utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} diff --git a/experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml b/experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml new file mode 100644 index 00000000000..c203acb0773 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/jiva_controller_network_delay.yaml @@ -0,0 +1,69 @@ +--- +- name: Identify the jiva controller pod belonging to the PV + shell: > + kubectl get pods -l {{ jiva_controller_pod_label }} + -n {{ a_ns }} -o jsonpath='{.items[?(@.metadata.labels.openebs\.io/persistent-volume=="{{ pv_name }}")].metadata.name}' + args: + executable: /bin/bash + register: jiva_controller_pod + +- name: Record the jiva controller pod and container name + set_fact: + jiva_controller_container_name: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}-{{ jiva_controller_container_suffix }}" + jiva_controller_pod_name: "{{ jiva_controller_pod.stdout }}" + +- name: Get controller svc + shell: > + kubectl get svc -l {{ jiva_controller_svc_label }} + -n {{ a_ns }} -o=jsonpath='{.items[0].spec.clusterIP}' + args: + executable: /bin/bash + register: controller_svc + failed_when: controller_svc.stdout == "" + +- name: Install jq package inside a controller container + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} -c {{ jiva_controller_container_name }} + -- bash -c "apt-get update && apt-get install -y jq && apt-get install -y iproute2" + args: + executable: /bin/bash + +- name: Getting the ReplicaCount before injecting delay + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: rcount_before + +# including pumba lib -> network_chaos +- name: Inject egress delay of {{ n_delay }}ms on jiva controller for {{ c_duration }}ms + include_tasks: /chaoslib/pumba/network_chaos/network_chaos.yml + vars: + n_interface: "eth0" + n_latency: "{{ n_delay }}" + c_container: "{{ jiva_controller_container_name }}" + app_pod: "{{ jiva_controller_pod_name }}" + app_ns: "{{ a_ns }}" + +- name: Verifying the Replica getting disconnected + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: resp + until: resp.stdout != rcount_before.stdout + retries: 10 + delay: 15 + +- name: Verifying the replicas post network recovery + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_container_name }} curl http://"{{controller_svc.stdout}}":9501/v1/volumes | jq -r '.data[].replicaCount' + args: + executable: /bin/bash + register: replica + until: replica.stdout == rcount_before.stdout + retries: 10 + delay: 15 diff --git a/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_ansible_logic.yml b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_ansible_logic.yml new file mode 100644 index 00000000000..98007b43c6c --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_ansible_logic.yml @@ -0,0 +1,137 @@ +--- +- 
hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_duration: "{{ lookup('env','CHAOS_DURATION') }}" + c_experiment: openebs-target-network-delay + lib_image: "{{ lookup('env','LIB_IMAGE') }}" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + n_delay: "{{ lookup('env','NETWORK_DELAY') }}" + openebs_ns: "{{ lookup('env','OPENEBS_NAMESPACE') }}" + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + # Create test name append with run_id + - include_tasks: /utils/runtime/create_testname.yml + + - include: test_prerequisites.yml + + - include_vars: + file: data_persistence.yml + + - include_vars: + file: chaosutil.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "{{ chaosutil }}" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + - "StorageClass : {{ sc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 5 + retries: 60 + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod_name + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify AUT liveness post fault-injection + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 5 + retries: 60 + + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - name: Verify application data persistence + include: "{{ data_consistency_util_path }}" + vars: + status: 'VERIFY' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + - set_fact: + flag: "Pass" + + rescue: + - set_fact: + flag: "Fail" + + always: + + ## RECORD END-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" diff --git a/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml 
b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml new file mode 100644 index 00000000000..8fb18b05266 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml @@ -0,0 +1,76 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: target-network-delay +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: openebs-target-network-delay- +spec: + template: + metadata: + labels: + name: openebs-target-network-delay + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide openebs namespace + - name: OPENEBS_NAMESPACE + value: 'openebs' + + # provide application namespace + - name: APP_NAMESPACE + value: '' + + # provide application label + - name: APP_LABEL + value: '' + + # provide application pvc + - name: APP_PVC + value: '' + + # only pumba supported + # For pumba image use : gaiaadm/pumba:0.4.8 + - name: LIB_IMAGE + value: 'gaiaadm/pumba:0.4.8' + + - name: NETWORK_DELAY + value: '60000' # in milliseconds + + - name: CHAOS_DURATION + value: '60000' # in milliseconds + + - name: LIVENESS_APP_LABEL + value: '' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: target-network-delay + \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-delay/test_prerequisites.yml b/experiments/openebs/openebs-target-network-delay/test_prerequisites.yml new file mode 100644 index 00000000000..589ebafdd41 --- /dev/null +++ b/experiments/openebs/openebs-target-network-delay/test_prerequisites.yml @@ -0,0 +1,39 @@ +--- +- name: Fetch sc and provisioner + include_tasks: /utils/apps/openebs/fetch_sc_and_provisioner.yml + +- block: + - name: Derive PV name from PVC to query storage engine type (openebs) + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.volumeName + args: + executable: /bin/bash + register: pv + + - name: Record pv name + set_fact: + pv_name: "{{ pv.stdout }}" + + - name: Check for presence & value of cas type annotation + shell: > + kubectl get pv {{ pv_name }} --no-headers + -o jsonpath="{.metadata.annotations.openebs\\.io/cas-type}" + args: + executable: /bin/bash + register: openebs_stg_engine + + - name: Record the storage engine name + set_fact: + stg_engine: "{{ openebs_stg_engine.stdout }}" + when: stg_prov == "openebs.io/provisioner-iscsi" + +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- name: Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml index c691bf37765..72509e6e25c 100644 --- a/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml +++ 
b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml @@ -46,7 +46,8 @@ spec: - name: APP_PVC value: '' - # provide lib image + # only pumba supported + # For pumba image use : gaiaadm/pumba:0.4.8 - name: LIB_IMAGE value: 'gaiaadm/pumba:0.4.8' diff --git a/experiments/openebs/openebs_components.yml b/experiments/openebs/openebs_components.yml index ea42453e20f..68b67a189ce 100644 --- a/experiments/openebs/openebs_components.yml +++ b/experiments/openebs/openebs_components.yml @@ -5,4 +5,3 @@ jiva_controller_pod_suffix: "ctrl" jiva_controller_container_suffix: "con" jiva_controller_pod_label: "openebs.io/controller=jiva-controller" jiva_controller_svc_label: "openebs.io/controller-service=jiva-controller-svc" - diff --git a/utils/apps/openebs/fetch_sc_and_provisioner.yml b/utils/apps/openebs/fetch_sc_and_provisioner.yml index 10338429b0b..111d8d82186 100644 --- a/utils/apps/openebs/fetch_sc_and_provisioner.yml +++ b/utils/apps/openebs/fetch_sc_and_provisioner.yml @@ -21,4 +21,4 @@ - name: Record the storage provisioner name set_fact: - stg_prov: "{{ provisioner.stdout }}" + stg_prov: "{{ provisioner.stdout }}" From 1ffa82add0c8d1f127e7880b4a2d562fde925617 Mon Sep 17 00:00:00 2001 From: Shubham Chaudhary Date: Thu, 7 Nov 2019 16:05:49 +0530 Subject: [PATCH 06/21] (feat): Add openebs target failure experiment (#908) * (feat): Add openebs-pool-pod-failure experiment Signed-off-by: shubhamchaudhary --- .../containerd_chaos/containerd.j2 | 28 +++ .../containerd_chaos/crictl-chaos.yml | 157 +++++++++++++++++ ..._pool_container_failure_ansible_logic.yml} | 0 ...penebs_pool_container_failure_k8s_job.yml} | 2 +- .../openebs/openebs-target-failure/README.md | 113 ++++++++++++ .../openebs-target-failure/chaosutil.j2 | 18 ++ .../cstor_target_container_kill.yml | 67 +++++++ .../cstor_target_failure.yaml | 39 +++++ .../data_persistence.j2 | 5 + .../jiva_controller_container_kill.yml | 71 ++++++++ .../jiva_controller_pod_failure.yaml | 86 +++++++++ .../openebs_target_failure_ansible_logic.yml | 164 ++++++++++++++++++ .../openebs_target_failure_k8s_job.yml | 98 +++++++++++ .../test_prerequisites.yml | 36 ++++ utils/apps/openebs/fetch_cstor_target_pod.yml | 18 ++ .../openebs/fetch_jiva_controller_pod.yml | 18 ++ utils/apps/openebs/target_affinity_check.yml | 61 +++++++ 17 files changed, 980 insertions(+), 1 deletion(-) create mode 100644 chaoslib/litmus/container_kill/containerd_chaos/containerd.j2 create mode 100644 chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml rename experiments/openebs/openebs-pool-container-failure/{openebs_pool_kill_ansible_logic.yml => openebs_pool_container_failure_ansible_logic.yml} (100%) rename experiments/openebs/openebs-pool-container-failure/{openebs_pool_kill_k8s_job.yml => openebs_pool_container_failure_k8s_job.yml} (97%) create mode 100644 experiments/openebs/openebs-target-failure/README.md create mode 100644 experiments/openebs/openebs-target-failure/chaosutil.j2 create mode 100644 experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml create mode 100644 experiments/openebs/openebs-target-failure/cstor_target_failure.yaml create mode 100644 experiments/openebs/openebs-target-failure/data_persistence.j2 create mode 100644 experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml create mode 100644 experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml create mode 100644 experiments/openebs/openebs-target-failure/openebs_target_failure_ansible_logic.yml create 
mode 100644 experiments/openebs/openebs-target-failure/openebs_target_failure_k8s_job.yml create mode 100644 experiments/openebs/openebs-target-failure/test_prerequisites.yml create mode 100644 utils/apps/openebs/fetch_cstor_target_pod.yml create mode 100644 utils/apps/openebs/fetch_jiva_controller_pod.yml create mode 100644 utils/apps/openebs/target_affinity_check.yml diff --git a/chaoslib/litmus/container_kill/containerd_chaos/containerd.j2 b/chaoslib/litmus/container_kill/containerd_chaos/containerd.j2 new file mode 100644 index 00000000000..66dea5eb74a --- /dev/null +++ b/chaoslib/litmus/container_kill/containerd_chaos/containerd.j2 @@ -0,0 +1,28 @@ +apiVersion: extensions/apps/v1 +kind: DaemonSet +metadata: + name: containerd-chaos +spec: + template: + metadata: + labels: + app: crictl + name: containerd-chaos + spec: + containers: + - image: {{ containerd_image }} + imagePullPolicy: Always + name: containerd-chaos + command: ['sh', '-c', 'echo Hello! && sleep 1800'] + volumeMounts: + - name: cri-socket + mountPath: /run/containerd/containerd.sock + - name: cri-config + mountPath: /etc/crictl.yaml + volumes: + - hostPath: + path: /run/containerd/containerd.sock + name: cri-socket + - hostPath: + path: /etc/crictl.yaml + name: cri-config diff --git a/chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml b/chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml new file mode 100644 index 00000000000..47ff963dbe7 --- /dev/null +++ b/chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml @@ -0,0 +1,157 @@ +--- +- name: Patch the chaoslib image + template: + src: /chaoslib/litmus/container_kill/containerd_chaos/containerd.j2 + dest: /chaoslib/litmus/container_kill/containerd_chaos/containerd-chaos-ds.yml + vars: + containerd_image: "{{ lib_image }}" + +- block: + + - name: Setup containerd chaos infrastructure. + shell: > + kubectl apply -f /chaoslib/litmus/container_kill/containerd_chaos/containerd-chaos-ds.yml + -n {{ namespace }} + args: + executable: /bin/bash + register: result + + - name: Confirm that the containerd-chaos ds is running on all nodes. 
+ shell: > + kubectl get pod -l app=crictl + --no-headers -o custom-columns=:status.phase + -n {{ namespace }} | sort | uniq + args: + executable: /bin/bash + register: result + until: "result.stdout == 'Running'" + delay: 3 + retries: 60 + ignore_errors: true + + - block: + - name: Select the app pod + shell: > + kubectl get pod -l {{ label }} -n {{ namespace }} + -o=custom-columns=NAME:".metadata.name" --no-headers + | shuf | head -1 + args: + executable: /bin/bash + register: pod_name + + - name: Record application pod name + set_fact: + app_pod: "{{ pod_name.stdout }}" + when: app_pod is undefined + + - name: Identify the node where application is running + shell: > + kubectl get pod {{ app_pod }} -n {{ namespace }} + --no-headers -o custom-columns=:spec.nodeName + args: + executable: /bin/bash + register: result + failed_when: result is failed + + - name: Record the application node name + set_fact: + app_node: "{{ result.stdout }}" + + - name: Record the containerd-chaos pod on app node + shell: > + kubectl get pod -l app=crictl -o wide + -n {{ namespace }} | grep {{ app_node }} + | awk '{print $1}' + args: + executable: /bin/bash + register: chaos_pod + failed_when: chaos_pod is failed + + - block: + + - name: Record the application container + shell: > + kubectl get pods -l {{ label }} -n {{ namespace }} -o jsonpath='{.items[0].spec.containers[0].name}' + args: + executable: /bin/bash + register: container + + - name: Record the app_container + set_fact: + app_container: "{{ container.stdout }}" + + when: app_container is undefined + + - name: Obtain the pod ID through Pod name + shell: > + kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} -- + crictl pods | grep "{{ app_pod }}" | awk '{print $1}' + args: + executable: /bin/bash + register: pod_id + failed_when: pod_id is failed + + - name: Obtain the container ID using pod name and container name + shell: > + kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} -- + crictl ps | grep {{ pod_id.stdout }} | grep {{ app_container }} | awk '{print $1}' + args: + executable: /bin/bash + register: container_id + failed_when: container_id is failed + + - name: Kill the container + shell: > + kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} -- + crictl stop "{{ container_id.stdout }}" + args: + executable: /bin/bash + register: result + failed_when: result is failed + + - name: Obtain the container ID using pod name and container name + shell: > + kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} -- + crictl ps | grep {{ pod_id.stdout }} | grep {{ app_container }} | awk '{print $1}' + args: + executable: /bin/bash + register: new_container_id + until: "new_container_id.stdout != ''" + delay: 5 + retries: 20 + + - name: Check if the new container is running. 
+ shell: > + kubectl exec {{ chaos_pod.stdout}} -n {{ namespace }} -- + crictl ps | grep {{ new_container_id.stdout }} + args: + executable: /bin/bash + register: status + until: "'Running' in status.stdout" + delay: 3 + retries: 30 + + when: action == "killapp" + +- block: + + - name: Delete the crictl-chaos daemonset + shell: > + kubectl delete -f /chaoslib/litmus/container_kill/containerd_chaos/containerd-chaos-ds.yml + -n {{ namespace }} + args: + executable: /bin/bash + register: result + + - name: Confirm that the containerd-chaos pod is deleted successfully + shell: > + kubectl get pod -l app=crictl + --no-headers -n {{ namespace }} + args: + executable: /bin/bash + register: result + until: "result.stdout == ''" + delay: 3 + retries: 50 + + when: action == "delete-containerd" diff --git a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_ansible_logic.yml b/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_ansible_logic.yml similarity index 100% rename from experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_ansible_logic.yml rename to experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_ansible_logic.yml diff --git a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml b/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml similarity index 97% rename from experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml rename to experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml index 62a2b6eeb1a..be296b86e1f 100644 --- a/experiments/openebs/openebs-pool-container-failure/openebs_pool_kill_k8s_job.yml +++ b/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml @@ -72,4 +72,4 @@ spec: volumes: - name: parameters configMap: - name: pool-container-kill + name: pool-container-kill diff --git a/experiments/openebs/openebs-target-failure/README.md b/experiments/openebs/openebs-target-failure/README.md new file mode 100644 index 00000000000..636c7ab1074 --- /dev/null +++ b/experiments/openebs/openebs-target-failure/README.md @@ -0,0 +1,113 @@ +## Experiment Metadata + + + + + + + + + + + + + + +
+| Type  | Description                                                                          | Storage | K8s Platform |
+| ----- | ------------------------------------------------------------------------------------ | ------- | ------------ |
+| Chaos | Kill the cstor target/Jiva controller container and check if it gets created again   | OPENEBS | Any          |
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods are healthy +- Data written prior to chaos is successfully retrieved/read +- Database consistency is maintained as per db integrity check utils +- Storage target pods are healthy + +### Notes + +- Typically used as a disruptive test, to cause loss of access to storage target by killing the containers. +- The container should be created again and it should be healthy. + +## Associated Utils +- [cstor_target_container_kill.yml](/experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml) +- [cstor_target_failure.yaml](/experiments/openebs/openebs-target-failure/cstor_target_failure.yaml) +- [jiva_controller_container_kill.yml](/experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml) +- [jiva_controller_pod_failure.yaml](/experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml) +- [fetch_cstor_target_pod.yml](/utils/apps/openebs/fetch_cstor_target_pod.yml) +- [fetch_jiva_controller_pod.yml](/utils/apps/openebs/fetch_jiva_controller_pod.yml) +- [fetch_sc_and_provisioner.yml](/utils/apps/openebs/fetch_sc_and_provisioner.yml) +- [target_affinity_check.yml](/utils/apps/openebs/target_affinity_check.yml) + +## Litmus experiment Environment Variables + +### Application + + + + + + + + + + + + + + + + + + + + + +
+| Parameter        | Description                                                                                       |
+| ---------------- | -------------------------------------------------------------------------------------------------- |
+| APP_NAMESPACE    | Namespace in which application pods are deployed                                                  |
+| APP_LABEL        | Unique Labels in `key=value` format of application deployment                                     |
+| APP_PVC          | Name of persistent volume claim used for app's volume mounts                                      |
+| DATA_PERSISTENCE | Specify the application name against which data consistency has to be ensured. Example: busybox   |
+
+### Chaos
+
+| Parameter        | Description                                                                                                               |
+| ---------------- | --------------------------------------------------------------------------------------------------------------------------- |
+| CHAOS_TYPE       | The type of chaos to be induced. Supported values: `target-kill`, `target-zrepl-kill`, `target-delete`, `jiva-ctrl-kill`     |
+| TARGET_CONTAINER | The container against which chaos has to be induced. Supported values: `cstor-istgt`, `cstor-volume-mgmt`                    |
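+
+For instance, a litmusbook that kills the cstor target's istgt container would set (per the value combinations documented in the experiment job later in this patch):
+
+```yml
+  - name: CHAOS_TYPE
+    value: 'target-kill'
+  - name: TARGET_CONTAINER
+    value: 'cstor-istgt'
+```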
+ +### Procedure + +This scenario validates the behaviour of application and OpenEBS persistent volumes in the amidst of chaos induced on OpenEBS data plane and control plane components. + +After injecting the chaos into the component specified via environmental variable, litmus experiment observes the behaviour of corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env `DATA_PERSISTENCE`, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. Along with specifying env in the litmus experiment, user needs to pass name for configmap and the data consistency specific parameters required via configmap in the format as follows: + +```yml + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest +``` + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validation data consistency in busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + +```yml + parameters.yml: | + dbuser: root + dbpassword: k8sDemo + dbname: tbd +``` + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. + +Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. diff --git a/experiments/openebs/openebs-target-failure/chaosutil.j2 b/experiments/openebs/openebs-target-failure/chaosutil.j2 new file mode 100644 index 00000000000..959aadb1736 --- /dev/null +++ b/experiments/openebs/openebs-target-failure/chaosutil.j2 @@ -0,0 +1,18 @@ +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'cstor' %} + {% if chaos_type is defined and chaos_type == 'target-kill' or chaos_type == 'target-zrepl-kill' %} + chaosutil: /experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml + {% else %} + chaosutil: /experiments/openebs/openebs-target-failure/cstor_target_failure.yaml + {% endif %} + {% endif %} +{% endif %} +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'jiva' %} + {% if chaos_type is defined and chaos_type == 'jiva-ctrl-kill' %} + chaosutil: /experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml + {% else %} + chaosutil: /experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml + {% endif %} + {% endif %} +{% endif %} diff --git a/experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml b/experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml new file mode 100644 index 00000000000..2bae19b7bc6 --- /dev/null +++ b/experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml @@ -0,0 +1,67 @@ +--- +- name: Pick the cstor target pod + include_tasks: /utils/apps/openebs/fetch_cstor_target_pod.yml + +- name: Get the restartCount of cstor-istgt container + shell: > + kubectl get pod {{ cstor_target_pod.stdout }} -n {{ openebs_ns }} + -o=jsonpath='{.status.containerStatuses[?(@.name==''"{{ target_container }}"'')].restartCount}' + args: + executable: /bin/bash + register: restartCount_before + +# including pumba chaoslib -> pod-failure-by-sigkill +- include_tasks: /chaoslib/pumba/pod_failure_by_sigkill.yaml + vars: + action: "killapp" + namespace: "{{ openebs_ns }}" + app_pod: "{{ 
cstor_target_pod.stdout }}" + app_container: "{{ target_container }}" + when: cri == 'docker' + +- include_tasks: /chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml + vars: + action: "killapp" + namespace: "{{ openebs_ns }}" + app_pod: "{{ cstor_target_pod.stdout }}" + app_container: "{{ target_container }}" + when: cri == 'containerd' + +- name: Check for target pod in running state + shell: > + kubectl get pod {{ cstor_target_pod.stdout }} -n {{ openebs_ns }} | + grep -w "Running" | wc -l + args: + executable: /bin/bash + register: cstor_target_pod_cnt + until: "cstor_target_pod_cnt.stdout == \"1\"" + delay: 30 + retries: 10 + +- name: Get the runningStatus of target pod + shell: > + kubectl get pod {{ cstor_target_pod.stdout }} -n {{ openebs_ns }} + -o=jsonpath='{range .status.containerStatuses[*]}{.state}{"\n"}{end}' | + grep -w running | wc -l + args: + executable: /bin/bash + register: runningStatusCount + until: "runningStatusCount.stdout == \"3\"" + delay: 30 + retries: 10 + +- name: Get the restartCount of cstor-istgt container + shell: > + kubectl get pod {{ cstor_target_pod.stdout }} -n {{ openebs_ns }} + -o=jsonpath='{.status.containerStatuses[?(@.name==''"{{target_container}}"'')].restartCount}' + args: + executable: /bin/bash + register: restartCount_after + +- name: Compare restartCounts + debug: + msg: + - "Verified pool pods were restarted by fault injection" + - "Before: {{ restartCount_before.stdout }}" + - "After: {{ restartCount_after.stdout }}" + failed_when: "{{ restartCount_after.stdout|int }} != {{ restartCount_before.stdout|int + 1 }}" diff --git a/experiments/openebs/openebs-target-failure/cstor_target_failure.yaml b/experiments/openebs/openebs-target-failure/cstor_target_failure.yaml new file mode 100644 index 00000000000..b9a7934fe3a --- /dev/null +++ b/experiments/openebs/openebs-target-failure/cstor_target_failure.yaml @@ -0,0 +1,39 @@ +--- +- name: Pick the cstor target pod + include_tasks: /utils/apps/openebs/fetch_cstor_target_pod.yml + +- name: Record the cstor target deployment of the PV + set_fact: + cstor_target_deploy: "{{ pv.stdout }}-{{ cstor_target_pod_suffix }}" + +- name: Get the resourceVersion of the target deploy before fault injection + shell: > + kubectl get deployment {{ cstor_target_deploy }} + -n {{ openebs_ns }} -o=jsonpath='{.metadata.resourceVersion}' + args: + executable: /bin/bash + register: rv_bef + +# including litmus chaoslib -> kill-random-pod +- name: Kill the cstor target pod + include_tasks: /chaoslib/litmus/kill_random_pod.yml + vars: + app_ns: "{{ openebs_ns }}" + app_pod_name: "{{ cstor_target_pod.stdout }}" + +- name: Wait for 10s post fault injection + wait_for: + timeout: 10 + +- name: Get the resourceVersion of the target deploy after fault injection + shell: > + kubectl get deployment {{ cstor_target_deploy }} + -n {{ openebs_ns }} -o=jsonpath='{.metadata.resourceVersion}' + args: + executable: /bin/bash + register: rv_aft + +- name: Compare resourceVersions of target deployment + debug: + msg: "Verified target pods were restarted by fault injection" + failed_when: "rv_bef.stdout | int == rv_aft.stdout | int" diff --git a/experiments/openebs/openebs-target-failure/data_persistence.j2 b/experiments/openebs/openebs-target-failure/data_persistence.j2 new file mode 100644 index 00000000000..8b0e7e500cc --- /dev/null +++ b/experiments/openebs/openebs-target-failure/data_persistence.j2 @@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: 
/utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} diff --git a/experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml b/experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml new file mode 100644 index 00000000000..b6e11f6aefa --- /dev/null +++ b/experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml @@ -0,0 +1,71 @@ +--- +- name: Pick the jiva controller pod + include_tasks: /utils/apps/openebs/fetch_jiva_controller_pod.yml + +- name: Record jiva controller container name + set_fact: + ctrl_container: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}-{{ jiva_controller_container_suffix }}" + +- name: Get the restartCount of ctrl-con container + shell: > + kubectl get pods {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -o=jsonpath='{.status.containerStatuses[?(@.name==''"{{ctrl_container}}"'')].restartCount}' + args: + executable: /bin/bash + register: restartCount_before + +# including pumba chaoslib - pod-failure-by-sigkill +- include_tasks: /chaoslib/pumba/pod_failure_by_sigkill.yaml + vars: + action: "killapp" + app_pod: "{{ jiva_controller_pod.stdout }}" + namespace: "{{ a_ns }}" + app_container: "{{ ctrl_container }}" + when: cri == 'docker' + +- include_tasks: /chaoslib/litmus/container_kill/containerd_chaos/crictl-chaos.yml + vars: + action: "killapp" + app_pod: "{{ jiva_controller_pod.stdout }}" + namespace: "{{ a_ns }}" + app_container: "{{ ctrl_container }}" + when: cri == 'containerd' + +- name: Check if the controller pod is running + shell: > + kubectl get pod {{ jiva_controller_pod.stdout }} -n {{ a_ns }} --no-headers + -o custom-columns=:.status.phase + args: + executable: /bin/bash + register: result + until: "result.stdout == 'Running'" + delay: 5 + retries: 45 + +- name: Check for controller container status + shell: > + kubectl get pod {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -o=jsonpath='{range .status.containerStatuses[*]}{.state}{"\n"}{end}' | + grep -w running | wc -l + args: + executable: /bin/bash + register: runningStatusCount + until: "runningStatusCount.stdout == \"2\"" + delay: 30 + retries: 10 + +- name: Get the restartCount of ctrl-con container + shell: > + kubectl get pods {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -o=jsonpath='{.status.containerStatuses[?(@.name==''"{{ctrl_container}}"'')].restartCount}' + args: + executable: /bin/bash + register: restartCount_after + +- name: Compare restartCounts + debug: + msg: + - "Verified containers restartcounts after fault injection" + - "Before: {{ restartCount_before.stdout }}" + - "After: {{ restartCount_after.stdout }}" + failed_when: "{{ restartCount_after.stdout|int }} != {{ restartCount_before.stdout|int + 1 }}" diff --git a/experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml b/experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml new file mode 100644 index 00000000000..c47dfb9b976 --- /dev/null +++ b/experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml @@ -0,0 +1,86 @@ +--- +- name: Pick the jiva controller pod + include_tasks: /utils/apps/openebs/fetch_jiva_controller_pod.yml + +- name: Record the jiva controller deployment and container name + set_fact: + jiva_controller_deploy: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}" + jiva_controller_name: "{{ pv.stdout }}-{{ jiva_controller_pod_suffix }}-{{ 
jiva_controller_container_suffix }}" + +- name: Get the resourceVersion of the target deploy before fault injection + shell: > + kubectl get deploy {{ jiva_controller_deploy }} -n {{ a_ns }} + -o=custom-columns=NAME:".metadata.resourceVersion" --no-headers + args: + executable: /bin/bash + register: rv_bef + +- name: Get controller svc + shell: > + kubectl get svc -l {{ jiva_controller_svc_label }} + -n {{ a_ns }} -o=jsonpath='{.items[0].spec.clusterIP}' + args: + executable: /bin/bash + register: controller_svc + failed_when: controller_svc.stdout == "" + +- name: Install jq package inside a controller container + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} -c {{ jiva_controller_name }} + -- bash -c "apt-get update && apt-get install -y jq" + args: + executable: /bin/bash + +- name: Getting the Replicastatus before killing controller + shell: > + kubectl exec -it {{ jiva_controller_pod.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_name }} curl http://"{{controller_svc.stdout}}":9501/v1/replicas | jq -r '.data[].mode' + args: + executable: /bin/bash + register: rstatus_before + +# including litmus chaoslib -> kill-random-pod +- name: Kill the jiva controller pod + include_tasks: /chaoslib/litmus/kill_random_pod.yml + vars: + app_ns: "{{ a_ns }}" + app_pod_name: "{{ jiva_controller_pod.stdout }}" + +- name: Get jiva controller pod belonging to the PV + shell: > + kubectl get pods --no-headers -l {{ jiva_controller_pod_label }} -n {{ a_ns }} + -o jsonpath="{.items[?(@.metadata.labels.openebs\\.io/persistent-volume==\"{{pv.stdout}}\")].metadata.name}" + args: + executable: /bin/bash + register: jctrl_pod_after + +- name: Install jq package inside a controller container + shell: > + kubectl exec -it {{ jctrl_pod_after.stdout }} -n {{ a_ns }} -c {{ jiva_controller_name }} + -- bash -c "apt-get update && apt-get install -y jq" + args: + executable: /bin/bash + +- name: Getting the Replicastatus after killing the controller + shell: > + kubectl exec -it {{ jctrl_pod_after.stdout }} -n {{ a_ns }} + -c {{ jiva_controller_name }} curl http://"{{controller_svc.stdout}}":9501/v1/replicas | jq -r '.data[].mode' + args: + executable: /bin/bash + register: rstatus_after + until: "rstatus_after.stdout_lines == rstatus_before.stdout_lines and 'RW' in rstatus_after.stdout" + retries: 30 + delay: 10 + +- name: Get the resourceVersion of the target deploy after fault injection + shell: > + kubectl get deploy {{ jiva_controller_deploy }} -n {{ a_ns }} + -o=custom-columns=NAME:".metadata.resourceVersion" --no-headers + args: + executable: /bin/bash + register: rv_aft + +- name: Compare resourceVersions of target deployment + debug: + msg: "Verified target pods were restarted by fault injection" + failed_when: "rv_bef.stdout | int == rv_aft.stdout | int" diff --git a/experiments/openebs/openebs-target-failure/openebs_target_failure_ansible_logic.yml b/experiments/openebs/openebs-target-failure/openebs_target_failure_ansible_logic.yml new file mode 100644 index 00000000000..4f79a2003b2 --- /dev/null +++ b/experiments/openebs/openebs-target-failure/openebs_target_failure_ansible_logic.yml @@ -0,0 +1,164 @@ +--- +- hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_experiment: openebs-target-failure + c_force: "{{ lookup('env','FORCE') }}" + c_interval: 5 + chaos_duration: 120 + chaos_type: "{{ lookup('env','CHAOS_TYPE') }}" + cri: "{{ 
lookup('env','CONTAINER_RUNTIME') }}" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + deploy_type: "{{ lookup('env','DEPLOY_TYPE') }}" + lib_image: "{{ lookup('env','LIB_IMAGE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + openebs_ns: "{{ lookup('env','OPENEBS_NAMESPACE') }}" + target_container: "{{ lookup('env','TARGET_CONTAINER') }}" + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - include: test_prerequisites.yml + + - include_vars: + file: data_persistence.yml + + - include_vars: + file: chaosutil.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "{{ chaosutil }}" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + - include_tasks: /utils/runtime/create_testname.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-TEST IN CHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Target Namespace : {{ openebs_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + - "StorageClass : {{ sc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod_name + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: Wait (soak) for I/O on pools + wait_for: + timeout: "{{ chaos_duration }}" + + - name: Verify AUT liveness post fault-injection + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + ## POST-CHAOS APPLICATION LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: rescheduled_app_pod + + - name: Verify application data persistence + include: "{{ data_consistency_util_path }}" + vars: + status: 'VERIFY' + ns: "{{ a_ns }}" + pod_name: "{{ rescheduled_app_pod.stdout }}" + when: data_persistence != '' + + ## Check application-target pod affinity + - include_tasks: /utils/apps/openebs/target_affinity_check.yml + when: deploy_type == 
'deployment' + + ## Check statefulset application-target pod affinity + - include_tasks: /utils/apps/openebs/sts_target_affinity_check.yml + when: deploy_type == 'statefulset' + + - set_fact: + flag: "Pass" + + rescue: + - set_fact: + flag: "Fail" + + always: + + ## RECORD END-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" diff --git a/experiments/openebs/openebs-target-failure/openebs_target_failure_k8s_job.yml b/experiments/openebs/openebs-target-failure/openebs_target_failure_k8s_job.yml new file mode 100644 index 00000000000..7e406af85c5 --- /dev/null +++ b/experiments/openebs/openebs-target-failure/openebs_target_failure_k8s_job.yml @@ -0,0 +1,98 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: target-failure +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: openebs-target-failure- +spec: + template: + metadata: + labels: + name: openebs-target-failure + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide application namespace + - name: APP_NAMESPACE + value: '' + + # provide openebs namespace + - name: OPENEBS_NAMESPACE + value: 'openebs' + + # provide application label + - name: APP_LABEL + value: '' + + # provide application pvc + - name: APP_PVC + value: '' + + # it can be true or false, depending upon scenario - allowed force deletion or not + - name: FORCE + value: 'true' + + - name: LIVENESS_APP_LABEL + value: '' + + # LIB_IMAGE can be - gaiaadm/pumba:0.4.8, gprasath/crictl:ci + # For pumba image use : gaiaadm/pumba:0.4.8 + # For containerd image use : gprasath/crictl:ci + - name: LIB_IMAGE + value: 'gaiaadm/pumba:0.4.8' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + # Specify the container runtime used , to pick the relevant chaos util + - name: CONTAINER_RUNTIME + value: 'docker' + + # CHAOS_TYPE values : target-zrepl-kill , target-kill , target-delete , jiva-ctrl-kill + # For cstor-volume-istgt container kill Use : target-kill + # For Volume-mgmt-kill container Use : target-zrepl-kill + # For cstor-target-failure Use : target-delete + # For Jiva-Controller-container-kill Use : jiva-ctrl-kill + + - name: CHAOS_TYPE + value: 'jiva-ctrl-kill' + + # TARGET_CONTAINER values: cstor-volume-mgmt , cstor-istgt + # For cstor-volume-istgt container kill use : cstor-istgt + # For volume-mgmt-kill container use : cstor-volume-mgmt + + - name: TARGET_CONTAINER + value: 'cstor-volume-mgmt' + + # DEPLOY_TYPE values: deployment, statefulset + - name: DEPLOY_TYPE + value: 'deployment' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-failure/openebs_target_failure_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: target-failure diff --git a/experiments/openebs/openebs-target-failure/test_prerequisites.yml b/experiments/openebs/openebs-target-failure/test_prerequisites.yml new file mode 100644 index 00000000000..16dbe18619b --- /dev/null +++ b/experiments/openebs/openebs-target-failure/test_prerequisites.yml @@ -0,0 +1,36 @@ +--- +- name: Fetch sc and provisioner + include_tasks: /utils/apps/openebs/fetch_sc_and_provisioner.yml + +- block: + - 
name: Derive PV name from PVC to query storage engine type (openebs) + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.volumeName + args: + executable: /bin/bash + register: pv + + - name: Check for presence & value of cas type annotation + shell: > + kubectl get pv {{ pv.stdout }} --no-headers + -o jsonpath="{.metadata.annotations.openebs\\.io/cas-type}" + args: + executable: /bin/bash + register: openebs_stg_engine + + - name: Record the storage engine name + set_fact: + stg_engine: "{{ openebs_stg_engine.stdout }}" + when: stg_prov == "openebs.io/provisioner-iscsi" + +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- name: Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml + diff --git a/utils/apps/openebs/fetch_cstor_target_pod.yml b/utils/apps/openebs/fetch_cstor_target_pod.yml new file mode 100644 index 00000000000..537c068a8bf --- /dev/null +++ b/utils/apps/openebs/fetch_cstor_target_pod.yml @@ -0,0 +1,18 @@ +--- +- name: Derive PV from application PVC + shell: > + kubectl get pvc {{ a_pvc }} + -o custom-columns=:spec.volumeName -n {{ a_ns }} + --no-headers + args: + executable: /bin/bash + register: pv + +- name: Pick a cStor target pod belonging to the PV + shell: > + kubectl get pods -l {{ cstor_target_pod_label }} + -n {{ openebs_ns }} --no-headers | grep {{ pv.stdout }} + | shuf -n1 | awk '{print $1}' + args: + executable: /bin/bash + register: cstor_target_pod diff --git a/utils/apps/openebs/fetch_jiva_controller_pod.yml b/utils/apps/openebs/fetch_jiva_controller_pod.yml new file mode 100644 index 00000000000..b97bdffd279 --- /dev/null +++ b/utils/apps/openebs/fetch_jiva_controller_pod.yml @@ -0,0 +1,18 @@ +--- +- name: Derive PV from application PVC + shell: > + kubectl get pvc {{ a_pvc }} + -o custom-columns=:spec.volumeName -n {{ a_ns }} + --no-headers + args: + executable: /bin/bash + register: pv + +- name: Get jiva controller pod belonging to the PV + shell: > + kubectl get pods --no-headers -l {{ jiva_controller_pod_label }} -n {{ a_ns }} + -o jsonpath="{.items[?(@.metadata.labels.openebs\\.io/persistent-volume==\"{{pv.stdout}}\")].metadata.name}" + args: + executable: /bin/bash + register: jiva_controller_pod + \ No newline at end of file diff --git a/utils/apps/openebs/target_affinity_check.yml b/utils/apps/openebs/target_affinity_check.yml new file mode 100644 index 00000000000..b99d0547aff --- /dev/null +++ b/utils/apps/openebs/target_affinity_check.yml @@ -0,0 +1,61 @@ +- name: Obtain node where app pod resides + k8s_facts: + kind: Pod + label_selectors: + - "{{ a_label }}" + namespace: "{{ a_ns }}" + register: app_node + failed_when: app_node.resources | length < 1 + +- debug: + msg: "{{ app_node | json_query('resources[*].spec.nodeName')}}" + + +- name: Derive PV from application PVC + k8s_facts: + kind: PersistentVolumeClaim + name: "{{ a_pvc }}" + namespace: "{{ a_ns }}" + register: pv + failed_when: pv.resources | length < 1 + +- debug: + msg: "{{ pv | json_query('resources[*].spec.volumeName')}}" + +- name: Derive storage engine from PV + k8s_facts: + kind: PersistentVolume + name: "{{ pv | json_query('resources[0].spec.volumeName')}}" + register: stg_engine + +- debug: + msg: "{{ item.metadata.annotations['openebs.io/cas-type'] }}" + with_items: "{{ stg_engine.resources }}" + +- set_fact: + target_ns: "{{ a_ns }}" + target_label: "openebs.io/controller=jiva-controller" + when: 
stg_engine.resources.0.metadata.annotations['openebs.io/cas-type'] == 'jiva' + + +## TODO: Account for the case where cstor target can reside in app_ns +## For future: Leave a bool var called {{ target_in_app_ns }} as undefined + +- set_fact: + target_ns: "{{ openebs_ns }}" + target_label: "openebs.io/target=cstor-target" + when: stg_engine.resources.0.metadata.annotations['openebs.io/cas-type'] == 'cstor' and target_in_app_ns is undefined + +- name: Obtain the node where PV target pod resides + k8s_facts: + kind: Pod + namespace: "{{ target_ns }}" + label_selectors: + - "{{ target_label }}" + - "openebs.io/persistent-volume={{ pv.resources.0.spec.volumeName }}" + register: target_node + +- name: Verify whether the app & target pod co-exist on same node + debug: + msg: "App and Target affinity is maintained" + failed_when: target_node.resources.0.spec.nodeName != app_node.resources.0.spec.nodeName From fc8a1bd1a64420dd0d9987399c7da422f54e5cab Mon Sep 17 00:00:00 2001 From: Rahul M Chheda <53308066+rahulchheda@users.noreply.github.com> Date: Thu, 7 Nov 2019 18:30:33 +0530 Subject: [PATCH 07/21] Diskfill Fix, to remove the pod, if it's Eviction Status == true (#920) * Added DiskFill Experiment Signed-off-by: Rahul M Chheda --- .../litmus/disk_fill/disk_fill_by_litmus.yml | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml b/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml index 890641e2fd3..ac8121cac19 100644 --- a/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml +++ b/chaoslib/litmus/disk_fill/disk_fill_by_litmus.yml @@ -105,11 +105,6 @@ - include_tasks: /chaoslib/litmus/disk_fill/file_creation.yml -- include_tasks: /utils/common/status_app_pod.yml - vars: - delay: 1 - retries: 60 - - name: Waiting for Chaos Duration wait_for: timeout: "{{ c_duration }}" @@ -121,10 +116,22 @@ executable: /bin/bash register: pod_status +- name: Check if the pod is Evicted + shell: > + kubectl get pod {{ pod_name.stdout }} -n {{ a_ns }} -o jsonpath='{.status.reason}' + args: + executable: /bin/bash + register: eviction_status + +- name: Delete Pod if evicted + shell: > + kubectl delete pod {{ pod_name.stdout }} -n {{ a_ns }} + when: eviction_status.stdout == "Evicted" + - name: If Pod is not evicted / running shell: > kubectl exec -it {{ disk_fill_pod.stdout }} -n {{ a_ns }} -- sh -c "rm -rf /diskfill/{{ containerID.stdout }}/diskfill" - when: pod_status.stdout != "Evicted" + when: pod_status.stdout == "Running" - name: Delete DaemonSet disk-fill shell: > From ecd9ea595aeabf204317bccc9c9eb97239b41e46 Mon Sep 17 00:00:00 2001 From: Shubham Chaudhary Date: Fri, 8 Nov 2019 11:58:23 +0530 Subject: [PATCH 08/21] (fix) modify the experiments to remove duplicate envs (#921) Signed-off-by: shubhamchaudhary --- chaoslib/litmus/kill_random_pod.yml | 15 +++++++++++---- .../container_kill/container_kill_k8s_job.yml | 12 ------------ .../pod_network_latency_ansible_logic.yml | 2 ++ .../pod_network_latency_k8s_job.yml | 8 -------- .../pod_network_loss_ansible_logic.yml | 2 ++ .../pod_network_loss/pod_network_loss_k8s_job.yml | 12 ------------ 6 files changed, 15 insertions(+), 36 deletions(-) diff --git a/chaoslib/litmus/kill_random_pod.yml b/chaoslib/litmus/kill_random_pod.yml index 4c17a423da9..fae73e4b3b2 100644 --- a/chaoslib/litmus/kill_random_pod.yml +++ b/chaoslib/litmus/kill_random_pod.yml @@ -9,16 +9,23 @@ - name: Select a random pod to kill set_fact: - app_pod_name: "{{ pod_list.resources | random | json_query('metadata.name') 
}}" + app_pod: "{{ pod_list.resources | random | json_query('metadata.name') }}" when: app_pod_name is undefined +- block: + - name: Record app pod + set_fact: + app_pod: "{{ app_pod_name }}" + + when: app_pod_name is defined + - debug: - msg: "Killing pod {{ app_pod_name }}" + msg: "Killing pod {{ app_pod }}" - name: Force Kill application pod shell: | - kubectl delete pod -n {{ app_ns }} --force --grace-period=0 --wait=false {{ app_pod_name }} + kubectl delete pod -n {{ app_ns }} --force --grace-period=0 --wait=false {{ app_pod }} args: executable: /bin/bash register: result @@ -26,7 +33,7 @@ - name: Kill application pod shell: | - kubectl delete pod -n {{ app_ns }} --grace-period=0 --wait=false {{ app_pod_name }} + kubectl delete pod -n {{ app_ns }} --grace-period=0 --wait=false {{ app_pod }} args: executable: /bin/bash register: result diff --git a/experiments/generic/container_kill/container_kill_k8s_job.yml b/experiments/generic/container_kill/container_kill_k8s_job.yml index e47ee6fb2d6..3d22aeede01 100644 --- a/experiments/generic/container_kill/container_kill_k8s_job.yml +++ b/experiments/generic/container_kill/container_kill_k8s_job.yml @@ -35,18 +35,6 @@ spec: - name: TARGET_CONTAINER value: '' - # provide application namespace - - name: APP_NAMESPACE - value: '' - - # provide application labels - - name: APP_LABEL - value: '' - - # provide target container - - name: TARGET_CONTAINER - value: '' - # provide chaosengine name - name: CHAOSENGINE value: '' diff --git a/experiments/generic/pod_network_latency/pod_network_latency_ansible_logic.yml b/experiments/generic/pod_network_latency/pod_network_latency_ansible_logic.yml index 07736e4b8ba..dd92f88ac55 100644 --- a/experiments/generic/pod_network_latency/pod_network_latency_ansible_logic.yml +++ b/experiments/generic/pod_network_latency/pod_network_latency_ansible_logic.yml @@ -45,6 +45,8 @@ ## FAULT INJECTION - include_tasks: "{{ c_util }}" + vars: + app_ns: "{{ a_ns }}" ## POST-CHAOS APPLICATION LIVENESS CHECK - name: Verify AUT liveness post fault-injection diff --git a/experiments/generic/pod_network_latency/pod_network_latency_k8s_job.yml b/experiments/generic/pod_network_latency/pod_network_latency_k8s_job.yml index 7bb009965d1..3ae8866bca9 100644 --- a/experiments/generic/pod_network_latency/pod_network_latency_k8s_job.yml +++ b/experiments/generic/pod_network_latency/pod_network_latency_k8s_job.yml @@ -33,14 +33,6 @@ spec: - name: APP_KIND value: '' - # provide application labels - - name: APP_LABEL - value: '' - - # provide application kind - - name: APP_KIND - value: '' - - name: TARGET_CONTAINER value: '' diff --git a/experiments/generic/pod_network_loss/pod_network_loss_ansible_logic.yml b/experiments/generic/pod_network_loss/pod_network_loss_ansible_logic.yml index 8b7e1934bb9..85d3947a7b1 100644 --- a/experiments/generic/pod_network_loss/pod_network_loss_ansible_logic.yml +++ b/experiments/generic/pod_network_loss/pod_network_loss_ansible_logic.yml @@ -45,6 +45,8 @@ ## FAULT INJECTION - include_tasks: "{{ c_util }}" + vars: + app_ns: "{{ a_ns }}" ## POST-CHAOS APPLICATION LIVENESS CHECK - name: Verify AUT liveness post fault-injection diff --git a/experiments/generic/pod_network_loss/pod_network_loss_k8s_job.yml b/experiments/generic/pod_network_loss/pod_network_loss_k8s_job.yml index f41903ad9ee..9930760b63d 100644 --- a/experiments/generic/pod_network_loss/pod_network_loss_k8s_job.yml +++ b/experiments/generic/pod_network_loss/pod_network_loss_k8s_job.yml @@ -33,18 +33,6 @@ spec: - name: APP_KIND value: '' - # 
provide application namespace - - name: APP_NAMESPACE - value: '' - - # provide application labels - - name: APP_LABEL - value: '' - - # provide application kind - - name: APP_KIND - value: '' - # provide target conatiner - name: TARGET_CONTAINER value: '' From 509583e366257c084ef023781b291d62a9155a24 Mon Sep 17 00:00:00 2001 From: Raj Babu Das Date: Fri, 8 Nov 2019 15:16:17 +0530 Subject: [PATCH 09/21] (feat): Chaos Experiment: Added disk loss chaos experiment logic and job for AWS (#909) * Added logic, k8s job and gcloud utils for disk loss Signed-off-by: Raj --- build/ansible-runner/Dockerfile | 6 ++--- chaoslib/litmus/platform/aws/disk_loss.yml | 21 ++++++++++++++++++ chaoslib/litmus/platform/gke/disk_loss.yml | 2 +- experiments/generic/disk_loss/README.md | 15 +++++++++---- experiments/generic/disk_loss/chaosutil.j2 | 4 +++- .../disk_loss/disk_loss_ansible_logic.yml | 22 ++++++++++++++++--- .../generic/disk_loss/disk_loss_k8s_job.yml | 4 ++++ .../generic/disk_loss/disk_status_check.j2 | 19 +++++++++++++++- utils/cloud/aws/aws_configure.yml | 14 ++++++++++++ utils/cloud/aws/status_disk.yml | 15 +++++++++++++ 10 files changed, 109 insertions(+), 13 deletions(-) create mode 100644 chaoslib/litmus/platform/aws/disk_loss.yml create mode 100644 utils/cloud/aws/aws_configure.yml create mode 100644 utils/cloud/aws/status_disk.yml diff --git a/build/ansible-runner/Dockerfile b/build/ansible-runner/Dockerfile index 6411eb40be3..830a00394b4 100644 --- a/build/ansible-runner/Dockerfile +++ b/build/ansible-runner/Dockerfile @@ -8,8 +8,9 @@ RUN apt-get clean && \ apt-get install -y --no-install-recommends python-minimal python-pip netcat iproute2 jq sshpass \ curl openssh-client python-setuptools && rm -rf /var/lib/apt/lists/* +RUN pip install --upgrade pip #Installing ansible and dependencies for k8s module -RUN pip install ansible==2.7.3 openshift jmespath +RUN pip install ansible==2.7.3 openshift jmespath boto boto3 RUN touch /mnt/parameters.yml /mnt/cloud_config.yml @@ -21,8 +22,7 @@ RUN gcloud --version #Installing Kubectl ENV KUBE_LATEST_VERSION="v1.12.0" RUN curl -L https://storage.googleapis.com/kubernetes-release/release/${KUBE_LATEST_VERSION}/bin/linux/amd64/kubectl -o /usr/local/bin/kubectl && \ - chmod +x /usr/local/bin/kubectl && \ - curl -o /usr/local/bin/aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.10.3/2018-07-26/bin/linux/amd64/aws-iam-authenticator && \chmod +x /usr/local/bin/aws-iam-authenticator + chmod +x /usr/local/bin/kubectl #Adding hosts entries and making ansible folders RUN mkdir /etc/ansible/ /ansible && \ diff --git a/chaoslib/litmus/platform/aws/disk_loss.yml b/chaoslib/litmus/platform/aws/disk_loss.yml new file mode 100644 index 00000000000..de0b30a50b2 --- /dev/null +++ b/chaoslib/litmus/platform/aws/disk_loss.yml @@ -0,0 +1,21 @@ +- name: Detaching the disk + ec2_vol: + id: "{{ disk_name }}" + instance: None + region: "{{ zone_name }}" + +- name: chaos injection for {{ c_duration }}s + wait_for: + timeout: "{{ c_duration }}" + +- name: Verify that the disk is connected to node (post) + include_tasks: "/utils/cloud/aws/status_disk.yml" + +- block: + - name: If disk is not attached, it will attach manually + ec2_vol: + instance: "{{ node_name }}" + id: "{{ disk_name }}" + device_name: "{{ device_name }}" + region: "{{ zone_name }}" + when: "inuse == false" diff --git a/chaoslib/litmus/platform/gke/disk_loss.yml b/chaoslib/litmus/platform/gke/disk_loss.yml index 3eef6e8a6c4..c97ad31a34b 100644 --- a/chaoslib/litmus/platform/gke/disk_loss.yml 
+++ b/chaoslib/litmus/platform/gke/disk_loss.yml @@ -1,5 +1,5 @@ - name: Detaching the disk - shell: gcloud compute instances detach-disk {{ node_name }} --device-name {{ disk_name }} --zone {{ zone_name }} + shell: gcloud compute instances detach-disk {{ node_name }} --disk {{ disk_name }} --zone {{ zone_name }} - name: chaos injection for {{ c_duration }}s wait_for: diff --git a/experiments/generic/disk_loss/README.md b/experiments/generic/disk_loss/README.md index 9f1ef357d0f..16f85e2961b 100644 --- a/experiments/generic/disk_loss/README.md +++ b/experiments/generic/disk_loss/README.md @@ -76,35 +76,42 @@ Cloud Platform name Mandatory + CLOUD_NAMESPACE This is a chaos namespace which will create all infra chaos resources in that namespace Mandatory - + PROJECT_ID GCP project ID Mandatory - + NODE_NAME Node name of the cluster Mandatory + DISK_NAME Disk Name of the node, it must be an external disk. Mandatory + + DEVICE_NAME + Enter the device name which you wanted to mount only for AWS. + Mandatory - + ZONE_NAME Zone Name of the node Mandatory - + CHAOSENGINE ChaosEngine CR name associated with the experiment instance Mandatory + CHAOS_SERVICE_ACCOUNT Service account used by the litmus Mandatory diff --git a/experiments/generic/disk_loss/chaosutil.j2 b/experiments/generic/disk_loss/chaosutil.j2 index 0fb487d966f..ce1781bf7d3 100644 --- a/experiments/generic/disk_loss/chaosutil.j2 +++ b/experiments/generic/disk_loss/chaosutil.j2 @@ -1,6 +1,8 @@ # All code here is not indented because j2 is space sensitive # checks if cloud_platform is set or not -{% if cloud_platform is defined and cloud_platform == 'GCP' or cloud_platform == 'AWS' %} +{% if cloud_platform is defined and cloud_platform == 'GCP' %} c_util: /chaoslib/litmus/platform/gke/disk_loss.yml +{% elif cloud_platform is defined and cloud_platform == 'AWS' %} +c_util: /chaoslib/litmus/platform/aws/disk_loss.yml {% endif %} diff --git a/experiments/generic/disk_loss/disk_loss_ansible_logic.yml b/experiments/generic/disk_loss/disk_loss_ansible_logic.yml index deec1e164cb..a544e579479 100644 --- a/experiments/generic/disk_loss/disk_loss_ansible_logic.yml +++ b/experiments/generic/disk_loss/disk_loss_ansible_logic.yml @@ -10,6 +10,7 @@ cloud_platform: "{{ lookup('env','CLOUD_PLATFORM') }}" c_ns: "{{ lookup('env','CHAOS_NAMESPACE') }}" disk_name: "{{ lookup('env','DISK_NAME') }}" + device_name: "{{ lookup('env', 'DEVICE_NAME') }}" node_name: "{{ lookup('env','NODE_NAME') }}" project_id: "{{ lookup('env','PROJECT_ID') }}" zone_name: "{{ lookup('env','ZONE_NAME') }}" @@ -47,14 +48,24 @@ - name: Gcloud authentication include_tasks: "/utils/cloud/gcp/gcloud_configure.yml" when: "cloud_platform == 'GCP'" + + # AWS authentication + - name: AWS authentication + include_tasks: "/utils/cloud/aws/aws_configure.yml" + when: "cloud_platform == 'AWS'" ## PRE-CHAOS DISK LIVENESS CHECK - name: Verify that the disk is connected to node (pre) include_tasks: "/utils/cloud/gcp/status_disk.yml" + when: "cloud_platform == 'GCP'" + + - name: Verify that the disk is connected to node (pre) + include_tasks: "/utils/cloud/aws/status_disk.yml" + when: "cloud_platform == 'AWS'" # Checking disk is attached to node - debug: - msg: echo "disk attached" + msg: "disk attached" when: "inuse == true" ## INJECTING CHAOS @@ -72,10 +83,15 @@ ## POST-CHAOS DISK LIVENESS CHECK - name: Verify that the disk is connected to node (post) include_tasks: "/utils/cloud/gcp/status_disk.yml" - + when: "cloud_platform == 'GCP'" + + - name: Verify that the disk is connected to node (post) 
+ include_tasks: "/utils/cloud/aws/status_disk.yml" + when: "cloud_platform == 'AWS'" + # Checking disk is attached to node - debug: - msg: echo "disk attached" + msg: "disk attached" when: "inuse == true" - set_fact: diff --git a/experiments/generic/disk_loss/disk_loss_k8s_job.yml b/experiments/generic/disk_loss/disk_loss_k8s_job.yml index 68d544ba5a2..fa6c6bafef1 100644 --- a/experiments/generic/disk_loss/disk_loss_k8s_job.yml +++ b/experiments/generic/disk_loss/disk_loss_k8s_job.yml @@ -57,6 +57,10 @@ spec: # Enter the disk name - name: DISK_NAME value: '' + + # Enter the device name + - name: DEVICE_NAME + value: '' # Enter the zone name - name: ZONE_NAME diff --git a/experiments/generic/disk_loss/disk_status_check.j2 b/experiments/generic/disk_loss/disk_status_check.j2 index 1f197caa8fd..5aa4c82f69c 100644 --- a/experiments/generic/disk_loss/disk_status_check.j2 +++ b/experiments/generic/disk_loss/disk_status_check.j2 @@ -1,6 +1,8 @@ # All code here is not indented because j2 is space sensitive # Initially, it "inuse" set to false {% set disk = namespace(inuse=false) %} +# For GCP +{% if cloud_platform is defined and cloud_platform == 'GCP' %} {% set expect_user = 'https://www.googleapis.com/compute/v1/projects/' + project_id + '/zones/' + zone_name + '/instances/' + node_name %} # loop through all the disk users and checks if current_user is equal to expect_user {% for current_user in disk_users.stdout_lines %} @@ -9,8 +11,23 @@ {% set disk.inuse = true %} {% endif %} {% endfor %} + +# For AWS +{% elif cloud_platform is defined and cloud_platform == 'AWS' %} +{% set expect_user = node_name %} +# loop through all the disk users and checks if current_user is equal to expect_user +{% for current_user in disk_users.volumes %} +{% if current_user.attachment_set.instance_id == expect_user and current_user.attachment_set.status == "attached" %} +# If the condition is true, then set "inuse" to true +{% set disk.inuse = true %} +{% endif %} +{% endfor %} + +# This will append inuse: true/false {% if disk.inuse == true %} inuse: true {% else %} inuse: false -{% endif %} \ No newline at end of file +{% endif %} +{% endif %} + diff --git a/utils/cloud/aws/aws_configure.yml b/utils/cloud/aws/aws_configure.yml new file mode 100644 index 00000000000..14c155d0a61 --- /dev/null +++ b/utils/cloud/aws/aws_configure.yml @@ -0,0 +1,14 @@ +- name: Creates directory for aws configuration + file: + path: /root/.aws + state: directory + +- name: Creating credential file in aws directory + file: + path: /root/.aws/credentials + state: touch + +- name: Copying aws credentials from cloud_config + copy: + src: /mnt/cloud_config.yml + dest: /root/.aws/credentials \ No newline at end of file diff --git a/utils/cloud/aws/status_disk.yml b/utils/cloud/aws/status_disk.yml new file mode 100644 index 00000000000..84a007558d1 --- /dev/null +++ b/utils/cloud/aws/status_disk.yml @@ -0,0 +1,15 @@ +# ec2_vol_facts is deprecated, once python2 is upgraded to python3 in ansible runner +# we can change ec2_vol_facts to ec2_vol_info. 
+- name: Getting disk users + ec2_vol_facts: + filters: + volume-id: "{{ disk_name }}" + register: disk_users + +- name: Disk status check + template: + src: disk_status_check.j2 + dest: disk_status_check.yml + +- include_vars: + file: disk_status_check.yml \ No newline at end of file From 1f0994cb5763d877b04f395c13b4d72f079576a7 Mon Sep 17 00:00:00 2001 From: Shubham Chaudhary Date: Mon, 11 Nov 2019 18:25:34 +0530 Subject: [PATCH 10/21] (feat): add openebs target pod/container failure experiment (#924) Signed-off-by: shubhamchaudhary --- ...openebs_pool_container_failure_k8s_job.yml | 4 +- .../openebs_pool_pod_failure_k8s_job.yml | 4 +- .../README.md | 8 +- .../chaosutil.j2 | 7 + .../cstor_target_container_kill.yml | 0 .../data_persistence.j2 | 0 .../jiva_controller_container_kill.yml | 0 ...arget_container_failure_ansible_logic.yml} | 5 +- ...nebs_target_container_failure_k8s_job.yml} | 27 ++- .../test_prerequisites.yml | 0 .../openebs-target-failure/chaosutil.j2 | 18 -- .../openebs_target_network_delay_k8s_job.yml | 4 +- .../openebs_target_network_loss_k8s_job.yml | 4 +- .../openebs-target-pod-failure/README.md | 98 +++++++++++ .../openebs-target-pod-failure/chaosutil.j2 | 7 + .../cstor_target_failure.yaml | 0 .../data_persistence.j2 | 5 + .../jiva_controller_pod_failure.yaml | 0 ...enebs_target_pod_failure_ansible_logic.yml | 160 ++++++++++++++++++ .../openebs_target_pod_failure_k8s_job.yml | 78 +++++++++ .../test_prerequisites.yml | 36 ++++ 21 files changed, 415 insertions(+), 50 deletions(-) rename experiments/openebs/{openebs-target-failure => openebs-target-container-failure}/README.md (87%) create mode 100644 experiments/openebs/openebs-target-container-failure/chaosutil.j2 rename experiments/openebs/{openebs-target-failure => openebs-target-container-failure}/cstor_target_container_kill.yml (100%) rename experiments/openebs/{openebs-target-failure => openebs-target-container-failure}/data_persistence.j2 (100%) rename experiments/openebs/{openebs-target-failure => openebs-target-container-failure}/jiva_controller_container_kill.yml (100%) rename experiments/openebs/{openebs-target-failure/openebs_target_failure_ansible_logic.yml => openebs-target-container-failure/openebs_target_container_failure_ansible_logic.yml} (97%) rename experiments/openebs/{openebs-target-failure/openebs_target_failure_k8s_job.yml => openebs-target-container-failure/openebs_target_container_failure_k8s_job.yml} (76%) rename experiments/openebs/{openebs-target-failure => openebs-target-container-failure}/test_prerequisites.yml (100%) delete mode 100644 experiments/openebs/openebs-target-failure/chaosutil.j2 create mode 100644 experiments/openebs/openebs-target-pod-failure/README.md create mode 100644 experiments/openebs/openebs-target-pod-failure/chaosutil.j2 rename experiments/openebs/{openebs-target-failure => openebs-target-pod-failure}/cstor_target_failure.yaml (100%) create mode 100644 experiments/openebs/openebs-target-pod-failure/data_persistence.j2 rename experiments/openebs/{openebs-target-failure => openebs-target-pod-failure}/jiva_controller_pod_failure.yaml (100%) create mode 100644 experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_ansible_logic.yml create mode 100644 experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_k8s_job.yml create mode 100644 experiments/openebs/openebs-target-pod-failure/test_prerequisites.yml diff --git a/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml 
b/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml index be296b86e1f..c34ce43df9b 100644 --- a/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml +++ b/experiments/openebs/openebs-pool-container-failure/openebs_pool_container_failure_k8s_job.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: pool-container-kill + name: openebs-pool-container-failure data: parameters.yml: | @@ -72,4 +72,4 @@ spec: volumes: - name: parameters configMap: - name: pool-container-kill + name: openebs-pool-container-failure diff --git a/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml index 60edeb84b61..ceea91fb857 100644 --- a/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml +++ b/experiments/openebs/openebs-pool-pod-failure/openebs_pool_pod_failure_k8s_job.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: pool-pod-delete + name: openebs-pool-pod-failure data: parameters.yml: | @@ -68,4 +68,4 @@ spec: volumes: - name: parameters configMap: - name: pool-pod-delete + name: openebs-pool-pod-failure diff --git a/experiments/openebs/openebs-target-failure/README.md b/experiments/openebs/openebs-target-container-failure/README.md similarity index 87% rename from experiments/openebs/openebs-target-failure/README.md rename to experiments/openebs/openebs-target-container-failure/README.md index 636c7ab1074..7ea0b556461 100644 --- a/experiments/openebs/openebs-target-failure/README.md +++ b/experiments/openebs/openebs-target-container-failure/README.md @@ -33,12 +33,8 @@ - The container should be created again and it should be healthy. 
## Associated Utils -- [cstor_target_container_kill.yml](/experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml) -- [cstor_target_failure.yaml](/experiments/openebs/openebs-target-failure/cstor_target_failure.yaml) -- [jiva_controller_container_kill.yml](/experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml) -- [jiva_controller_pod_failure.yaml](/experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml) -- [fetch_cstor_target_pod.yml](/utils/apps/openebs/fetch_cstor_target_pod.yml) -- [fetch_jiva_controller_pod.yml](/utils/apps/openebs/fetch_jiva_controller_pod.yml) +- [cstor_target_container_kill.yml](/experiments/openebs/openebs-target-container-failure/cstor_target_container_kill.yml) +- [jiva_controller_container_kill.yml](/experiments/openebs/openebs-target-container-failure/jiva_controller_container_kill.yml) - [fetch_sc_and_provisioner.yml](/utils/apps/openebs/fetch_sc_and_provisioner.yml) - [target_affinity_check.yml](/utils/apps/openebs/target_affinity_check.yml) diff --git a/experiments/openebs/openebs-target-container-failure/chaosutil.j2 b/experiments/openebs/openebs-target-container-failure/chaosutil.j2 new file mode 100644 index 00000000000..dd0c274cc55 --- /dev/null +++ b/experiments/openebs/openebs-target-container-failure/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'cstor' %} + chaosutil: /experiments/openebs/openebs-target-container-failure/cstor_target_container_kill.yml + {% else %} + chaosutil: /experiments/openebs/openebs-target-container-failure/jiva_controller_container_kill.yml + {% endif %} +{% endif %} diff --git a/experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml b/experiments/openebs/openebs-target-container-failure/cstor_target_container_kill.yml similarity index 100% rename from experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml rename to experiments/openebs/openebs-target-container-failure/cstor_target_container_kill.yml diff --git a/experiments/openebs/openebs-target-failure/data_persistence.j2 b/experiments/openebs/openebs-target-container-failure/data_persistence.j2 similarity index 100% rename from experiments/openebs/openebs-target-failure/data_persistence.j2 rename to experiments/openebs/openebs-target-container-failure/data_persistence.j2 diff --git a/experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml b/experiments/openebs/openebs-target-container-failure/jiva_controller_container_kill.yml similarity index 100% rename from experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml rename to experiments/openebs/openebs-target-container-failure/jiva_controller_container_kill.yml diff --git a/experiments/openebs/openebs-target-failure/openebs_target_failure_ansible_logic.yml b/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_ansible_logic.yml similarity index 97% rename from experiments/openebs/openebs-target-failure/openebs_target_failure_ansible_logic.yml rename to experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_ansible_logic.yml index 4f79a2003b2..867e4dc02b4 100644 --- a/experiments/openebs/openebs-target-failure/openebs_target_failure_ansible_logic.yml +++ b/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_ansible_logic.yml @@ -8,9 +8,8 @@ a_pvc: "{{ lookup('env','APP_PVC') 
}}" c_experiment: openebs-target-failure c_force: "{{ lookup('env','FORCE') }}" - c_interval: 5 - chaos_duration: 120 - chaos_type: "{{ lookup('env','CHAOS_TYPE') }}" + c_interval: "{{ lookup('env','CHAOS_INTERVAL') }}" + chaos_duration: "{{ lookup('env','CHAOS_DURATION') }}" cri: "{{ lookup('env','CONTAINER_RUNTIME') }}" data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" deploy_type: "{{ lookup('env','DEPLOY_TYPE') }}" diff --git a/experiments/openebs/openebs-target-failure/openebs_target_failure_k8s_job.yml b/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_k8s_job.yml similarity index 76% rename from experiments/openebs/openebs-target-failure/openebs_target_failure_k8s_job.yml rename to experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_k8s_job.yml index 7e406af85c5..978b5257fb9 100644 --- a/experiments/openebs/openebs-target-failure/openebs_target_failure_k8s_job.yml +++ b/experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_k8s_job.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: target-failure + name: openebs-target-container-failure data: parameters.yml: | @@ -10,12 +10,12 @@ data: apiVersion: batch/v1 kind: Job metadata: - generateName: openebs-target-failure- + generateName: openebs-target-container-failure- spec: template: metadata: labels: - name: openebs-target-failure + name: openebs-target-container-failure spec: serviceAccountName: %CHAOS_SERVICE_ACCOUNT% restartPolicy: Never @@ -61,20 +61,17 @@ spec: - name: DATA_PERSISTENCE value: '' + + - name: CHAOS_INTERVAL + value: '5' + + - name: CHAOS_DURATION + value: '120' # Specify the container runtime used , to pick the relevant chaos util - name: CONTAINER_RUNTIME value: 'docker' - - # CHAOS_TYPE values : target-zrepl-kill , target-kill , target-delete , jiva-ctrl-kill - # For cstor-volume-istgt container kill Use : target-kill - # For Volume-mgmt-kill container Use : target-zrepl-kill - # For cstor-target-failure Use : target-delete - # For Jiva-Controller-container-kill Use : jiva-ctrl-kill - - - name: CHAOS_TYPE - value: 'jiva-ctrl-kill' - + # TARGET_CONTAINER values: cstor-volume-mgmt , cstor-istgt # For cstor-volume-istgt container kill use : cstor-istgt # For volume-mgmt-kill container use : cstor-volume-mgmt @@ -87,7 +84,7 @@ spec: value: 'deployment' command: ["/bin/bash"] - args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-failure/openebs_target_failure_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-container-failure/openebs_target_container_failure_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] volumeMounts: - name: parameters @@ -95,4 +92,4 @@ spec: volumes: - name: parameters configMap: - name: target-failure + name: openebs-target-container-failure diff --git a/experiments/openebs/openebs-target-failure/test_prerequisites.yml b/experiments/openebs/openebs-target-container-failure/test_prerequisites.yml similarity index 100% rename from experiments/openebs/openebs-target-failure/test_prerequisites.yml rename to experiments/openebs/openebs-target-container-failure/test_prerequisites.yml diff --git a/experiments/openebs/openebs-target-failure/chaosutil.j2 b/experiments/openebs/openebs-target-failure/chaosutil.j2 deleted file mode 100644 index 959aadb1736..00000000000 --- a/experiments/openebs/openebs-target-failure/chaosutil.j2 +++ /dev/null @@ -1,18 +0,0 @@ -{% if stg_prov is defined 
and stg_prov == 'openebs.io/provisioner-iscsi' %} - {% if stg_engine is defined and stg_engine == 'cstor' %} - {% if chaos_type is defined and chaos_type == 'target-kill' or chaos_type == 'target-zrepl-kill' %} - chaosutil: /experiments/openebs/openebs-target-failure/cstor_target_container_kill.yml - {% else %} - chaosutil: /experiments/openebs/openebs-target-failure/cstor_target_failure.yaml - {% endif %} - {% endif %} -{% endif %} -{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} - {% if stg_engine is defined and stg_engine == 'jiva' %} - {% if chaos_type is defined and chaos_type == 'jiva-ctrl-kill' %} - chaosutil: /experiments/openebs/openebs-target-failure/jiva_controller_container_kill.yml - {% else %} - chaosutil: /experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml - {% endif %} - {% endif %} -{% endif %} diff --git a/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml index 8fb18b05266..eabd32bcad9 100644 --- a/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml +++ b/experiments/openebs/openebs-target-network-delay/openebs_target_network_delay_k8s_job.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: target-network-delay + name: openebs-target-network-delay data: parameters.yml: | @@ -72,5 +72,5 @@ spec: volumes: - name: parameters configMap: - name: target-network-delay + name: openebs-target-network-delay \ No newline at end of file diff --git a/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml index 72509e6e25c..bd8fd7f4cd8 100644 --- a/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml +++ b/experiments/openebs/openebs-target-network-loss/openebs_target_network_loss_k8s_job.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ConfigMap metadata: - name: target-network-loss + name: openebs-target-network-loss data: parameters.yml: | @@ -75,5 +75,5 @@ spec: volumes: - name: parameters configMap: - name: target-network-loss + name: openebs-target-network-loss \ No newline at end of file diff --git a/experiments/openebs/openebs-target-pod-failure/README.md b/experiments/openebs/openebs-target-pod-failure/README.md new file mode 100644 index 00000000000..318116c8113 --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/README.md @@ -0,0 +1,98 @@ +## Experiment Metadata + + + + + + + + + + + + + + +
Type Description Storage K8s Platform
Chaos Kill the cstor/jiva target/controller pod and check if it gets created again OPENEBS Any
+ +## Entry-Criteria + +- Application services are accessible & pods are healthy +- Application writes are successful + +## Exit-Criteria + +- Application services are accessible & pods are healthy +- Data written prior to chaos is successfully retrieved/read +- Database consistency is maintained as per db integrity check utils +- Storage target pods are healthy + +### Notes + +- Typically used as a disruptive test, to cause loss of access to storage target by killing the containers. +- The container should be created again and it should be healthy. + +## Associated Utils +- [cstor_target_failure.yaml](/experiments/openebs/openebs-target-pod-failure/cstor_target_failure.yaml) +- [jiva_controller_pod_failure.yaml](/experiments/openebs/openebs-target-pod-failure/jiva_controller_pod_failure.yaml) +- [fetch_cstor_target_pod.yml](/utils/apps/openebs/fetch_cstor_target_pod.yml) +- [fetch_jiva_controller_pod.yml](/utils/apps/openebs/fetch_jiva_controller_pod.yml) +- [fetch_sc_and_provisioner.yml](/utils/apps/openebs/fetch_sc_and_provisioner.yml) +- [target_affinity_check.yml](/utils/apps/openebs/target_affinity_check.yml) + +## Litmus experiment Environment Variables + +### Application + + + + + + + + + + + + + + + + + + + + + +
Parameter + Description
APP_NAMESPACE Namespace in which application pods are deployed
APP_LABEL Unique Labels in `key=value` format of application deployment
APP_PVC Name of persistent volume claim used for app's volume mounts
DATA_PERSISTENCE Specify the application name against which data consistency has to be ensured. Example: busybox
+ +### Procedure + +This scenario validates the behaviour of application and OpenEBS persistent volumes in the amidst of chaos induced on OpenEBS data plane and control plane components. + +After injecting the chaos into the component specified via environmental variable, litmus experiment observes the behaviour of corresponding OpenEBS PV and the application which consumes the volume. + +Based on the value of env `DATA_PERSISTENCE`, the corresponding data consistency util will be executed. At present only busybox and percona-mysql are supported. Along with specifying env in the litmus experiment, user needs to pass name for configmap and the data consistency specific parameters required via configmap in the format as follows: + +```yml + parameters.yml: | + blocksize: 4k + blockcount: 1024 + testfile: difiletest +``` + +It is recommended to pass test-name for configmap and mount the corresponding configmap as volume in the litmus pod. The above snippet holds the parameters required for validation data consistency in busybox application. + +For percona-mysql, the following parameters are to be injected into configmap. + +```yml + parameters.yml: | + dbuser: root + dbpassword: k8sDemo + dbname: tbd +``` + +The configmap data will be utilised by litmus experiments as its variables while executing the scenario. + +Based on the data provided, litmus checks if the data is consistent after recovering from induced chaos. diff --git a/experiments/openebs/openebs-target-pod-failure/chaosutil.j2 b/experiments/openebs/openebs-target-pod-failure/chaosutil.j2 new file mode 100644 index 00000000000..6b179e7cd90 --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if stg_prov is defined and stg_prov == 'openebs.io/provisioner-iscsi' %} + {% if stg_engine is defined and stg_engine == 'cstor' %} + chaosutil: /experiments/openebs/openebs-target-pod-failure/cstor_target_failure.yaml + {% else %} + chaosutil: /experiments/openebs/openebs-target-pod-failure/jiva_controller_pod_failure.yaml + {% endif %} +{% endif %} diff --git a/experiments/openebs/openebs-target-failure/cstor_target_failure.yaml b/experiments/openebs/openebs-target-pod-failure/cstor_target_failure.yaml similarity index 100% rename from experiments/openebs/openebs-target-failure/cstor_target_failure.yaml rename to experiments/openebs/openebs-target-pod-failure/cstor_target_failure.yaml diff --git a/experiments/openebs/openebs-target-pod-failure/data_persistence.j2 b/experiments/openebs/openebs-target-pod-failure/data_persistence.j2 new file mode 100644 index 00000000000..8b0e7e500cc --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/data_persistence.j2 @@ -0,0 +1,5 @@ +{% if data_persistence is defined and data_persistence == 'mysql' %} + consistencyutil: /utils/apps/mysql/mysql_data_persistence.yml + {% elif data_persistence is defined and data_persistence == 'busybox' %} + consistencyutil: /utils/apps/busybox/busybox_data_persistence.yml +{% endif %} diff --git a/experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml b/experiments/openebs/openebs-target-pod-failure/jiva_controller_pod_failure.yaml similarity index 100% rename from experiments/openebs/openebs-target-failure/jiva_controller_pod_failure.yaml rename to experiments/openebs/openebs-target-pod-failure/jiva_controller_pod_failure.yaml diff --git a/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_ansible_logic.yml 
b/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_ansible_logic.yml new file mode 100644 index 00000000000..a225f807d9f --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_ansible_logic.yml @@ -0,0 +1,160 @@ +--- +- hosts: localhost + connection: local + + vars: + a_label: "{{ lookup('env','APP_LABEL') }}" + a_ns: "{{ lookup('env','APP_NAMESPACE') }}" + a_pvc: "{{ lookup('env','APP_PVC') }}" + c_experiment: openebs-target-failure + c_force: "{{ lookup('env','FORCE') }}" + c_interval: "{{ lookup('env','CHAOS_INTERVAL') }}" + chaos_duration: "{{ lookup('env','CHAOS_DURATION') }}" + data_persistence: "{{ lookup('env','DATA_PERSISTENCE') }}" + deploy_type: "{{ lookup('env','DEPLOY_TYPE') }}" + liveness_label: "{{ lookup('env','LIVENESS_APP_LABEL') }}" + liveness_namespace: "{{ lookup('env','LIVENESS_APP_NAMESPACE') }}" + openebs_ns: "{{ lookup('env','OPENEBS_NAMESPACE') }}" + + vars_files: + - /mnt/parameters.yml + - /experiments/openebs/openebs_components.yml + + tasks: + - block: + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - include: test_prerequisites.yml + + - include_vars: + file: data_persistence.yml + + - include_vars: + file: chaosutil.yml + + - name: Record the chaos util path + set_fact: + chaos_util_path: "{{ chaosutil }}" + + - name: Record the data consistency util path + set_fact: + data_consistency_util_path: "{{ consistencyutil }}" + when: data_persistence != '' + + - include_tasks: /utils/runtime/create_testname.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-TEST IN CHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ a_ns }}" + + ## DISPLAY APP INFORMATION + + - name: Display the app information passed via the test job + debug: + msg: + - "The application info is as follows:" + - "Namespace : {{ a_ns }}" + - "Target Namespace : {{ openebs_ns }}" + - "Label : {{ a_label }}" + - "PVC : {{ a_pvc }}" + - "StorageClass : {{ sc }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the AUT (Application Under Test) is running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + - name: Get application pod name + shell: > + kubectl get pods -n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: app_pod_name + + - name: Create some test data + include: "{{ data_consistency_util_path }}" + vars: + status: 'LOAD' + ns: "{{ a_ns }}" + pod_name: "{{ app_pod_name.stdout }}" + when: data_persistence != '' + + ## STORAGE FAULT INJECTION + + - include: "{{ chaos_util_path }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: Wait (soak) for I/O on pools + wait_for: + timeout: "{{ chaos_duration }}" + + - name: Verify AUT liveness post fault-injection + include_tasks: "/utils/common/status_app_pod.yml" + vars: + application_name: "{{ app_pod_name.stdout }}" + delay: 5 + retries: 60 + + ## POST-CHAOS APPLICATION LIVENESS CHECK + - include_tasks: /utils/common/application_liveness_check.yml + when: liveness_label != '' + + - name: Get application pod name + shell: > + kubectl get pods 
-n {{ a_ns }} -l {{ a_label }} --no-headers + -o=custom-columns=NAME:".metadata.name" + args: + executable: /bin/bash + register: rescheduled_app_pod + + - name: Verify application data persistence + include: "{{ data_consistency_util_path }}" + vars: + status: 'VERIFY' + ns: "{{ a_ns }}" + pod_name: "{{ rescheduled_app_pod.stdout }}" + when: data_persistence != '' + + ## Check application-target pod affinity + - include_tasks: /utils/apps/openebs/target_affinity_check.yml + when: deploy_type == 'deployment' + + ## Check statefulset application-target pod affinity + - include_tasks: /utils/apps/openebs/sts_target_affinity_check.yml + when: deploy_type == 'statefulset' + + - set_fact: + flag: "Pass" + + rescue: + - set_fact: + flag: "Fail" + + always: + + ## RECORD END-OF-TEST IN CHAOS RESULT CR + + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ a_ns }}" diff --git a/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_k8s_job.yml b/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_k8s_job.yml new file mode 100644 index 00000000000..f7dbc1627d0 --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_k8s_job.yml @@ -0,0 +1,78 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: openebs-target-pod-failure +data: + parameters.yml: | + +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: openebs-target-pod-failure- +spec: + template: + metadata: + labels: + name: openebs-target-pod-failure + spec: + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide application namespace + - name: APP_NAMESPACE + value: '' + + # provide openebs namespace + - name: OPENEBS_NAMESPACE + value: 'openebs' + + # provide application label + - name: APP_LABEL + value: '' + + # provide application pvc + - name: APP_PVC + value: '' + + # it can be true or false, depending upon scenario - allowed force deletion or not + - name: FORCE + value: 'true' + + - name: LIVENESS_APP_LABEL + value: '' + + - name: LIVENESS_APP_NAMESPACE + value: '' + + - name: DATA_PERSISTENCE + value: '' + + - name: CHAOS_INTERVAL + value: '5' + + - name: CHAOS_DURATION + value: '120' + + # DEPLOY_TYPE values: deployment, statefulset + - name: DEPLOY_TYPE + value: 'deployment' + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/openebs/openebs-target-pod-failure/openebs_target_pod_failure_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0"] + + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + configMap: + name: openebs-target-pod-failure diff --git a/experiments/openebs/openebs-target-pod-failure/test_prerequisites.yml b/experiments/openebs/openebs-target-pod-failure/test_prerequisites.yml new file mode 100644 index 00000000000..16dbe18619b --- /dev/null +++ b/experiments/openebs/openebs-target-pod-failure/test_prerequisites.yml @@ -0,0 +1,36 @@ +--- +- name: Fetch sc and provisioner + include_tasks: /utils/apps/openebs/fetch_sc_and_provisioner.yml + +- block: + - name: Derive PV name from PVC to query storage engine type (openebs) + shell: > + kubectl get pvc {{ a_pvc }} -n {{ a_ns }} + --no-headers -o custom-columns=:spec.volumeName + args: + executable: /bin/bash + register: pv + + - name: Check for presence & value of cas type annotation 
+ shell: > + kubectl get pv {{ pv.stdout }} --no-headers + -o jsonpath="{.metadata.annotations.openebs\\.io/cas-type}" + args: + executable: /bin/bash + register: openebs_stg_engine + + - name: Record the storage engine name + set_fact: + stg_engine: "{{ openebs_stg_engine.stdout }}" + when: stg_prov == "openebs.io/provisioner-iscsi" + +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- name: Identify the data consistency util to be invoked + template: + src: data_persistence.j2 + dest: data_persistence.yml + From da6e531d731d76017280f1c7839e2a0e7d060cc3 Mon Sep 17 00:00:00 2001 From: Raj Babu Das Date: Tue, 12 Nov 2019 19:53:31 +0530 Subject: [PATCH 11/21] fixing disk loss j2 (#927) * fixing disk loss j2 Signed-off-by: Raj --- experiments/generic/disk_loss/disk_status_check.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experiments/generic/disk_loss/disk_status_check.j2 b/experiments/generic/disk_loss/disk_status_check.j2 index 5aa4c82f69c..8a5e2044252 100644 --- a/experiments/generic/disk_loss/disk_status_check.j2 +++ b/experiments/generic/disk_loss/disk_status_check.j2 @@ -22,6 +22,7 @@ {% set disk.inuse = true %} {% endif %} {% endfor %} +{% endif %} # This will append inuse: true/false {% if disk.inuse == true %} @@ -29,5 +30,4 @@ inuse: true {% else %} inuse: false {% endif %} -{% endif %} From dfd6731d493ca9a7cf854d44a5b03155f62ccdb8 Mon Sep 17 00:00:00 2001 From: Karthik Satchitanand Date: Tue, 12 Nov 2019 20:51:01 +0530 Subject: [PATCH 12/21] (feat)kafka: include kafka broker kill experiment (#926) Signed-off-by: ksatchit --- chaoslib/litmus/kill_random_pod.yml | 6 +- .../kafka-broker-pod-failure/chaosutil.j2 | 7 ++ ...kafka-broker-pod-failure-ansible-logic.yml | 112 ++++++++++++++++++ ...oker-pod-failure-ansible-prerequisites.yml | 31 +++++ .../kafka-broker-pod-failure-k8s-job.yml | 89 ++++++++++++++ .../apps/kafka/display_kafka_broker_info.yml | 7 ++ utils/apps/kafka/kafka_cluster_health.yml | 16 +++ ...fka_launch_stream_derive_leader_broker.yml | 3 + utils/apps/kafka/kafka_liveness.j2 | 54 +++++++++ utils/apps/kafka/kafka_liveness_cleanup.yml | 16 +++ utils/apps/kafka/kafka_liveness_stream.yml | 60 ++++++++++ utils/apps/kafka/kafka_select_broker.yml | 11 ++ 12 files changed, 409 insertions(+), 3 deletions(-) create mode 100644 experiments/kafka/kafka-broker-pod-failure/chaosutil.j2 create mode 100644 experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml create mode 100644 experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-prerequisites.yml create mode 100644 experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml create mode 100644 utils/apps/kafka/display_kafka_broker_info.yml create mode 100644 utils/apps/kafka/kafka_cluster_health.yml create mode 100644 utils/apps/kafka/kafka_launch_stream_derive_leader_broker.yml create mode 100644 utils/apps/kafka/kafka_liveness.j2 create mode 100644 utils/apps/kafka/kafka_liveness_cleanup.yml create mode 100644 utils/apps/kafka/kafka_liveness_stream.yml create mode 100644 utils/apps/kafka/kafka_select_broker.yml diff --git a/chaoslib/litmus/kill_random_pod.yml b/chaoslib/litmus/kill_random_pod.yml index fae73e4b3b2..2c561a3bf0a 100644 --- a/chaoslib/litmus/kill_random_pod.yml +++ b/chaoslib/litmus/kill_random_pod.yml @@ -11,14 +11,14 @@ set_fact: app_pod: "{{ pod_list.resources | random | json_query('metadata.name') }}" - when: app_pod_name is undefined + when: app_pod_name is 
undefined or app_pod_name == '' - block: - name: Record app pod set_fact: app_pod: "{{ app_pod_name }}" - when: app_pod_name is defined + when: app_pod_name is defined and app_pod_name != '' - debug: msg: "Killing pod {{ app_pod }}" @@ -41,4 +41,4 @@ - name: Wait for the interval timer pause: - seconds: "{{ c_interval }}" \ No newline at end of file + seconds: "{{ c_interval }}" diff --git a/experiments/kafka/kafka-broker-pod-failure/chaosutil.j2 b/experiments/kafka/kafka-broker-pod-failure/chaosutil.j2 new file mode 100644 index 00000000000..cbf537e7f41 --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/chaosutil.j2 @@ -0,0 +1,7 @@ +{% if c_lib is defined and c_lib == 'chaoskube' %} + c_util: /chaoslib/chaoskube/pod_failure_by_chaoskube.yml +{% elif c_lib is defined and c_lib == 'powerfulseal' %} + c_util: /chaoslib/powerfulseal/pod_failure_by_powerfulseal.yml +{% else %} + c_util: /chaoslib/litmus/pod_failure_by_litmus.yml +{% endif %} diff --git a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml new file mode 100644 index 00000000000..e07f308d255 --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml @@ -0,0 +1,112 @@ +--- +- hosts: localhost + connection: local + + vars: + c_experiment: "kafka-broker-pod-failure" + c_duration: "{{ lookup('env','TOTAL_CHAOS_DURATION') }}" + c_interval: "{{ lookup('env','CHAOS_INTERVAL') }}" + c_force: "{{ lookup('env','FORCE') }}" + c_lib: "{{ lookup('env','LIB') }}" + kafka_ns: "{{ lookup('env','KAFKA_NAMESPACE') }}" + kafka_label: "{{ lookup('env','KAFKA_LABEL') }}" + kafka_kind: "{{ lookup('env','KAFKA_KIND') }}" + kafka_broker: "{{ lookup('env','KAFKA_BROKER') }}" + kafka_stream: "{{ lookup('env','KAFKA_LIVENESS_STREAM') }}" + kafka_service: "{{ lookup('env','KAFKA_SERVICE') }}" + kafka_port: "{{ lookup('env','KAFKA_PORT') }}" + kafka_replication_factor: "{{ lookup('env','KAFKA_REPLICATION_FACTOR') }}" + zk_ns: "{{ lookup('env','ZOOKEEPER_NAMESPACE') }}" + zk_label: "{{ lookup('env','ZOOKEEPER_LABEL') }}" + zk_service: "{{ lookup('env','ZOOKEEPER_SERVICE') }}" + zk_port: "{{ lookup('env','ZOOKEEPER_PORT') }}" + + tasks: + - block: + + - include: kafka-broker-pod-failure-ansible-prerequisites.yml + + - include_vars: + file: chaosutil.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-EXPERIMENT IN LITMUSCHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ kafka_ns }}" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the Kafka cluster is healthy + include_tasks: "/utils/apps/kafka/kafka_cluster_health.yml" + vars: + delay: 1 + retries: 60 + + ## SETUP KAFKA CHAOS INFRA AND DERIVE BROKERS UNDER TEST + + - include_tasks: "{{ kafka_broker_util }}" + + ## FAULT INJECTION + + - include_tasks: "{{ c_util }}" + vars: + app_ns: "{{ kafka_ns }}" + app_label: "{{ kafka_label }}" + + # derived from the 'kafka_broker_util' task + app_pod_name: "{{ kafka_broker }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the Kafka cluster is healthy + include_tasks: "/utils/apps/kafka/kafka_cluster_health.yml" + vars: + delay: 1 + retries: 60 + + ## CHECK FOR KAFKA LIVENESS & CLEANUP + 
+ - block: + + - name: Verify that the Kafka liveness pod (pub-sub) is uninterrupted + include_tasks: "/utils/common/status_app_pod.yml" + vars: + a_ns: "{{ kafka_ns }}" + a_label: "name=kafka-liveness" + delay: 1 + retries: 60 + + - include_tasks: "/utils/apps/kafka/kafka_liveness_cleanup.yml" + + when: kafka_stream is defined and kafka_stream != '' + + - set_fact: + flag: "pass" + + + rescue: + - set_fact: + flag: "fail" + + - name: Cleanup kafka liveness pods if present + include_tasks: "/utils/apps/kafka/kafka_liveness_cleanup.yml" + ignore_errors: true + + always: + + ## RECORD END-OF-TEST IN LITMUSCHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ kafka_ns }}" + diff --git a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-prerequisites.yml b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-prerequisites.yml new file mode 100644 index 00000000000..10293ba0a95 --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-prerequisites.yml @@ -0,0 +1,31 @@ +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + +- block: + + - set_fact: + kafka_broker_util: "/utils/apps/kafka/kafka_liveness_stream.yml" + when: kafka_stream is defined and kafka_stream != '' + + - set_fact: + kafka_broker_util: "/utils/apps/kafka/display_kafka_broker_info.yml" + when: kafka_stream is not defined or kafka_stream == '' + + when: kafka_broker is defined and kafka_broker != '' + +- block: + + - set_fact: + kafka_broker_util: "/utils/apps/kafka/kafka_launch_stream_derive_leader_broker.yml" + when: kafka_stream is defined and kafka_stream != '' + + - set_fact: + kafka_broker_util: "/utils/apps/kafka/kafka_select_broker.yml" + when: kafka_stream is not defined or kafka_stream == '' + + when: kafka_broker is not defined or kafka_broker == '' + + + diff --git a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml new file mode 100644 index 00000000000..85c7311fc03 --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml @@ -0,0 +1,89 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: kafka-broker-pod-failure- +spec: + template: + metadata: + labels: + experiment: kafka-broker-pod-failure + spec: + # Placeholder that is updated by the executor for automated runs + # Provide appropriate SA (with desired permissions) if executed manually + serviceAccountName: nginx + restartPolicy: Never + containers: + - name: ansibletest + image: ksatchit/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide application namespace + - name: KAFKA_NAMESPACE + value: 'default' + + # provide application labels + - name: KAFKA_LABEL + value: 'app=cp-kafka' + + # provide application kind + - name: KAFKA_KIND + value: 'statefulset' + + - name: KAFKA_BROKER + value: '' + + - name: KAFKA_LIVENESS_STREAM + value: 'enabled' + + - name: KAFKA_REPLICATION_FACTOR + value: '3' + + - name: KAFKA_SERVICE + value: 'kafka-demo-cp-kafka-headless' + + - name: KAFKA_PORT + value: '9092' + + - name: ZOOKEEPER_NAMESPACE + value: 'default' + + # provide application labels + - name: ZOOKEEPER_LABEL + value: 'app=cp-zookeeper' + + + - name: ZOOKEEPER_SERVICE + value: 'kafka-demo-cp-zookeeper-headless' + + - name: 
ZOOKEEPER_PORT + value: '2181' + + - name: TOTAL_CHAOS_DURATION + value: '15' + + - name: CHAOS_INTERVAL + value: '5' + + - name: FORCE + value: 'true' + + ## env var that describes the library used to execute the chaos + ## default: litmus. Supported values: litmus, powerfulseal, chaoskube + - name: LIB + value: '' + + - name: CHAOSENGINE + value: '' + + - name: CHAOS_SERVICE_ACCOUNT + valueFrom: + fieldRef: + fieldPath: spec.serviceAccountName + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml -vv -i /etc/ansible/hosts; exit 0"] + diff --git a/utils/apps/kafka/display_kafka_broker_info.yml b/utils/apps/kafka/display_kafka_broker_info.yml new file mode 100644 index 00000000000..94b89fe438d --- /dev/null +++ b/utils/apps/kafka/display_kafka_broker_info.yml @@ -0,0 +1,7 @@ +- debug: + msg: "{{ kafka_broker }}" + when: kafka_broker != '' + +- debug: + msg: "kafka broker will be selected randomly across the cluster" + when: kafka_broker == '' diff --git a/utils/apps/kafka/kafka_cluster_health.yml b/utils/apps/kafka/kafka_cluster_health.yml new file mode 100644 index 00000000000..9d6e3b7de28 --- /dev/null +++ b/utils/apps/kafka/kafka_cluster_health.yml @@ -0,0 +1,16 @@ +--- +- name: Verify that all kafka pods are running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 1 + retries: 60 + a_ns: "{{ kafka_ns }}" + a_label: "{{ kafka_label }}" + +- name: Verify that all zookeeper pods are running + include_tasks: "/utils/common/status_app_pod.yml" + vars: + delay: 1 + retries: 60 + a_ns: "{{ zk_ns }}" + a_label: "{{ zk_label }}" diff --git a/utils/apps/kafka/kafka_launch_stream_derive_leader_broker.yml b/utils/apps/kafka/kafka_launch_stream_derive_leader_broker.yml new file mode 100644 index 00000000000..3df02d783e3 --- /dev/null +++ b/utils/apps/kafka/kafka_launch_stream_derive_leader_broker.yml @@ -0,0 +1,3 @@ +- include_tasks: "/utils/apps/kafka/kafka_liveness_stream.yml" +- include_tasks: "/utils/apps/kafka/kafka_select_broker.yml" +- include_tasks: "/utils/apps/kafka/display_kafka_broker_info.yml" diff --git a/utils/apps/kafka/kafka_liveness.j2 b/utils/apps/kafka/kafka_liveness.j2 new file mode 100644 index 00000000000..866f375f283 --- /dev/null +++ b/utils/apps/kafka/kafka_liveness.j2 @@ -0,0 +1,54 @@ +--- +apiVersion: v1 +kind: Pod +metadata: + name: kafka-liveness + labels: + name: kafka-liveness +spec: + initContainers: + - name: kafka-topic-creator + image: litmuschaos/kafka-client:ci + imagePullPolicy: Always + env: + - name: TOPIC_NAME + value: {{ kafka_topic }} + - name: ZOOKEEPER_SERVICE + value: {{ zk_service }} + - name: ZOOKEEPER_PORT + value: "{{ zk_port }}" + - name: REPLICATION_FACTOR + value: "{{ kafka_replication_factor }}" + command: + - sh + - -c + - "./topic.sh" + containers: + - name: kafka-producer + image: litmuschaos/kafka-client:ci + imagePullPolicy: Always + env: + - name: TOPIC_NAME + value: {{ kafka_topic }} + - name: KAFKA_SERVICE + value: {{ kafka_service }} + - name: KAFKA_PORT + value: "{{ kafka_port }}" + command: + - sh + - -c + - "./producer.sh" + - name: kafka-consumer + image: litmuschaos/kafka-client:ci + imagePullPolicy: Always + env: + - name: TOPIC_NAME + value: {{ kafka_topic }} + - name: KAFKA_SERVICE + value: {{ kafka_service }} + - name: KAFKA_PORT + value: "{{ kafka_port }}" + command: + - sh + - -c + - "./consumer.sh" diff --git a/utils/apps/kafka/kafka_liveness_cleanup.yml b/utils/apps/kafka/kafka_liveness_cleanup.yml new file 
mode 100644 index 00000000000..cc35015fc65 --- /dev/null +++ b/utils/apps/kafka/kafka_liveness_cleanup.yml @@ -0,0 +1,16 @@ +- name: Remove the Kafka liveness pod + shell: + kubectl delete -f kafka_liveness.yml -n {{ kafka_ns }} + args: + executable: /bin/bash + register: result + +- name: Confirm that the Kafka liveness pod is deleted successfully + shell: + kubectl get pod -l name=kafka-liveness --no-headers -n {{ kafka_ns }} + args: + executable: /bin/bash + register: result + until: "'Running' not in result.stdout" + delay: 1 + retries: 120 diff --git a/utils/apps/kafka/kafka_liveness_stream.yml b/utils/apps/kafka/kafka_liveness_stream.yml new file mode 100644 index 00000000000..61f0bdb26a6 --- /dev/null +++ b/utils/apps/kafka/kafka_liveness_stream.yml @@ -0,0 +1,60 @@ +- name: Generate a random strint as suffix to topic name + shell: echo $(mktemp) | cut -d '.' -f 2 + args: + executable: /bin/bash + register: uniqstr + +- name: Set the kafka topic name to a variable + set_fact: + kafka_topic: "topic-{{ uniqstr.stdout }}" + +- name: Generate the kafka liveness spec from template + template: + src: /utils/apps/kafka/kafka_liveness.j2 + dest: kafka_liveness.yml + +- name: Apply the pub-sub kafka liveness applicaton + shell: + kubectl apply -f kafka_liveness.yml -n {{ kafka_ns }} + args: + executable: /bin/bash + register: result + failed_when: "result.rc != 0" + +- name: Confirm that the kafka liveness pod is running + shell: + kubectl get pod -l name=kafka-liveness --no-headers -n {{ kafka_ns }} + args: + executable: /bin/bash + register: result + until: "'Running' in result.stdout" + delay: 1 + retries: 120 + +- name: Fetch the kafka-liveness pod name + shell: + kubectl get pods -n {{ kafka_ns }} -l name=kafka-liveness -o jsonpath='{.items[0].metadata.name}' + register: kafka_liveness_pod + +- name: Obtain the leader broker ordinality for the topic (partition) created by kafka-liveness + shell: > + kubectl exec {{ kafka_liveness_pod.stdout }} -n {{ kafka_ns }} -c kafka-consumer + -- kafka-topics --topic {{ kafka_topic }} --describe --zookeeper {{ zk_service }}:{{ zk_port }} + | grep -o 'Leader: [^[:space:]]*' | awk '{print $2}' + args: + executable: /bin/bash + register: ordinality + failed_when: "ordinality.rc != 0" + +- name: Determine the leader broker pod name + shell: + kubectl get pods -l app=cp-kafka --no-headers -o custom-columns=:metadata.name | grep '^.*-{{ ordinality.stdout }}$' + args: + executable: /bin/bash + register: leader_broker + failed_when: "result.rc != 0" + +- name: Set the kafka broker to be subjected to chaos + set_fact: + liveness_topic_leader: "{{ leader_broker.stdout }}" + diff --git a/utils/apps/kafka/kafka_select_broker.yml b/utils/apps/kafka/kafka_select_broker.yml new file mode 100644 index 00000000000..507fb72154e --- /dev/null +++ b/utils/apps/kafka/kafka_select_broker.yml @@ -0,0 +1,11 @@ +- name: select leader broker as per the liveness topic (partition) + set_fact: + kafka_broker: "{{ liveness_topic_leader }}" + when: kafka_stream is defined and kafka_stream != '' + +- name: allow random pod selection by chaosutil + set_fact: + kafka_broker: '' + when: kafka_stream is undefined or kafka_stream == '' + + From 34b4e1cd3d4603f48b99309a1266e49a6edb4c15 Mon Sep 17 00:00:00 2001 From: Karthik Satchitanand Date: Wed, 13 Nov 2019 01:09:51 +0530 Subject: [PATCH 13/21] (feat)kafka: include kafka broker disk failure experiment (#928) Signed-off-by: ksatchit --- .../disk_loss/disk_loss_ansible_logic.yml | 14 +- .../kafka-broker-disk-failure/chaosutil.j2 | 6 + 
.../disk_status_check.j2 | 33 ++++ ...afka-broker-disk-failure-ansible-logic.yml | 157 ++++++++++++++++++ ...ker-disk-failure-ansible-prerequisites.yml | 7 + .../kafka-broker-disk-failure-k8s-job.yml | 93 +++++++++++ .../kafka-broker-pod-failure-k8s-job.yml | 57 +++---- utils/common/status_app_pod.yml | 2 +- 8 files changed, 334 insertions(+), 35 deletions(-) create mode 100644 experiments/kafka/kafka-broker-disk-failure/chaosutil.j2 create mode 100644 experiments/kafka/kafka-broker-disk-failure/disk_status_check.j2 create mode 100644 experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-logic.yml create mode 100644 experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-prerequisites.yml create mode 100644 experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-k8s-job.yml diff --git a/experiments/generic/disk_loss/disk_loss_ansible_logic.yml b/experiments/generic/disk_loss/disk_loss_ansible_logic.yml index a544e579479..dc2d6e1b2f7 100644 --- a/experiments/generic/disk_loss/disk_loss_ansible_logic.yml +++ b/experiments/generic/disk_loss/disk_loss_ansible_logic.yml @@ -65,9 +65,13 @@ # Checking disk is attached to node - debug: - msg: "disk attached" + msg: "specified disk is attached to node" when: "inuse == true" + - fail: + msg: "specified disk not attached to node" + when: "inuse == false" + ## INJECTING CHAOS - name: Injecting the chaos include_tasks: "{{ c_util }}" @@ -91,9 +95,13 @@ # Checking disk is attached to node - debug: - msg: "disk attached" + msg: "specified disk is attached to node" when: "inuse == true" + - fail: + msg: "specified disk not re-attached to node" + when: "inuse == false" + - set_fact: flag: "Pass" @@ -108,4 +116,4 @@ - include_tasks: "/utils/runtime/update_chaos_result_resource.yml" vars: status: 'EOT' - namespace: "{{ c_ns }}" \ No newline at end of file + namespace: "{{ c_ns }}" diff --git a/experiments/kafka/kafka-broker-disk-failure/chaosutil.j2 b/experiments/kafka/kafka-broker-disk-failure/chaosutil.j2 new file mode 100644 index 00000000000..3cf087ab8de --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/chaosutil.j2 @@ -0,0 +1,6 @@ +# checks if cloud_platform is set or not +{% if cloud_platform is defined and cloud_platform == 'GCP' %} + c_util: /chaoslib/litmus/platform/gke/disk_loss.yml +{% elif cloud_platform is defined and cloud_platform == 'AWS' %} + c_util: /chaoslib/litmus/platform/aws/disk_loss.yml +{% endif %} diff --git a/experiments/kafka/kafka-broker-disk-failure/disk_status_check.j2 b/experiments/kafka/kafka-broker-disk-failure/disk_status_check.j2 new file mode 100644 index 00000000000..8a5e2044252 --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/disk_status_check.j2 @@ -0,0 +1,33 @@ +# All code here is not indented because j2 is space sensitive +# Initially, "inuse" is set to false +{% set disk = namespace(inuse=false) %} +# For GCP +{% if cloud_platform is defined and cloud_platform == 'GCP' %} +{% set expect_user = 'https://www.googleapis.com/compute/v1/projects/' + project_id + '/zones/' + zone_name + '/instances/' + node_name %} +# loop through all the disk users and checks if current_user is equal to expect_user +{% for current_user in disk_users.stdout_lines %} +{% if current_user == expect_user %} +# If the condition is true, then set "inuse" to true +{% set disk.inuse = true %} +{% endif %} +{% endfor %} + +# For AWS +{% elif cloud_platform is defined and cloud_platform == 'AWS' %} +{% set expect_user = node_name %} +# loop through all the disk 
users and checks if current_user is equal to expect_user +{% for current_user in disk_users.volumes %} +{% if current_user.attachment_set.instance_id == expect_user and current_user.attachment_set.status == "attached" %} +# If the condition is true, then set "inuse" to true +{% set disk.inuse = true %} +{% endif %} +{% endfor %} +{% endif %} + +# This will append inuse: true/false +{% if disk.inuse == true %} +inuse: true +{% else %} +inuse: false +{% endif %} + diff --git a/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-logic.yml b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-logic.yml new file mode 100644 index 00000000000..b5060a3c0f0 --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-logic.yml @@ -0,0 +1,157 @@ +--- +- hosts: localhost + connection: local + + vars: + c_experiment: "kafka-broker-disk-failure" + c_duration: "{{ lookup('env','TOTAL_CHAOS_DURATION') }}" + cloud_platform: "{{ lookup('env','CLOUD_PLATFORM') }}" + disk_name: "{{ lookup('env','DISK_NAME') }}" + project_id: "{{ lookup('env','PROJECT_ID') }}" + zone_name: "{{ lookup('env','ZONE_NAME') }}" + kafka_ns: "{{ lookup('env','KAFKA_NAMESPACE') }}" + kafka_label: "{{ lookup('env','KAFKA_LABEL') }}" + kafka_kind: "{{ lookup('env','KAFKA_KIND') }}" + kafka_broker: "{{ lookup('env','KAFKA_BROKER') }}" + kafka_stream: "{{ lookup('env','KAFKA_LIVENESS_STREAM') }}" + kafka_service: "{{ lookup('env','KAFKA_SERVICE') }}" + kafka_port: "{{ lookup('env','KAFKA_PORT') }}" + kafka_replication_factor: "{{ lookup('env','KAFKA_REPLICATION_FACTOR') }}" + zk_ns: "{{ lookup('env','ZOOKEEPER_NAMESPACE') }}" + zk_label: "{{ lookup('env','ZOOKEEPER_LABEL') }}" + zk_service: "{{ lookup('env','ZOOKEEPER_SERVICE') }}" + zk_port: "{{ lookup('env','ZOOKEEPER_PORT') }}" + + tasks: + - block: + + - include: kafka-broker-disk-failure-ansible-prerequisites.yml + + - include_vars: + file: chaosutil.yml + + ## GENERATE EXP RESULT NAME + - block: + + - name: Construct chaos result name (experiment_name) + set_fact: + c_experiment: "{{ lookup('env','CHAOSENGINE') }}-{{ c_experiment }}" + + when: lookup('env','CHAOSENGINE') + + ## RECORD START-OF-EXPERIMENT IN LITMUSCHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'SOT' + namespace: "{{ kafka_ns }}" + + - name: Verify mandatory Kafka broker and disk information + debug: + msg: "kafka-broker-pod: {{ kafka_broker }}; kafka-broker-disk: {{ disk_name }}" + failed_when: (kafka_broker is not defined or not kafka_broker) or (disk_name is not defined or not disk_name) + + ## PERFORM GCLOUD PLATFORM CONFIGURATION STEPS + + - name: Perform gcloud authentication + include_tasks: "/utils/cloud/gcp/gcloud_configure.yml" + when: "cloud_platform == 'GCP'" + + ## PRE-CHAOS APPLICATION LIVENESS CHECK + + - name: Verify that the Kafka cluster is healthy + include_tasks: "/utils/apps/kafka/kafka_cluster_health.yml" + vars: + delay: 1 + retries: 60 + + - name: Derive the kafka-broker node name + shell: + kubectl get pod {{ kafka_broker }} -n {{ kafka_ns }} --no-headers -o custom-columns=:spec.nodeName + args: + executable: /bin/bash + register: node + + - set_fact: + node_name: "{{ node.stdout }}" + + - name: Verify that the specified disk is connected to node + include_tasks: "/utils/cloud/gcp/status_disk.yml" + when: "cloud_platform == 'GCP'" + + - debug: + msg: "specified disk is attached to node" + when: "inuse == true" + + - fail: + msg: "specified disk not 
attached to node" + when: "inuse == false" + + ## SETUP KAFKA CHAOS INFRA (LIVENESS CLIENT) + + - include_tasks: "/utils/apps/kafka/kafka_liveness_stream.yml" + when: kafka_stream is defined and kafka_stream != '' + + ## FAULT INJECTION + + - include_tasks: "{{ c_util }}" + + ## POST-CHAOS APPLICATION LIVENESS CHECK + + ## NOTE: This is disabled at present as the recovery post re-attach (in case of mounted disks) + ## is still manual + + #- name: Verify that the Kafka cluster is healthy + # include_tasks: "/utils/apps/kafka/kafka_cluster_health.yml" + # vars: + # delay: 1 + # retries: 60 + + ## CHECK FOR KAFKA LIVENESS & CLEANUP + + - block: + + - name: Verify that the Kafka liveness pod (pub-sub) is uninterrupted + include_tasks: "/utils/common/status_app_pod.yml" + vars: + a_ns: "{{ kafka_ns }}" + a_label: "name=kafka-liveness" + delay: 1 + retries: 60 + + - include_tasks: "/utils/apps/kafka/kafka_liveness_cleanup.yml" + + when: kafka_stream is defined and kafka_stream != '' + + ## POST-CHAOS DISK LIVENESS CHECK + + - name: Verify that the disk is connected to node (post) + include_tasks: "/utils/cloud/gcp/status_disk.yml" + when: "cloud_platform == 'GCP'" + + - debug: + msg: "specified disk is attached to node" + when: "inuse == true" + + - fail: + msg: "specified disk not re-attached to kafka-broker node" + when: "inuse == false" + + - set_fact: + flag: "pass" + + rescue: + - set_fact: + flag: "fail" + + - name: Cleanup kafka liveness pods if present + include_tasks: "/utils/apps/kafka/kafka_liveness_cleanup.yml" + ignore_errors: true + + always: + + ## RECORD END-OF-TEST IN LITMUSCHAOS RESULT CR + - include_tasks: /utils/runtime/update_chaos_result_resource.yml + vars: + status: 'EOT' + namespace: "{{ kafka_ns }}" + diff --git a/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-prerequisites.yml b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-prerequisites.yml new file mode 100644 index 00000000000..1d7e14250f9 --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-prerequisites.yml @@ -0,0 +1,7 @@ +- name: Identify the chaos util to be invoked + template: + src: chaosutil.j2 + dest: chaosutil.yml + + + diff --git a/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-k8s-job.yml b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-k8s-job.yml new file mode 100644 index 00000000000..261ae947889 --- /dev/null +++ b/experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-k8s-job.yml @@ -0,0 +1,93 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + generateName: kafka-broker-disk-failure- +spec: + template: + metadata: + labels: + experiment: kafka-broker-disk-failure + spec: + # Placeholder that is updated by the executor for automated runs + # Provide appropriate SA (with desired permissions) if executed manually + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% + restartPolicy: Never + containers: + - name: ansibletest + image: litmuschaos/ansible-runner:ci + imagePullPolicy: Always + env: + - name: ANSIBLE_STDOUT_CALLBACK + value: 'default' + + # provide application kind + - name: KAFKA_KIND + value: 'statefulset' + + - name: KAFKA_LIVENESS_STREAM + value: 'enabled' + + - name: TOTAL_CHAOS_DURATION + value: '30' + + - name: CLOUD_PLATFORM + value: 'GCP' + + - name: PROJECT_ID + value: '' + + - name: DISK_NAME + value: '' + + - name: ZONE_NAME + value: '' + + - name: KAFKA_NAMESPACE + value: '' + + - name: KAFKA_LABEL + value: '' + + - 
name: KAFKA_BROKER + value: '' + + - name: KAFKA_REPLICATION_FACTOR + value: '' + + - name: KAFKA_SERVICE + value: '' + + - name: KAFKA_PORT + value: '' + + - name: ZOOKEEPER_NAMESPACE + value: '' + + - name: ZOOKEEPER_LABEL + value: '' + + - name: ZOOKEEPER_SERVICE + value: '' + + - name: ZOOKEEPER_PORT + value: '' + + - name: CHAOSENGINE + value: '' + + - name: CHAOS_SERVICE_ACCOUNT + valueFrom: + fieldRef: + fieldPath: spec.serviceAccountName + + command: ["/bin/bash"] + args: ["-c", "ansible-playbook ./experiments/kafka/kafka-broker-disk-failure/kafka-broker-disk-failure-ansible-logic.yml -vv -i /etc/ansible/hosts; exit 0"] + volumeMounts: + - name: parameters + mountPath: /mnt/ + volumes: + - name: parameters + secret: + secretName: %SECRET_NAME% + diff --git a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml index 85c7311fc03..bc95f8d9620 100644 --- a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml +++ b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml @@ -11,65 +11,60 @@ spec: spec: # Placeholder that is updated by the executor for automated runs # Provide appropriate SA (with desired permissions) if executed manually - serviceAccountName: nginx + serviceAccountName: %CHAOS_SERVICE_ACCOUNT% restartPolicy: Never containers: - name: ansibletest - image: ksatchit/ansible-runner:ci + image: litmuschaos/ansible-runner:ci imagePullPolicy: Always env: - name: ANSIBLE_STDOUT_CALLBACK value: 'default' - # provide application namespace + - name: KAFKA_KIND + value: 'statefulset' + + - name: KAFKA_LIVENESS_STREAM + value: 'enabled' + + - name: TOTAL_CHAOS_DURATION + value: '15' + + - name: CHAOS_INTERVAL + value: '5' + + - name: FORCE + value: 'true' + - name: KAFKA_NAMESPACE - value: 'default' + value: '' - # provide application labels - name: KAFKA_LABEL - value: 'app=cp-kafka' - - # provide application kind - - name: KAFKA_KIND - value: 'statefulset' + value: '' - name: KAFKA_BROKER value: '' - - name: KAFKA_LIVENESS_STREAM - value: 'enabled' - - name: KAFKA_REPLICATION_FACTOR - value: '3' + value: '' - name: KAFKA_SERVICE - value: 'kafka-demo-cp-kafka-headless' + value: '' - name: KAFKA_PORT - value: '9092' + value: '' - name: ZOOKEEPER_NAMESPACE - value: 'default' + value: '' - # provide application labels - name: ZOOKEEPER_LABEL - value: 'app=cp-zookeeper' - + value: '' - name: ZOOKEEPER_SERVICE - value: 'kafka-demo-cp-zookeeper-headless' + value: '' - name: ZOOKEEPER_PORT - value: '2181' - - - name: TOTAL_CHAOS_DURATION - value: '15' - - - name: CHAOS_INTERVAL - value: '5' - - - name: FORCE - value: 'true' + value: '' ## env var that describes the library used to execute the chaos ## default: litmus. 
Supported values: litmus, powerfulseal, chaoskube diff --git a/utils/common/status_app_pod.yml b/utils/common/status_app_pod.yml index e855ddee90e..5a9183a6a85 100644 --- a/utils/common/status_app_pod.yml +++ b/utils/common/status_app_pod.yml @@ -1,5 +1,5 @@ --- -- name: Checking {{ application_name }} pod is in running state +- name: Checking whether application pods are in running state shell: kubectl get pods -n {{ a_ns }} -l {{ a_label }} -o custom-columns=:.status.phase --no-headers register: result until: "((result.stdout.split()|unique)|length) == 1 and 'Running' in result.stdout" From ccb5d25fd81a3d3e450cb459612cddebaac57b86 Mon Sep 17 00:00:00 2001 From: Shubham Chaudhary Date: Wed, 13 Nov 2019 15:44:40 +0530 Subject: [PATCH 14/21] (feat): add readme for kafka broker pod failure experiment (#929) * (feat): add readme for kafka broker pod failure experiment Signed-off-by: shubhamchaudhary --- .../kafka/kafka-broker-pod-failure/README.md | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 experiments/kafka/kafka-broker-pod-failure/README.md diff --git a/experiments/kafka/kafka-broker-pod-failure/README.md b/experiments/kafka/kafka-broker-pod-failure/README.md new file mode 100644 index 00000000000..61dbf98dc36 --- /dev/null +++ b/experiments/kafka/kafka-broker-pod-failure/README.md @@ -0,0 +1,55 @@ +### Sample ChaosEngine manifest to execute kafka broker kill experiment + +- To override experiment defaults, add the ENV variables in `spec.components` of the experiment. + + ```yml + apiVersion: litmuschaos.io/v1alpha1 + kind: ChaosEngine + metadata: + name: kafka-chaos + namespace: default + spec: + appinfo: + appns: default + applabel: 'app=cp-kafka' + appkind: statefulset + chaosServiceAccount: kafka-sa + monitoring: false + experiments: + - name: kafka-broker-pod-failure + spec: + components: + # choose based on available kafka broker replicas + - name: KAFKA_REPLICATION_FACTOR + value: '3' + + # get via "kubectl get pods --show-labels -n <namespace>" + - name: KAFKA_LABEL + value: 'app=cp-kafka' + + - name: KAFKA_NAMESPACE + value: 'default' + + # get via "kubectl get svc -n <namespace>" + - name: KAFKA_SERVICE + value: 'kafka-cp-kafka-headless' + + # get via "kubectl get svc -n <namespace>" + - name: KAFKA_PORT + value: '9092' + + - name: ZOOKEEPER_NAMESPACE + value: 'default' + + # get via "kubectl get pods --show-labels -n <namespace>" + - name: ZOOKEEPER_LABEL + value: 'app=cp-zookeeper' + + # get via "kubectl get svc -n <namespace>" + - name: ZOOKEEPER_SERVICE + value: 'kafka-cp-zookeeper-headless' + + # get via "kubectl get svc -n <namespace>" + - name: ZOOKEEPER_PORT + value: '2181' + ``` \ No newline at end of file From 7f36c00fa31f2663b18aa1c59cfdd9e78b7a92ca Mon Sep 17 00:00:00 2001 From: Shubham Chaudhary Date: Thu, 14 Nov 2019 15:18:43 +0530 Subject: [PATCH 15/21] (feat): add ability to mount secret in executor (#931) Signed-off-by: shubhamchaudhary --- executor/README.md | 4 + executor/executor.yml | 18 +- executor/experiment_configmap.yml | 163 +++++++++++++----- .../generic/disk_loss/disk_loss_k8s_job.yml | 2 +- utils/cloud/gcp/gcloud_configure.yml | 2 +- 5 files changed, 144 insertions(+), 45 deletions(-) diff --git a/executor/README.md b/executor/README.md index 7ddf841d206..7054c1741d9 100644 --- a/executor/README.md +++ b/executor/README.md @@ -132,3 +132,7 @@ - The name of file which contains data for configmap in experimentCR should be parameters.yml - The configmap is mounted in this default directory: /mnt/ + +- Executor is currently unable to parse more than one secret. 
+ +- The secret is mounted in this default directory: /tmp/ diff --git a/executor/executor.yml b/executor/executor.yml index 924cbf8f1d3..4d5207840cc 100644 --- a/executor/executor.yml +++ b/executor/executor.yml @@ -65,13 +65,25 @@ executable: /bin/bash register: c_job_args -- name: Fetching data for the configmap +- name: Check Availability of configmaps shell: > kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].data.parameters\.yml}' args: executable: /bin/bash register: configMap_available +- name: Check Availability of secrets + shell: > + kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.secrets[0].name}' + args: + executable: /bin/bash + register: secret_available + +- name: Record availability of configmaps and secrets + set_fact: + configMap_available: "{{ configMap_available.stdout }}" + secret_available: "{{ secret_available.stdout }}" + - include: experiment_env_getter.yml with_sequence: start=0 count="{{c_env_length.stdout | int}}" @@ -138,10 +150,10 @@ --labels={{c_job_labels.stdout}} {{c_env_list}} --command -- /bin/bash {{c_job_args.stdout}} args: executable: /bin/bash - when: configMap_available.stdout == '' + when: configMap_available == '' and secret_available == '' - include: experiment_configmap.yml - when: configMap_available.stdout != '' + when: configMap_available != '' or secret_available != '' - name: Monitoring the litmus chaos job for completion shell: > diff --git a/executor/experiment_configmap.yml b/executor/experiment_configmap.yml index e972bce203b..c1315e6d374 100644 --- a/executor/experiment_configmap.yml +++ b/executor/experiment_configmap.yml @@ -1,23 +1,44 @@ -- name: Fetching data for the configmap - shell: > - kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].data.parameters\.yml}' > parameters.yml - args: - executable: /bin/bash +- block: + - name: Fetching data for the configmap + shell: > + kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].data.parameters\.yml}' > parameters.yml + args: + executable: /bin/bash -- name: Fetching name of configmap - shell: > - kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].name}' - args: - executable: /bin/bash - register: c_map_name + - name: Fetching name of configmap + shell: > + kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.configmaps[0].name}' + args: + executable: /bin/bash + register: c_map_name -- name: Creating configmap - shell: - kubectl create configmap {{c_map_name.stdout}} --from-file=parameters.yml -n {{c_app_ns}} - args: - executable: /bin/bash + - name: Creating configmap + shell: + kubectl create configmap {{c_map_name.stdout}} --from-file=parameters.yml -n {{c_app_ns}} + args: + executable: /bin/bash + + when: configMap_available != '' + +- block: + + - name: Fetching name of secret + shell: > + kubectl get chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.secrets[0].name}' + args: + executable: /bin/bash + register: c_secret_name + + - name: Fetching mount path for secret + shell: > + kubectl get 
chaosexperiment -n {{ c_app_ns }} -o jsonpath='{.items[?(@.metadata.name=="{{ c_experiment_name }}")].spec.definition.secrets[0].mountPath}' + args: + executable: /bin/bash + register: c_mount_path + + when: secret_available != '' -- name: Run the chaos experiment job +- name: Get the job yaml shell: kubectl run {{ c_experiment_name }}-{{random_string.stdout}} --restart=OnFailure --image={{c_image.stdout}} --namespace={{c_app_ns}} --serviceaccount={{ c_svc_acc }} --image-pull-policy=Always @@ -25,32 +46,94 @@ args: executable: /bin/bash -- name: Include the volumeMounts in jobYml - lineinfile: - dest: cjob.yml - insertafter: "resources: {}" - state: present - line: ' {{item}}' - with_items: - - " mountPath: /mnt/" - - "- name: parameters" - - "volumeMounts:" - -- name: Include the volumes in jobYml - lineinfile: - dest: cjob.yml - insertafter: "serviceAccountName" - state: present - line: ' {{item}}' - with_items: - - " name: {{c_map_name.stdout}}" - - " configMap:" - - "- name: parameters" - - "volumes:" +- block: + + - name: Include the volumeMounts in jobYml - configmap only + lineinfile: + dest: cjob.yml + insertafter: "resources: {}" + state: present + line: ' {{item}}' + with_items: + - " mountPath: /mnt/" + - "- name: parameters" + - "volumeMounts:" + + - name: Include the volumes in jobYml - configmap only + lineinfile: + dest: cjob.yml + insertafter: "serviceAccountName" + state: present + line: ' {{item}}' + with_items: + - " name: {{c_map_name.stdout}}" + - " configMap:" + - "- name: parameters" + - "volumes:" + + when: configMap_available != '' and secret_available == '' + +- block: + + - name: Include the volumeMounts in jobYml - secret only + lineinfile: + dest: cjob.yml + insertafter: "resources: {}" + state: present + line: ' {{item}}' + with_items: + - " mountPath: {{ c_mount_path.stdout }}" + - "- name: cloud-config" + - "volumeMounts:" + + - name: Include the volumes in jobYml + lineinfile: + dest: cjob.yml + insertafter: "serviceAccountName" + state: present + line: ' {{item}}' + with_items: + - " secretName: {{ c_secret_name.stdout }}" + - " secret:" + - "- name: cloud-config" + - "volumes:" + + when: configMap_available == '' and secret_available != '' + +- block: + + - name: Include the volumeMounts in jobYml - secret and configmap both + lineinfile: + dest: cjob.yml + insertafter: "resources: {}" + state: present + line: ' {{item}}' + with_items: + - " mountPath: {{ c_mount_path.stdout }}" + - "- name: cloud-config" + - " mountPath: /mnt/" + - "- name: parameters" + - "volumeMounts:" + + - name: Include the volumes in jobYml + lineinfile: + dest: cjob.yml + insertafter: "serviceAccountName" + state: present + line: ' {{item}}' + with_items: + - " secretName: {{ c_secret_name.stdout }}" + - " secret:" + - "- name: cloud-config" + - " name: {{c_map_name.stdout}}" + - " configMap:" + - "- name: parameters" + - "volumes:" + + when: configMap_available != '' and secret_available != '' - name: create job shell: kubectl create -f cjob.yml -n {{c_app_ns}} args: executable: /bin/bash - diff --git a/experiments/generic/disk_loss/disk_loss_k8s_job.yml b/experiments/generic/disk_loss/disk_loss_k8s_job.yml index fa6c6bafef1..8a5b25993c0 100644 --- a/experiments/generic/disk_loss/disk_loss_k8s_job.yml +++ b/experiments/generic/disk_loss/disk_loss_k8s_job.yml @@ -74,7 +74,7 @@ spec: args: ['-c', 'ansible-playbook ./experiments/generic/disk_loss/disk_loss_ansible_logic.yml -i /etc/ansible/hosts -vv; exit 0'] volumeMounts: - name: parameters - mountPath: /mnt/ + mountPath: /tmp/ 
volumes: - name: parameters # Enter the secret name of the service account, you want to mount diff --git a/utils/cloud/gcp/gcloud_configure.yml b/utils/cloud/gcp/gcloud_configure.yml index 042a098aa60..afd8678c7bd 100644 --- a/utils/cloud/gcp/gcloud_configure.yml +++ b/utils/cloud/gcp/gcloud_configure.yml @@ -1,5 +1,5 @@ - name: authenticate gcloud service account - shell: gcloud auth activate-service-account --key-file=/mnt/cloud_config.yml + shell: gcloud auth activate-service-account --key-file=/tmp/cloud_config.yml - name: Gcloud project setting shell: gcloud config set project {{ project_id }} From ebd4a58297c53d8f23187f714cf0097ae1b67a12 Mon Sep 17 00:00:00 2001 From: Karthik Satchitanand Date: Thu, 14 Nov 2019 20:02:10 +0530 Subject: [PATCH 16/21] (refactor)kafka: introduce kafka_instance_name env to support liveness checks on kudo kafka (#938) Signed-off-by: ksatchit --- .../kafka-broker-pod-failure-ansible-logic.yml | 1 + .../kafka-broker-pod-failure-k8s-job.yml | 3 +++ utils/apps/kafka/kafka_liveness.j2 | 2 ++ 3 files changed, 6 insertions(+) diff --git a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml index e07f308d255..f33b4b4bf7f 100644 --- a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml +++ b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-ansible-logic.yml @@ -8,6 +8,7 @@ c_interval: "{{ lookup('env','CHAOS_INTERVAL') }}" c_force: "{{ lookup('env','FORCE') }}" c_lib: "{{ lookup('env','LIB') }}" + kafka_instance: "{{ lookup('env','KAFKA_INSTANCE_NAME') }}" kafka_ns: "{{ lookup('env','KAFKA_NAMESPACE') }}" kafka_label: "{{ lookup('env','KAFKA_LABEL') }}" kafka_kind: "{{ lookup('env','KAFKA_KIND') }}" diff --git a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml index bc95f8d9620..d739d78d57f 100644 --- a/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml +++ b/experiments/kafka/kafka-broker-pod-failure/kafka-broker-pod-failure-k8s-job.yml @@ -36,6 +36,9 @@ spec: - name: FORCE value: 'true' + - name: KAFKA_INSTANCE_NAME + value: '' + - name: KAFKA_NAMESPACE value: '' diff --git a/utils/apps/kafka/kafka_liveness.j2 b/utils/apps/kafka/kafka_liveness.j2 index 866f375f283..18261e0462c 100644 --- a/utils/apps/kafka/kafka_liveness.j2 +++ b/utils/apps/kafka/kafka_liveness.j2 @@ -13,6 +13,8 @@ spec: env: - name: TOPIC_NAME value: {{ kafka_topic }} + - name: KAFKA_INSTANCE_NAME + value: {{ kafka_instance }} - name: ZOOKEEPER_SERVICE value: {{ zk_service }} - name: ZOOKEEPER_PORT From d639ca67fbdbf4ce2a3d688e24a1f8ab67be1e58 Mon Sep 17 00:00:00 2001 From: Karthik Satchitanand Date: Thu, 14 Nov 2019 21:33:52 +0530 Subject: [PATCH 17/21] (refactor)kafka-liveness: use correct zookeeper uri for kudo kafka cluster (#939) Signed-off-by: ksatchit --- utils/apps/kafka/kafka_liveness_stream.yml | 35 ++++++++++++++++------ 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/utils/apps/kafka/kafka_liveness_stream.yml b/utils/apps/kafka/kafka_liveness_stream.yml index 61f0bdb26a6..1f9ca850714 100644 --- a/utils/apps/kafka/kafka_liveness_stream.yml +++ b/utils/apps/kafka/kafka_liveness_stream.yml @@ -36,15 +36,32 @@ kubectl get pods -n {{ kafka_ns }} -l name=kafka-liveness -o jsonpath='{.items[0].metadata.name}' register: kafka_liveness_pod -- name: Obtain the leader 
broker ordinality for the topic (partition) created by kafka-liveness - shell: > - kubectl exec {{ kafka_liveness_pod.stdout }} -n {{ kafka_ns }} -c kafka-consumer - -- kafka-topics --topic {{ kafka_topic }} --describe --zookeeper {{ zk_service }}:{{ zk_port }} - | grep -o 'Leader: [^[:space:]]*' | awk '{print $2}' - args: - executable: /bin/bash - register: ordinality - failed_when: "ordinality.rc != 0" +- block: + + - name: Obtain the leader broker ordinality for the topic (partition) created by kafka-liveness + shell: > + kubectl exec {{ kafka_liveness_pod.stdout }} -n {{ kafka_ns }} -c kafka-consumer + -- kafka-topics --topic {{ kafka_topic }} --describe --zookeeper {{ zk_service }}:{{ zk_port }} + | grep -o 'Leader: [^[:space:]]*' | awk '{print $2}' + args: + executable: /bin/bash + register: ordinality + failed_when: "ordinality.rc != 0" + when: kafka_instance is not defined or kafka_instance == '' + +- block: + + - name: Obtain the leader broker ordinality for the topic (partition) created by kafka-liveness + shell: > + kubectl exec {{ kafka_liveness_pod.stdout }} -n {{ kafka_ns }} -c kafka-consumer + -- kafka-topics --topic {{ kafka_topic }} --describe --zookeeper {{ zk_service }}:{{ zk_port }}/{{ kafka_instance }} + | grep -o 'Leader: [^[:space:]]*' | awk '{print $2}' + args: + executable: /bin/bash + register: ordinality + failed_when: "ordinality.rc != 0" + when: kafka_instance is defined and kafka_instance != '' + - name: Determine the leader broker pod name shell: From b79fb4a33e7718c9a10cef60994c5a989d3e8a26 Mon Sep 17 00:00:00 2001 From: Karthik Satchitanand Date: Thu, 14 Nov 2019 22:29:33 +0530 Subject: [PATCH 18/21] (fix)kafka: remove label hardcoding in liveness util (#940) Signed-off-by: ksatchit --- utils/apps/kafka/kafka_liveness_stream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/apps/kafka/kafka_liveness_stream.yml b/utils/apps/kafka/kafka_liveness_stream.yml index 1f9ca850714..b912aa674fc 100644 --- a/utils/apps/kafka/kafka_liveness_stream.yml +++ b/utils/apps/kafka/kafka_liveness_stream.yml @@ -65,7 +65,7 @@ - name: Determine the leader broker pod name shell: - kubectl get pods -l app=cp-kafka --no-headers -o custom-columns=:metadata.name | grep '^.*-{{ ordinality.stdout }}$' + kubectl get pods -l {{ kafka_label }} --no-headers -o custom-columns=:metadata.name | grep '^.*-{{ ordinality.stdout }}$' args: executable: /bin/bash register: leader_broker From 857d04a861781c980081ad64a297da50f16b56a2 Mon Sep 17 00:00:00 2001 From: Karthik Satchitanand Date: Thu, 14 Nov 2019 23:59:13 +0530 Subject: [PATCH 19/21] (fix)set kafka ordinality variable based on ansible fact (#941) Signed-off-by: ksatchit --- utils/apps/kafka/kafka_liveness_stream.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/utils/apps/kafka/kafka_liveness_stream.yml b/utils/apps/kafka/kafka_liveness_stream.yml index b912aa674fc..d0fc24536e4 100644 --- a/utils/apps/kafka/kafka_liveness_stream.yml +++ b/utils/apps/kafka/kafka_liveness_stream.yml @@ -45,8 +45,12 @@ | grep -o 'Leader: [^[:space:]]*' | awk '{print $2}' args: executable: /bin/bash - register: ordinality + register: ordinality_non_instance failed_when: "ordinality.rc != 0" + + - set_fact: + ordinality: "{{ ordinality_non_instance.stdout }}" + when: kafka_instance is not defined or kafka_instance == '' - block: @@ -58,8 +62,12 @@ | grep -o 'Leader: [^[:space:]]*' | awk '{print $2}' args: executable: /bin/bash - register: ordinality + register: ordinality_instance failed_when: 
"ordinality.rc != 0" + + - set_fact: + ordinality: "{{ ordinality_instance.stdout }}" + when: kafka_instance is defined and kafka_instance != '' From 688171dcd4692508f6e10972faff29a9c334d965 Mon Sep 17 00:00:00 2001 From: Karthik Satchitanand Date: Fri, 15 Nov 2019 00:24:03 +0530 Subject: [PATCH 20/21] (fix)kafka liveness util typo (#942) Signed-off-by: ksatchit --- utils/apps/kafka/kafka_liveness_stream.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/apps/kafka/kafka_liveness_stream.yml b/utils/apps/kafka/kafka_liveness_stream.yml index d0fc24536e4..7a5898b1180 100644 --- a/utils/apps/kafka/kafka_liveness_stream.yml +++ b/utils/apps/kafka/kafka_liveness_stream.yml @@ -46,7 +46,7 @@ args: executable: /bin/bash register: ordinality_non_instance - failed_when: "ordinality.rc != 0" + failed_when: "ordinality_non_instance.rc != 0" - set_fact: ordinality: "{{ ordinality_non_instance.stdout }}" @@ -63,7 +63,7 @@ args: executable: /bin/bash register: ordinality_instance - failed_when: "ordinality.rc != 0" + failed_when: "ordinality_instance.rc != 0" - set_fact: ordinality: "{{ ordinality_instance.stdout }}" @@ -77,7 +77,7 @@ args: executable: /bin/bash register: leader_broker - failed_when: "result.rc != 0" + failed_when: "leader_broker.rc != 0" - name: Set the kafka broker to be subjected to chaos set_fact: From 405a77d6ee338cc36378cad3c6c041d98938bd9a Mon Sep 17 00:00:00 2001 From: Karthik Satchitanand Date: Fri, 15 Nov 2019 00:46:10 +0530 Subject: [PATCH 21/21] (fix) typo in usage of fact name (#943) Signed-off-by: ksatchit --- utils/apps/kafka/kafka_liveness_stream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/apps/kafka/kafka_liveness_stream.yml b/utils/apps/kafka/kafka_liveness_stream.yml index 7a5898b1180..50e1faf666a 100644 --- a/utils/apps/kafka/kafka_liveness_stream.yml +++ b/utils/apps/kafka/kafka_liveness_stream.yml @@ -73,7 +73,7 @@ - name: Determine the leader broker pod name shell: - kubectl get pods -l {{ kafka_label }} --no-headers -o custom-columns=:metadata.name | grep '^.*-{{ ordinality.stdout }}$' + kubectl get pods -l {{ kafka_label }} --no-headers -o custom-columns=:metadata.name | grep '^.*-{{ ordinality }}$' args: executable: /bin/bash register: leader_broker