Skip to content

Commit

Permalink
chore(feat): Add kubelet service kill experiment in generic experimen…
Browse files Browse the repository at this point in the history
…t list (#1542)

* chore(feat): Add kubelet service kill experiment in generic experiment list

Signed-off-by: Udit Gaurav <uditgaurav@gmail.com>
  • Loading branch information
uditgaurav committed Jun 14, 2020
1 parent 39df140 commit 57f88a5
Show file tree
Hide file tree
Showing 7 changed files with 426 additions and 0 deletions.
42 changes: 42 additions & 0 deletions chaoslib/litmus/kubelet_service_kill/kubelet-service-kill.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
---
# Helper pod manifest (Jinja2 template) for the kubelet service kill chaos.
# The pod is pinned to the target node, runs privileged with the host's
# /var/run mounted, stops the kubelet via systemctl, sleeps for the chaos
# duration and then starts the kubelet again.
#
# Template variables: run_id, chaos_uid (optional), node_name, c_duration.
apiVersion: v1
kind: Pod
metadata:
  name: service-kill-{{ run_id }}
  labels:
    app: service-kill
    name: service-kill-{{ run_id }}
{% if chaos_uid is defined and chaos_uid != '' %}
    # Quoted so the label value is always a string after rendering.
    chaosUID: "{{ chaos_uid }}"
{% endif %}
spec:
  # Quoted so a node name that looks like a number/boolean is not retyped.
  nodeName: "{{ node_name }}"
  restartPolicy: Never
  containers:
    - name: service-kill
      # NOTE(review): image is hard-coded; the experiment playbook reads a
      # LIB_IMAGE env var that is not consumed here - confirm whether it
      # was meant to be used for this image.
      image: ubuntu:16.04
      command: ["/bin/bash"]
      # The initial 10s sleep precedes the stop; the kubelet is started
      # again once the chaos duration has elapsed.
      args: ["-c", "sleep 10 && systemctl stop kubelet && sleep {{ c_duration }} && systemctl start kubelet"]
      resources:
        requests:
          cpu: 10m
          memory: 5M
        limits:
          cpu: 100m
          memory: 20M
      volumeMounts:
        # Host /var/run mounted over the container's /var/run
        # (presumably so systemctl can reach the host's systemd - confirm).
        - name: bus
          mountPath: /var/run
        - name: root
          mountPath: /node
      securityContext:
        privileged: true
      tty: true
  volumes:
    - name: bus
      hostPath:
        path: /var/run
    - name: root
      hostPath:
        path: /
        type: ""
145 changes: 145 additions & 0 deletions chaoslib/litmus/kubelet_service_kill/kubelet_service_kill.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
---
# Chaos lib: kubelet service kill.
#
# Flow:
#   1. Pick an application pod (unless app_pod is already set) and find the
#      node it runs on.
#   2. Generate a run_id (unless provided) and render the helper pod manifest
#      to /tmp/kubelet-service-kill.yml, pinned to that node.
#   3. Create the helper pod in the chaos namespace (c_ns), up to 3 attempts.
#   4. Wait for the node to become NotReady, wait for the chaos duration,
#      then wait for the node to become Ready again.
#   5. Delete the helper pod and confirm it is gone.
# On any failure the rescue section attempts the same clean-up and then
# fails the lib explicitly.
#
# Expected variables: a_label, a_ns, c_ns, c_duration, c_experiment,
# chaos_pod_name; optional: app_pod, run_id, c_engine.
- block:

    - block:

        - name: "[Prepare]: Select the application pod name"
          shell: >
            kubectl get pod -l {{ a_label }} -n {{ a_ns }}
            -o=custom-columns=:metadata.name --no-headers
            | shuf | head -1
          args:
            executable: /bin/bash
          register: app_pod_name

        - name: "[Prepare]: Recording the application pod name"
          set_fact:
            app_pod: "{{ app_pod_name.stdout }}"

      when: "app_pod is not defined or app_pod == ''"

    - name: "[Prepare]: Identify the application node name"
      shell: >
        kubectl get pod {{ app_pod }} -n {{ a_ns }}
        --no-headers -o custom-columns=:spec.nodeName
      args:
        executable: /bin/bash
      register: app_node

    # Replaces the registered command result with just the node name string.
    - name: "[Prepare]: Record the application node name"
      set_fact:
        app_node: "{{ app_node.stdout }}"

    - block:
        - name: "[Prepare:] Generate a run id if not passed from the engine/experiment"
          shell: echo $(mktemp) | cut -d '.' -f 2 | cut -c -6
          register: rand_string

        - set_fact:
            run_id: "{{ rand_string.stdout | lower }}"
      when: "run_id is not defined or run_id == ''"

    - name: "[Event]: Generating an Event for ChaosInjection"
      include_tasks: /utils/common/generate-kubernetes-chaos-events.yml
      vars:
        stage: "ChaosInject"
        exp_pod_name: "{{ chaos_pod_name }}"
        engine_ns: "{{ c_ns }}"
        message: "Injecting {{ c_experiment }} chaos on {{ app_node }} node"
      when: "c_engine is defined and c_engine != ''"

    # node_name is scoped to this task only; later tasks must use app_node.
    - name: "[Prepare]: Patch the run_id to kubelet service kill helper pod template"
      template:
        src: /chaoslib/litmus/kubelet_service_kill/kubelet-service-kill.j2
        dest: /tmp/kubelet-service-kill.yml
      vars:
        node_name: "{{ app_node }}"

    # Setting pod_running_status to nil
    - set_fact:
        pod_running_status: ""

    # Kubelet service kill pod creation is attempted for a total of 3 times, if it is not immediately schedulable due to transient node conditions
    # If the kubelet-service-kill pod is not schedulable across these 3 tries, the experiment is failed with message indicating improper cluster state.
    - name: "[Prepare]: Including the util to create the chaos pod"
      include_tasks: /utils/common/create_chaos_pod.yml
      vars:
        pod_ns: "{{ c_ns }}"
        c_path: "/tmp/kubelet-service-kill.yml"
        pod_label: "name=service-kill-{{ run_id }}"
      with_sequence: start=1 end=3

    # Failing the execution, if the kubelet-service-kill pod won't come to running state after three retries.
    # The message uses app_node: node_name is only defined on the template task above.
    - fail:
        msg: "kubelet_service_kill lib failed, Unable to create as kubelet_service_kill pod couldn't be scheduled on the {{ app_node }} node"
      when: "pod_running_status is not defined or pod_running_status != 'Running'"

    # Polls every 2s, up to 90 times, for the node STATUS column to read NotReady.
    - name: "[Status]: Waiting for node to get in NotReady state"
      shell: |
        kubectl get nodes {{ app_node }} --no-headers | awk '{print$2}'
      args:
        executable: /bin/bash
      register: node_state
      until: node_state.stdout == 'NotReady'
      delay: 2
      retries: 90

    - name: "[Wait]: Wait for the chaos duration of {{ c_duration }}s"
      wait_for:
        timeout: "{{ c_duration }}"

    # Polls every 2s, up to 90 times, for the node STATUS column to read Ready.
    - name: "[Status]: Checking the node status after chaos"
      shell: |
        kubectl get nodes {{ app_node }} --no-headers | awk '{print$2}'
      args:
        executable: /bin/bash
      register: node_state
      until: node_state.stdout == 'Ready'
      delay: 2
      retries: 90

    - name: "[CleanUP]: Tear down service kill infra"
      shell: >
        kubectl delete -f /tmp/kubelet-service-kill.yml -n {{ c_ns }}
      args:
        executable: /bin/bash
      register: result

    # The helper pod lives in c_ns (see pod_ns above and the delete task),
    # so its termination must be checked in c_ns as well.
    - name: "[Status]: Confirm that the svc chaos helper pod is teminated successfully"
      shell: >
        kubectl get pod -l name=service-kill-{{ run_id }} --no-headers -o custom-columns=:status.phase -n {{ c_ns }} | sort | uniq
      args:
        executable: /bin/bash
      register: result_status
      until: result_status.stdout ==''
      delay: 2
      retries: 90

  rescue:

    # Clean-up is only attempted when the helper pod was actually running.
    - block:

        - name: "[CleanUP]: Tear down service kill infra"
          shell: >
            kubectl delete -f /tmp/kubelet-service-kill.yml -n {{ c_ns }}
          args:
            executable: /bin/bash
          register: result
          when: "chaos_pod_result.rc == 0"

        # Checked in c_ns, the namespace the helper pod was created in.
        - name: "[Status]: Confirm that the svc chaos helper pod is teminated successfully"
          shell: >
            kubectl get pod -l name=service-kill-{{ run_id }} --no-headers -o custom-columns=:status.phase -n {{ c_ns }} | sort | uniq
          args:
            executable: /bin/bash
          register: result_status
          until: result_status.stdout ==''
          delay: 2
          retries: 90

      when: "(pod_running_status is defined and pod_running_status == 'Running') and chaos_pod_result is defined"

    # Always fail after rescue so the caller sees the lib as failed.
    - fail:
        msg: "kubelet_service_kill lib failed"
      when: true

14 changes: 14 additions & 0 deletions experiments/generic/kubelet_service_kill/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
## Experiment Metadata

<table>
<tr>
<th> Name </th>
<th> Description </th>
<th> Documentation Link </th>
</tr>
<tr>
<td> Kubelet Service Kill </td>
<td> This experiment stops the kubelet service on a node for a specified chaos duration and then starts it again. It aims to verify the resiliency of applications whose replicas may be evicted or become unreachable on account of nodes turning unschedulable (Not Ready) while the kubelet service is down. </td>
<td> <a href=""> Added soon </a> </td>
</tr>
</table>
3 changes: 3 additions & 0 deletions experiments/generic/kubelet_service_kill/chaosutil.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{# Emits the c_util task-file path when the litmus chaos lib is selected; renders nothing otherwise. -#}
{% if c_lib | default('') == 'litmus' %}
c_util: "/chaoslib/litmus/kubelet_service_kill/kubelet_service_kill.yml"
{% endif %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
---
# Experiment playbook: kubelet-service-kill.
#
# Runs locally and drives the experiment end to end:
#   - resolves which chaos lib task file to use (c_util),
#   - records start-of-test in the chaos result,
#   - verifies the application (and optional auxiliary applications) before
#     chaos, runs the chaos lib, then verifies them again after chaos,
#   - sets flag to Pass/Fail and always records end-of-test.
#
# All inputs are read from environment variables (see vars below).
- hosts: localhost
  connection: local

  vars:
    c_experiment: "kubelet-service-kill"
    c_duration: "{{ lookup('env','TOTAL_CHAOS_DURATION') }}"
    ramp_time: "{{ lookup('env','RAMP_TIME') }}"
    a_ns: "{{ lookup('env','APP_NAMESPACE') }}"
    a_label: "{{ lookup('env','APP_LABEL') }}"
    a_kind: "{{ lookup('env','APP_KIND') }}"
    lib_image: "{{ lookup('env','LIB_IMAGE') }}"
    # Comma-separated list of "namespace:label" entries (split below).
    auxiliary_appinfo: "{{ lookup('env','AUXILIARY_APPINFO') }}"
    chaos_uid: "{{ lookup('env','CHAOS_UID') }}"
    c_engine: "{{ lookup('env','CHAOSENGINE') }}"
    chaos_pod_name: "{{ lookup('env','POD_NAME') }}"
    c_ns: "{{ lookup('env','CHAOS_NAMESPACE') }}"
    c_lib: "{{ lookup('env','LIB') }}"

  tasks:

    - block:

        ## DETERMINE THE CHAOSLIB TASKFILES TO BE USED
        # include_tasks is used throughout; the bare include is deprecated.
        - include_tasks: kubelet_service_kill_ansible_prerequisites.yml

        - name: "[PreReq]: Including the chaos util for the {{ c_experiment }} experiment"
          include_vars:
            file: /tmp/chaosutil.yml

        ## GENERATE EXPERIMENT RESULT NAME
        - name: "[PreReq]: Constructing the chaos result name"
          set_fact:
            c_result: "{{ c_engine }}-{{ c_experiment }}"
          when: "c_engine is defined and c_engine != ''"

        ## RECORD START-OF-EXPERIMENT IN LITMUSCHAOS RESULT CR
        - name: "[PreReq]: Updating the chaos result of {{ c_experiment }} experiment (SOT)"
          include_tasks: /utils/runtime/update_chaos_result_resource.yml
          vars:
            status: 'SOT'
            namespace: "{{ c_ns }}"

        ## DISPLAY APP INFORMATION
        - name: "[Info]: Display the application information passed via the test job"
          debug:
            msg:
              - "The application info is as follows:"
              - "Namespace : {{ a_ns }}"
              - "Label : {{ a_label }}"
              - "Ramp Time : {{ ramp_time }}"

        ## PRE-CHAOS APPLICATION STATUS CHECK
        - name: "[Status]: Verify that the AUT (Application Under Test) is running (pre-chaos)"
          include_tasks: "/utils/common/status_app_pod.yml"
          vars:
            app_ns: "{{ a_ns }}"
            app_label: "{{ a_label }}"
            delay: 2
            retries: 90

        # Auxiliary application health check status
        - block:

            - name: Record auxiliary appinfo
              set_fact:
                auxiliary_appinfo_list: "{{ auxiliary_appinfo.split(',') }}"

            - name: "[Status]: Verify that the Auxiliary Applications are running (pre-chaos)"
              include_tasks: /utils/common/status_app_pod.yml
              vars:
                app_ns: "{{ item.split(':')[0] }}"
                app_label: "{{ item.split(':')[1] }}"
                delay: 2
                retries: 90
              with_items:
                - "{{ auxiliary_appinfo_list }}"

          when: auxiliary_appinfo is defined and auxiliary_appinfo != ''

        ## RECORD EVENT FOR PRE-CHAOS CHECK
        - name: "[Event]: Generating an Event for PreChaosCheck"
          include_tasks: /utils/common/generate-kubernetes-chaos-events.yml
          vars:
            stage: "PreChaosCheck"
            exp_pod_name: "{{ chaos_pod_name }}"
            engine_ns: "{{ c_ns }}"
            message: "AUT is Running successfully"
          when: "c_engine is defined and c_engine != ''"

        ## READY TO START SERVICE CHAOS
        # c_util comes from /tmp/chaosutil.yml loaded above.
        - name: "[Prepare]: Including the kubelet service kill lib"
          include_tasks: "{{ c_util }}"

        ## POST-CHAOS APPLICATION STATUS CHECK

        - name: "[Status]: Verify that the AUT (Application Under Test) is running (post-chaos)"
          include_tasks: "/utils/common/status_app_pod.yml"
          vars:
            app_ns: "{{ a_ns }}"
            app_label: "{{ a_label }}"
            delay: 2
            retries: 90

        # Auxiliary application health check status
        # Reuses auxiliary_appinfo_list recorded during the pre-chaos check.
        - block:

            - name: "[Status]: Verify that the Auxiliary Applications are running (post-chaos)"
              include_tasks: /utils/common/status_app_pod.yml
              vars:
                app_ns: "{{ item.split(':')[0] }}"
                app_label: "{{ item.split(':')[1] }}"
                delay: 2
                retries: 90
              with_items:
                - "{{ auxiliary_appinfo_list }}"

          when: auxiliary_appinfo is defined and auxiliary_appinfo != ''

        ## RECORD EVENT FOR POST-CHAOS CHECK
        - name: "[Event]: Generating an Event for PostChaosCheck"
          include_tasks: /utils/common/generate-kubernetes-chaos-events.yml
          vars:
            stage: "PostChaosCheck"
            exp_pod_name: "{{ chaos_pod_name }}"
            engine_ns: "{{ c_ns }}"
            message: "AUT is Running successfully"
          when: "c_engine is defined and c_engine != ''"

        - set_fact:
            flag: "Pass"

        - name: "[Result]: Getting the final result of {{ c_experiment }} experiment"
          debug:
            msg: "{{ c_experiment }} experiment has been {{ flag }}ed"

      rescue:

        # Any failure in the block above marks the experiment as failed.
        - set_fact:
            flag: "Fail"

        - name: "[Result]: Getting the final result of {{ c_experiment }} experiment"
          debug:
            msg: "{{ c_experiment }} experiment has been {{ flag }}ed"

      always:

        ## Getting failure step from experiment-pod
        - include_tasks: /utils/runtime/getting_failure_step.yml

        ## RECORD END-OF-TEST IN LITMUSCHAOS RESULT CR
        - name: "[The End]: Updating the chaos result of {{ c_experiment }} experiment (EOT)"
          include_tasks: /utils/runtime/update_chaos_result_resource.yml
          vars:
            status: 'EOT'
            namespace: "{{ c_ns }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
# Renders the chaosutil.j2 template into /tmp/chaosutil.yml.
- name: "[PreReq] Identify the chaos util for {{ c_experiment }} experiment"
  template:
    dest: /tmp/chaosutil.yml
    src: chaosutil.j2
Loading

0 comments on commit 57f88a5

Please sign in to comment.