chore(exp): Add pod autoscaler experiment used to check the scalability of the application pod #65

Merged · 5 commits · Aug 15, 2020
Changes from 4 commits
2 changes: 2 additions & 0 deletions build/generate_go_binary
@@ -26,5 +26,7 @@ go build -o build/_output/node-cpu-hog ./experiments/generic/node-cpu-hog
go build -o build/_output/container-kill ./experiments/generic/container-kill
# Building go binaries for disk_fill experiment
go build -o build/_output/disk-fill ./experiments/generic/disk-fill
# Building go binaries for pod-autoscaler experiment
go build -o build/_output/pod-autoscaler ./experiments/generic/pod-autoscaler
# Building go binaries for container_kill helper
go build -o build/_output/container-killer ./chaoslib/litmus/container_kill/helper
267 changes: 267 additions & 0 deletions chaoslib/litmus/pod_autoscaler/pod-autoscaler.go
@@ -0,0 +1,267 @@
package pod_autoscaler

import (
"os"
"os/signal"
"strconv"
"syscall"
"time"

clients "github.com/litmuschaos/litmus-go/pkg/clients"
"github.com/litmuschaos/litmus-go/pkg/events"
experimentTypes "github.com/litmuschaos/litmus-go/pkg/generic/pod-autoscaler/types"
"github.com/litmuschaos/litmus-go/pkg/log"
"github.com/litmuschaos/litmus-go/pkg/result"
"github.com/litmuschaos/litmus-go/pkg/types"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
retries "k8s.io/client-go/util/retry"

"github.com/pkg/errors"
)

var err error

//PreparePodAutoscaler contains the preparation steps before chaos injection
func PreparePodAutoscaler(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

    appName, replicaCount, err := GetApplicationDetails(experimentsDetails, clients)
    if err != nil {
        return errors.Errorf("Unable to get the replicaCount of the application, err: %v", err)
    }

    //Waiting for the ramp time before chaos injection
    if experimentsDetails.RampTime != 0 {
        log.Infof("[Ramp]: Waiting for the %vs ramp time before injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
        waitForRampTime(experimentsDetails)
    }
    if err != nil {
        return errors.Errorf("Unable to get the serviceAccountName, err: %v", err)
    }

    err = PodAutoscalerChaos(experimentsDetails, clients, replicaCount, appName, resultDetails, eventsDetails, chaosDetails)
    if err != nil {
        return errors.Errorf("Unable to perform autoscaling, due to %v", err)
    }

    err = AutoscalerReovery(experimentsDetails, clients, replicaCount, appName)
    if err != nil {
        return errors.Errorf("Unable to perform autoscaling, due to %v", err)
    }

    //Waiting for the ramp time after chaos injection
    if experimentsDetails.RampTime != 0 {
        log.Infof("[Ramp]: Waiting for the %vs ramp time after injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
        waitForRampTime(experimentsDetails)
    }
    return nil
}

//waitForRampTime waits for the given ramp time duration (in seconds)
func waitForRampTime(experimentsDetails *experimentTypes.ExperimentDetails) {
    time.Sleep(time.Duration(experimentsDetails.RampTime) * time.Second)
}

//GetApplicationDetails is used to get the application name and replica count of the application
func GetApplicationDetails(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets) (string, int, error) {

    var appReplica int
    var appName string
    applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.ChaosNamespace)

    // Get Deployment replica count
    applicationList, err := applicationClient.List(metav1.ListOptions{})
    if err != nil {
        return "", 0, errors.Errorf("Unable to get application, err: %v", err)
    }
    for _, app := range applicationList.Items {
        appReplica = int(*app.Spec.Replicas)
        appName = app.Name
    }
    return appName, appReplica, nil
}

//PodAutoscalerChaos scales up the application pod replicas
func PodAutoscalerChaos(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, replicaCount int, appName string, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

    applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.ChaosNamespace)

    replicas := int32(experimentsDetails.Replicas)
    // Scale Application
    retryErr := retries.RetryOnConflict(retries.DefaultRetry, func() error {
        // Retrieve the latest version of Deployment before attempting update
        // RetryOnConflict uses exponential backoff to avoid exhausting the apiserver
        result, err := applicationClient.Get(appName, metav1.GetOptions{})
        if err != nil {
            return errors.Errorf("Failed to get latest version of Application Deployment: %v", err)
        }

        result.Spec.Replicas = int32Ptr(replicas) // modify replica count
        _, updateErr := applicationClient.Update(result)
        return updateErr
    })
    if retryErr != nil {
        return errors.Errorf("Unable to scale the application, due to: %v", retryErr)
    }
    log.Info("Application Started Scaling")

    err = ApplicationPodStatusCheck(experimentsDetails, appName, clients, replicaCount, resultDetails, eventsDetails, chaosDetails)
    if err != nil {
        return errors.Errorf("Status Check failed, err: %v", err)
    }

    return nil
}

// ApplicationPodStatusCheck checks the status of the application pod
func ApplicationPodStatusCheck(experimentsDetails *experimentTypes.ExperimentDetails, appName string, clients clients.ClientSets, replicaCount int, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

    //ChaosStartTimeStamp contains the start timestamp, when the chaos injection begins
    ChaosStartTimeStamp := time.Now().Unix()
    var endTime <-chan time.Time
    timeDelay := time.Duration(experimentsDetails.ChaosDuration) * time.Second
    failFlag := false
    applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS)
    applicationDeploy, err := applicationClient.Get(appName, metav1.GetOptions{})
    if err != nil {
        return errors.Errorf("Unable to get the application, err: %v", err)
    }
    for count := 0; count < int(experimentsDetails.ChaosDuration/2); count++ {

        if int(applicationDeploy.Status.AvailableReplicas) != experimentsDetails.Replicas {

            log.Infof("Application Pod Available Count is: %s", strconv.Itoa(int(applicationDeploy.Status.AvailableReplicas)))
            applicationDeploy, err = applicationClient.Get(appName, metav1.GetOptions{})
            if err != nil {
                return errors.Errorf("Unable to get the application, err: %v", err)
            }

            time.Sleep(2 * time.Second)
            //ChaosCurrentTimeStamp contains the current timestamp
            ChaosCurrentTimeStamp := time.Now().Unix()

            //ChaosDiffTimeStamp contains the difference of current timestamp and start timestamp
            //It is helpful to track the total chaos duration
            chaosDiffTimeStamp := ChaosCurrentTimeStamp - ChaosStartTimeStamp
            if int(chaosDiffTimeStamp) >= experimentsDetails.ChaosDuration {
                failFlag = true
                break
            }

            // signChan channel is used to transmit signal notifications.
            signChan := make(chan os.Signal, 1)
            // Catch and relay certain signal(s) to signChan channel.
            signal.Notify(signChan, os.Interrupt, syscall.SIGTERM, syscall.SIGKILL)
        loop:
            for {
                endTime = time.After(timeDelay)

                select {
                case <-signChan:
                    err = AutoscalerReovery(experimentsDetails, clients, replicaCount, appName)
                    if err != nil {
                        return errors.Errorf("Unable to perform autoscaling, due to %v", err)
                    }
                    // updating the chaosresult after the experiment is stopped
                    failStep := "Pod autoscaler chaos injection stopped!"
                    types.SetResultAfterCompletion(resultDetails, "Fail", "Stopped", failStep)
                    result.ChaosResult(chaosDetails, clients, resultDetails, "EOT")

                    // generating summary event in chaosengine
                    msg := experimentsDetails.ExperimentName + " experiment has been aborted"
                    types.SetEngineEventAttributes(eventsDetails, types.Summary, msg, "Warning", chaosDetails)
                    events.GenerateEvents(eventsDetails, clients, chaosDetails, "ChaosEngine")

                    // generating summary event in chaosresult
                    types.SetResultEventAttributes(eventsDetails, types.Summary, msg, "Warning", resultDetails)
                    events.GenerateEvents(eventsDetails, clients, chaosDetails, "ChaosResult")

                    os.Exit(1)
                case <-endTime:
                    log.Infof("[Chaos]: Time is up for experiment: %v", experimentsDetails.ExperimentName)
                    endTime = nil
                    break loop
                }
            }

            err = AutoscalerReovery(experimentsDetails, clients, replicaCount, appName)
            if err != nil {
                return errors.Errorf("Unable to perform autoscaling, due to %v", err)
            }
        } else {
            break
        }
    }
    if failFlag {
        err = AutoscalerReovery(experimentsDetails, clients, replicaCount, appName)
        if err != nil {
            return errors.Errorf("Unable to perform autoscaling, due to %v", err)
        }
        return errors.Errorf("Application pod failed to come into running state within the chaos duration of %d sec", experimentsDetails.ChaosDuration)
    }
    // Keeping a wait time of 10s after all pods come into running state
    // This is optional and used just for viewing the pod status
    time.Sleep(10 * time.Second)

    return nil
}

//AutoscalerReovery scales the application back to the initial number of replicas
func AutoscalerReovery(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, replicaCount int, appName string) error {

    applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.ChaosNamespace)

    // Scale back to initial number of replicas
    retryErr := retries.RetryOnConflict(retries.DefaultRetry, func() error {
        // Retrieve the latest version of Deployment before attempting update
        // RetryOnConflict uses exponential backoff to avoid exhausting the apiserver
        result, err := applicationClient.Get(appName, metav1.GetOptions{})
        if err != nil {
            return errors.Errorf("Failed to get latest version of Application Deployment: %v", err)
        }

        result.Spec.Replicas = int32Ptr(int32(replicaCount)) // modify replica count
        _, updateErr := applicationClient.Update(result)
        return updateErr
    })
    if retryErr != nil {
        return errors.Errorf("Unable to scale the application back, due to: %v", retryErr)
    }
    log.Info("[Info]: Application Started Scaling back")

    applicationDeploy, err := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS).Get(appName, metav1.GetOptions{})
    if err != nil {
        return errors.Errorf("Unable to get the application, err: %v", err)
    }
    failFlag := false
    // Check for 30 retries with 2s of delay
    for count := 0; count < 30; count++ {

        if int(applicationDeploy.Status.AvailableReplicas) != replicaCount {

            applicationDeploy, err = applicationClient.Get(appName, metav1.GetOptions{})
            if err != nil {
                return errors.Errorf("Unable to get the application, err: %v", err)
            }
            time.Sleep(2 * time.Second)
            // mark failure on the last retry
            if count == 29 {
                failFlag = true
                break
            }

        } else {
            break
        }
    }
    if failFlag {
        return errors.Errorf("Application failed to roll back to the initial number of replicas")
    }
    log.Info("[RollBack]: Application Pod rolled back to initial number of replicas")

    return nil
}

func int32Ptr(i int32) *int32 { return &i }
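
For context, the chaoslib above reads its tunables from an ExperimentDetails struct in pkg/generic/pod-autoscaler/types. The sketch below lists only the fields the code actually touches, with types inferred from usage; the real struct in the PR may define additional fields (for example the chaos lib name or app label).

```go
// Sketch of the fields used by pod-autoscaler.go above (inferred from usage,
// not the verbatim struct shipped in pkg/generic/pod-autoscaler/types).
package types

type ExperimentDetails struct {
    ExperimentName string // experiment name, e.g. "pod-autoscaler"
    ChaosNamespace string // namespace in which the chaos resources run
    AppNS          string // namespace of the application under test
    ChaosDuration  int    // TOTAL_CHAOS_DURATION, in seconds
    RampTime       int    // RAMP_TIME, in seconds
    Replicas       int    // REPLICA_COUNT to scale the deployment to
}
```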
69 changes: 69 additions & 0 deletions experiments/generic/pod-autoscaler/README.md
@@ -0,0 +1,69 @@
## Experiment CR for the pod-autoscaler experiment

```yaml
apiVersion: litmuschaos.io/v1alpha1
description:
  message: |
    Scale the deployment replicas to check the autoscaling capability
kind: ChaosExperiment
metadata:
  name: pod-autoscaler
  version: 0.1.0
spec:
  definition:
    scope: Namespaced
    permissions:
      - apiGroups:
          - ""
          - "apps"
          - "batch"
          - "litmuschaos.io"
        resources:
          - "deployments"
          - "jobs"
          - "pods"
          - "pods/log"
          - "events"
          - "configmaps"
          - "chaosengines"
          - "chaosexperiments"
          - "chaosresults"
        verbs:
          - "create"
          - "list"
          - "get"
          - "patch"
          - "update"
          - "delete"
      - apiGroups:
          - ""
        resources:
          - "nodes"
        verbs:
          - "get"
          - "list"
    image: "litmuschaos/go-runner:ci"
    imagePullPolicy: Always
    args:
      - -c
      - ./experiments/pod-autoscaler
    command:
      - /bin/bash
    env:

      - name: TOTAL_CHAOS_DURATION
        value: '60'

      - name: REPLICA_COUNT
        value: '5'

      # Period to wait before and after injection of chaos in sec
      - name: RAMP_TIME
        value: ''

      - name: LIB
        value: 'litmus'
    labels:
      name: pod-autoscaler
```
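
To run the experiment, a ChaosEngine resource typically references the ChaosExperiment above and overrides its env values. A minimal illustrative sketch follows; the engine name, namespace, app labels, and service account are placeholders and not part of this PR.

```yaml
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: nginx-chaos          # hypothetical engine name
  namespace: default         # hypothetical namespace
spec:
  appinfo:
    appns: 'default'         # namespace of the application under test
    applabel: 'app=nginx'    # placeholder label selector
    appkind: 'deployment'
  chaosServiceAccount: pod-autoscaler-sa   # placeholder service account
  experiments:
    - name: pod-autoscaler
      spec:
        components:
          env:
            - name: TOTAL_CHAOS_DURATION
              value: '60'
            - name: REPLICA_COUNT
              value: '5'
```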