Skip to content

Commit

Permalink
chore(exp): Add pod autoscaler experiment used to check the scalabili…
Browse files Browse the repository at this point in the history
…ty of the application pod (#65)

* chore(exp): Add pod autoscaler experiment used to check the scalability of the application pod

Signed-off-by: Udit Gaurav <uditgaurav@gmail.com>

* Adding abort in the experiment

Signed-off-by: Udit Gaurav <udit.gaurav@mayadata.io>
  • Loading branch information
uditgaurav committed Aug 15, 2020
1 parent 55b0a6f commit 9b866d7
Show file tree
Hide file tree
Showing 17 changed files with 1,160 additions and 5 deletions.
2 changes: 2 additions & 0 deletions build/generate_go_binary
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,7 @@ go build -o build/_output/node-cpu-hog ./experiments/generic/node-cpu-hog
go build -o build/_output/container-kill ./experiments/generic/container-kill
# Building go binaries for disk_fill experiment
go build -o build/_output/disk-fill ./experiments/generic/disk-fill
# Building go binaries for pod-autoscaler experiment
go build -o build/_output/pod-autoscaler ./experiments/generic/pod-autoscaler
# Building go binaries for container_kill helper
go build -o build/_output/container-killer ./chaoslib/litmus/container_kill/helper
215 changes: 215 additions & 0 deletions chaoslib/litmus/pod_autoscaler/pod-autoscaler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
package pod_autoscaler

import (
"strconv"
"time"

clients "github.com/litmuschaos/litmus-go/pkg/clients"
experimentTypes "github.com/litmuschaos/litmus-go/pkg/generic/pod-autoscaler/types"
"github.com/litmuschaos/litmus-go/pkg/log"
"github.com/litmuschaos/litmus-go/pkg/types"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
retries "k8s.io/client-go/util/retry"

"github.com/pkg/errors"
)

var err error

//PreparePodAutoscaler contains the prepration steps before chaos injection
func PreparePodAutoscaler(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

appName, replicaCount, err := GetApplicationDetails(experimentsDetails, clients)
if err != nil {
return errors.Errorf("Unable to get the relicaCount of the application, err: %v", err)
}

//Waiting for the ramp time before chaos injection
if experimentsDetails.RampTime != 0 {
log.Infof("[Ramp]: Waiting for the %vs ramp time before injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
waitForRampTime(experimentsDetails)
}

err = PodAutoscalerChaos(experimentsDetails, clients, replicaCount, appName, resultDetails, eventsDetails, chaosDetails)

if err != nil {
return errors.Errorf("Unable to perform autoscaling, due to %v", err)
}

err = AutoscalerRecovery(experimentsDetails, clients, replicaCount, appName)
if err != nil {
return errors.Errorf("Unable to perform autoscaling, due to %v", err)
}

//Waiting for the ramp time after chaos injection
if experimentsDetails.RampTime != 0 {
log.Infof("[Ramp]: Waiting for the %vs ramp time after injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
waitForRampTime(experimentsDetails)
}
return nil
}

//waitForRampTime waits for the given ramp time duration (in seconds)
func waitForRampTime(experimentsDetails *experimentTypes.ExperimentDetails) {
time.Sleep(time.Duration(experimentsDetails.RampTime) * time.Second)
}

//GetApplicationDetails is used to get the application name, replicas of the application
func GetApplicationDetails(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets) (string, int, error) {

var appReplica int
var appName string
// Get Deployment replica count
applicationList, err := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS).List(metav1.ListOptions{LabelSelector: experimentsDetails.AppLabel})
if err != nil || len(applicationList.Items) == 0 {
return "", 0, errors.Errorf("Unable to get application, err: %v", err)
}
for _, app := range applicationList.Items {
appReplica = int(*app.Spec.Replicas)
appName = app.Name

}
return appName, appReplica, nil

}

//PodAutoscalerChaos scales up the application pod replicas
func PodAutoscalerChaos(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, replicaCount int, appName string, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS)

replicas := int32(experimentsDetails.Replicas)
// Scale Application
retryErr := retries.RetryOnConflict(retries.DefaultRetry, func() error {
// Retrieve the latest version of Deployment before attempting update
// RetryOnConflict uses exponential backoff to avoid exhausting the apiserver
appUnderTest, err := applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Failed to get latest version of Application Deployment: %v", err)
}

appUnderTest.Spec.Replicas = int32Ptr(replicas) // modify replica count
_, updateErr := applicationClient.Update(appUnderTest)
return updateErr
})
if retryErr != nil {
return errors.Errorf("Unable to scale the application, due to: %v", retryErr)
}
log.Info("Application Started Scaling")

err = ApplicationPodStatusCheck(experimentsDetails, appName, clients, replicaCount, resultDetails, eventsDetails, chaosDetails)
if err != nil {
return errors.Errorf("Status Check failed, err: %v", err)
}

return nil
}

// ApplicationPodStatusCheck checks the status of the application pod
func ApplicationPodStatusCheck(experimentsDetails *experimentTypes.ExperimentDetails, appName string, clients clients.ClientSets, replicaCount int, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

//ChaosStartTimeStamp contains the start timestamp, when the chaos injection begin
ChaosStartTimeStamp := time.Now().Unix()
failFlag := false
applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS)
applicationDeploy, err := applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Unable to get the application, err: %v", err)
}
for count := 0; count < int(experimentsDetails.ChaosDuration/2); count++ {

if int(applicationDeploy.Status.AvailableReplicas) != experimentsDetails.Replicas {

log.Infof("Application Pod Avaliable Count is: %s", strconv.Itoa(int(applicationDeploy.Status.AvailableReplicas)))
applicationDeploy, err = applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Unable to get the application, err: %v", err)
}

time.Sleep(2 * time.Second)
//ChaosCurrentTimeStamp contains the current timestamp
ChaosCurrentTimeStamp := time.Now().Unix()

//ChaosDiffTimeStamp contains the difference of current timestamp and start timestamp
//It will helpful to track the total chaos duration
chaosDiffTimeStamp := ChaosCurrentTimeStamp - ChaosStartTimeStamp
if int(chaosDiffTimeStamp) >= experimentsDetails.ChaosDuration {
failFlag = true
break
}

} else {
break
}
}
if failFlag == true {
err = AutoscalerRecovery(experimentsDetails, clients, replicaCount, appName)
if err != nil {
return errors.Errorf("Unable to perform autoscaling, due to %v", err)
}
return errors.Errorf("Application pod fails to come in running state after Chaos Duration of %d sec", experimentsDetails.ChaosDuration)
}
// Keeping a wait time of 10s after all pod comes in running state
// This is optional and used just for viewing the pod status
time.Sleep(10 * time.Second)

return nil
}

//AutoscalerRecovery scale back to initial number of replica
func AutoscalerRecovery(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, replicaCount int, appName string) error {

applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.ChaosNamespace)

// Scale back to initial number of replicas
retryErr := retries.RetryOnConflict(retries.DefaultRetry, func() error {
// Retrieve the latest version of Deployment before attempting update
// RetryOnConflict uses exponential backoff to avoid exhausting the apiserver
appUnderTest, err := applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Failed to get latest version of Application Deployment: %v", err)
}

appUnderTest.Spec.Replicas = int32Ptr(int32(replicaCount)) // modify replica count
_, updateErr := applicationClient.Update(appUnderTest)
return updateErr
})
if retryErr != nil {
return errors.Errorf("Unable to scale the, due to: %v", retryErr)
}
log.Info("[Info]: Application pod started rolling back")

applicationDeploy, err := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS).Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Unable to get the application, err: %v", err)
}

failFlag := false
// Check for 30 retries with 2secs of delay
for count := 0; count < 30; count++ {

if int(applicationDeploy.Status.AvailableReplicas) != replicaCount {

applicationDeploy, err = applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Unable to get the application, err: %v", err)
}
time.Sleep(2 * time.Second)
if count == 30 {
failFlag = true
break
}

} else {
break
}
}
if failFlag == true {
return errors.Errorf("Application fails to roll back")
}
log.Info("[RollBack]: Application Pod roll back to initial number of replicas")

return nil
}

func int32Ptr(i int32) *int32 { return &i }
2 changes: 1 addition & 1 deletion chaoslib/litmus/pod_cpu_hog/pod-cpu-hog.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ func ExperimentCPU(experimentsDetails *experimentTypes.ExperimentDetails, client
}
// updating the chaosresult after stopped
failStep := "CPU hog Chaos injection stopped!"
types.SetResultAfterCompletion(resultDetails, "Fail", "Stopped", failStep)
types.SetResultAfterCompletion(resultDetails, "Stopped", "Stopped", failStep)
result.ChaosResult(chaosDetails, clients, resultDetails, "EOT")

// generating summary event in chaosengine
Expand Down
2 changes: 1 addition & 1 deletion chaoslib/litmus/pod_memory_hog/pod-memory-hog.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func ExperimentMemory(experimentsDetails *experimentTypes.ExperimentDetails, cli
}
// updating the chaosresult after stopped
failStep := "Memory hog Chaos injection stopped!"
types.SetResultAfterCompletion(resultDetails, "Fail", "Stopped", failStep)
types.SetResultAfterCompletion(resultDetails, "Stopped", "Stopped", failStep)
result.ChaosResult(chaosDetails, clients, resultDetails, "EOT")

// generating summary event in chaosengine
Expand Down
14 changes: 14 additions & 0 deletions experiments/generic/pod-autoscaler/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
## Experiment Metadata

<table>
<tr>
<th> Name </th>
<th> Description </th>
<th> Documentation Link </th>
</tr>
<tr>
<td> Pod Autoscaler</td>
<td> Scale the deployment replicas to check the autoscaling capability. </td>
<td> <a href="https://docs.litmuschaos.io/docs/pod-autoscaler/"> Here </a> </td>
</tr>
</table>
134 changes: 134 additions & 0 deletions experiments/generic/pod-autoscaler/pod-autoscaler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package main

import (
"github.com/litmuschaos/litmus-go/chaoslib/litmus/pod_autoscaler"
clients "github.com/litmuschaos/litmus-go/pkg/clients"
"github.com/litmuschaos/litmus-go/pkg/events"
experimentEnv "github.com/litmuschaos/litmus-go/pkg/generic/pod-autoscaler/environment"
experimentTypes "github.com/litmuschaos/litmus-go/pkg/generic/pod-autoscaler/types"
"github.com/litmuschaos/litmus-go/pkg/log"
"github.com/litmuschaos/litmus-go/pkg/result"
"github.com/litmuschaos/litmus-go/pkg/status"
"github.com/litmuschaos/litmus-go/pkg/types"
"github.com/sirupsen/logrus"
)

func init() {
// Log as JSON instead of the default ASCII formatter.
logrus.SetFormatter(&logrus.TextFormatter{
FullTimestamp: true,
DisableSorting: true,
DisableLevelTruncation: true,
})
}

func main() {

var err error
experimentsDetails := experimentTypes.ExperimentDetails{}
resultDetails := types.ResultDetails{}
eventsDetails := types.EventDetails{}
clients := clients.ClientSets{}
chaosDetails := types.ChaosDetails{}

//Getting kubeConfig and Generate ClientSets
if err := clients.GenerateClientSetFromKubeConfig(); err != nil {
log.Fatalf("Unable to Get the kubeconfig due to %v", err)
}

//Fetching all the ENV passed from the runner pod
log.Infof("[PreReq]: Getting the ENV for the %v experiment", experimentsDetails.ExperimentName)
experimentEnv.GetENV(&experimentsDetails, "pod-autoscaler")

// Intialise the chaos attributes
experimentEnv.InitialiseChaosVariables(&chaosDetails, &experimentsDetails)

// Intialise Chaos Result Parameters
types.SetResultAttributes(&resultDetails, chaosDetails)

//Updating the chaos result in the beginning of experiment
log.Infof("[PreReq]: Updating the chaos result of %v experiment (SOT)", experimentsDetails.ExperimentName)
err = result.ChaosResult(&chaosDetails, clients, &resultDetails, "SOT")
if err != nil {
log.Errorf("Unable to Create the Chaos Result due to %v", err)
failStep := "Updating the chaos result of pod-delete experiment (SOT)"
result.RecordAfterFailure(&chaosDetails, &resultDetails, failStep, clients, &eventsDetails)
return
}

// Set the chaos result uid
result.SetResultUID(&resultDetails, clients, &chaosDetails)

//DISPLAY THE APP INFORMATION
log.InfoWithValues("The application informations are as follows", logrus.Fields{
"Namespace": experimentsDetails.AppNS,
"Label": experimentsDetails.AppLabel,
"Ramp Time": experimentsDetails.RampTime,
})

//PRE-CHAOS APPLICATION STATUS CHECK
log.Info("[Status]: Verify that the AUT (Application Under Test) is running (pre-chaos)")
err = status.CheckApplicationStatus(experimentsDetails.AppNS, experimentsDetails.AppLabel, experimentsDetails.Timeout, experimentsDetails.Delay, clients)
if err != nil {
log.Errorf("Application status check failed due to %v\n", err)
failStep := "Verify that the AUT (Application Under Test) is running (pre-chaos)"
types.SetResultAfterCompletion(&resultDetails, "Fail", "Completed", failStep)
result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
return
}
if experimentsDetails.EngineName != "" {
types.SetEngineEventAttributes(&eventsDetails, types.PreChaosCheck, "AUT is Running successfully", "Normal", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
}

// Including the litmus lib for pod-autoscaler
if experimentsDetails.ChaosLib == "litmus" {
err = pod_autoscaler.PreparePodAutoscaler(&experimentsDetails, clients, &resultDetails, &eventsDetails, &chaosDetails)
if err != nil {
log.Errorf("Chaos injection failed due to %v\n", err)
failStep := "Including the litmus lib for pod-autoscaler"
types.SetResultAfterCompletion(&resultDetails, "Fail", "Completed", failStep)
result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
return
}
log.Info("[Confirmation]: The application pod autoscaler completed successfully")
resultDetails.Verdict = "Pass"
} else {
log.Error("[Invalid]: Please Provide the correct LIB")
failStep := "Including the litmus lib for pod-autoscaler"
types.SetResultAfterCompletion(&resultDetails, "Fail", "Completed", failStep)
result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
return
}

//POST-CHAOS APPLICATION STATUS CHECK
log.Info("[Status]: Verify that the AUT (Application Under Test) is running (post-chaos)")
err = status.CheckApplicationStatus(experimentsDetails.AppNS, experimentsDetails.AppLabel, experimentsDetails.Timeout, experimentsDetails.Delay, clients)
if err != nil {
log.Errorf("Application status check failed due to %v\n", err)
failStep := "Verify that the AUT (Application Under Test) is running (post-chaos)"
types.SetResultAfterCompletion(&resultDetails, "Fail", "Completed", failStep)
result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
return
}
if experimentsDetails.EngineName != "" {
types.SetEngineEventAttributes(&eventsDetails, types.PostChaosCheck, "AUT is Running successfully", "Normal", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
}

//Updating the chaosResult in the end of experiment
log.Infof("[The End]: Updating the chaos result of %v experiment (EOT)", experimentsDetails.ExperimentName)
err = result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
if err != nil {
log.Fatalf("Unable to Update the Chaos Result due to %v\n", err)
}
if experimentsDetails.EngineName != "" {
msg := experimentsDetails.ExperimentName + " experiment has been " + resultDetails.Verdict + "ed"
types.SetEngineEventAttributes(&eventsDetails, types.Summary, msg, "Normal", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
}

msg := experimentsDetails.ExperimentName + " experiment has been " + resultDetails.Verdict + "ed"
types.SetResultEventAttributes(&eventsDetails, types.Summary, msg, "Normal", &resultDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosResult")
}
Loading

0 comments on commit 9b866d7

Please sign in to comment.