Skip to content

Commit

Permalink
chore(exp): Add pod autoscaler experiment used to check the scalabili…
Browse files Browse the repository at this point in the history
…ty of the application pod (#65)

* chore(exp): Add pod autoscaler experiment used to check the scalability of the application pod

Signed-off-by: Udit Gaurav <uditgaurav@gmail.com>

* Adding abort in the experiment

Signed-off-by: Udit Gaurav <udit.gaurav@mayadata.io>
  • Loading branch information
uditgaurav committed Aug 15, 2020
1 parent 55b0a6f commit 9b866d7
Show file tree
Hide file tree
Showing 17 changed files with 1,160 additions and 5 deletions.
2 changes: 2 additions & 0 deletions build/generate_go_binary
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,7 @@ go build -o build/_output/node-cpu-hog ./experiments/generic/node-cpu-hog
go build -o build/_output/container-kill ./experiments/generic/container-kill
# Building go binaries for disk_fill experiment
go build -o build/_output/disk-fill ./experiments/generic/disk-fill
# Building go binaries for pod-autoscaler experiment
go build -o build/_output/pod-autoscaler ./experiments/generic/pod-autoscaler
# Building go binaries for container_kill helper
go build -o build/_output/container-killer ./chaoslib/litmus/container_kill/helper
215 changes: 215 additions & 0 deletions chaoslib/litmus/pod_autoscaler/pod-autoscaler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
package pod_autoscaler

import (
"strconv"
"time"

clients "github.com/litmuschaos/litmus-go/pkg/clients"
experimentTypes "github.com/litmuschaos/litmus-go/pkg/generic/pod-autoscaler/types"
"github.com/litmuschaos/litmus-go/pkg/log"
"github.com/litmuschaos/litmus-go/pkg/types"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
retries "k8s.io/client-go/util/retry"

"github.com/pkg/errors"
)

var err error

//PreparePodAutoscaler contains the prepration steps before chaos injection
func PreparePodAutoscaler(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

appName, replicaCount, err := GetApplicationDetails(experimentsDetails, clients)
if err != nil {
return errors.Errorf("Unable to get the relicaCount of the application, err: %v", err)
}

//Waiting for the ramp time before chaos injection
if experimentsDetails.RampTime != 0 {
log.Infof("[Ramp]: Waiting for the %vs ramp time before injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
waitForRampTime(experimentsDetails)
}

err = PodAutoscalerChaos(experimentsDetails, clients, replicaCount, appName, resultDetails, eventsDetails, chaosDetails)

if err != nil {
return errors.Errorf("Unable to perform autoscaling, due to %v", err)
}

err = AutoscalerRecovery(experimentsDetails, clients, replicaCount, appName)
if err != nil {
return errors.Errorf("Unable to perform autoscaling, due to %v", err)
}

//Waiting for the ramp time after chaos injection
if experimentsDetails.RampTime != 0 {
log.Infof("[Ramp]: Waiting for the %vs ramp time after injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
waitForRampTime(experimentsDetails)
}
return nil
}

//waitForRampTime waits for the given ramp time duration (in seconds)
func waitForRampTime(experimentsDetails *experimentTypes.ExperimentDetails) {
time.Sleep(time.Duration(experimentsDetails.RampTime) * time.Second)
}

//GetApplicationDetails is used to get the application name, replicas of the application
func GetApplicationDetails(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets) (string, int, error) {

var appReplica int
var appName string
// Get Deployment replica count
applicationList, err := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS).List(metav1.ListOptions{LabelSelector: experimentsDetails.AppLabel})
if err != nil || len(applicationList.Items) == 0 {
return "", 0, errors.Errorf("Unable to get application, err: %v", err)
}
for _, app := range applicationList.Items {
appReplica = int(*app.Spec.Replicas)
appName = app.Name

}
return appName, appReplica, nil

}

//PodAutoscalerChaos scales up the application pod replicas
func PodAutoscalerChaos(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, replicaCount int, appName string, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS)

replicas := int32(experimentsDetails.Replicas)
// Scale Application
retryErr := retries.RetryOnConflict(retries.DefaultRetry, func() error {
// Retrieve the latest version of Deployment before attempting update
// RetryOnConflict uses exponential backoff to avoid exhausting the apiserver
appUnderTest, err := applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Failed to get latest version of Application Deployment: %v", err)
}

appUnderTest.Spec.Replicas = int32Ptr(replicas) // modify replica count
_, updateErr := applicationClient.Update(appUnderTest)
return updateErr
})
if retryErr != nil {
return errors.Errorf("Unable to scale the application, due to: %v", retryErr)
}
log.Info("Application Started Scaling")

err = ApplicationPodStatusCheck(experimentsDetails, appName, clients, replicaCount, resultDetails, eventsDetails, chaosDetails)
if err != nil {
return errors.Errorf("Status Check failed, err: %v", err)
}

return nil
}

// ApplicationPodStatusCheck checks the status of the application pod
func ApplicationPodStatusCheck(experimentsDetails *experimentTypes.ExperimentDetails, appName string, clients clients.ClientSets, replicaCount int, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

//ChaosStartTimeStamp contains the start timestamp, when the chaos injection begin
ChaosStartTimeStamp := time.Now().Unix()
failFlag := false
applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS)
applicationDeploy, err := applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Unable to get the application, err: %v", err)
}
for count := 0; count < int(experimentsDetails.ChaosDuration/2); count++ {

if int(applicationDeploy.Status.AvailableReplicas) != experimentsDetails.Replicas {

log.Infof("Application Pod Avaliable Count is: %s", strconv.Itoa(int(applicationDeploy.Status.AvailableReplicas)))
applicationDeploy, err = applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Unable to get the application, err: %v", err)
}

time.Sleep(2 * time.Second)
//ChaosCurrentTimeStamp contains the current timestamp
ChaosCurrentTimeStamp := time.Now().Unix()

//ChaosDiffTimeStamp contains the difference of current timestamp and start timestamp
//It will helpful to track the total chaos duration
chaosDiffTimeStamp := ChaosCurrentTimeStamp - ChaosStartTimeStamp
if int(chaosDiffTimeStamp) >= experimentsDetails.ChaosDuration {
failFlag = true
break
}

} else {
break
}
}
if failFlag == true {
err = AutoscalerRecovery(experimentsDetails, clients, replicaCount, appName)
if err != nil {
return errors.Errorf("Unable to perform autoscaling, due to %v", err)
}
return errors.Errorf("Application pod fails to come in running state after Chaos Duration of %d sec", experimentsDetails.ChaosDuration)
}
// Keeping a wait time of 10s after all pod comes in running state
// This is optional and used just for viewing the pod status
time.Sleep(10 * time.Second)

return nil
}

//AutoscalerRecovery scale back to initial number of replica
func AutoscalerRecovery(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, replicaCount int, appName string) error {

applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.ChaosNamespace)

// Scale back to initial number of replicas
retryErr := retries.RetryOnConflict(retries.DefaultRetry, func() error {
// Retrieve the latest version of Deployment before attempting update
// RetryOnConflict uses exponential backoff to avoid exhausting the apiserver
appUnderTest, err := applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Failed to get latest version of Application Deployment: %v", err)
}

appUnderTest.Spec.Replicas = int32Ptr(int32(replicaCount)) // modify replica count
_, updateErr := applicationClient.Update(appUnderTest)
return updateErr
})
if retryErr != nil {
return errors.Errorf("Unable to scale the, due to: %v", retryErr)
}
log.Info("[Info]: Application pod started rolling back")

applicationDeploy, err := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS).Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Unable to get the application, err: %v", err)
}

failFlag := false
// Check for 30 retries with 2secs of delay
for count := 0; count < 30; count++ {

if int(applicationDeploy.Status.AvailableReplicas) != replicaCount {

applicationDeploy, err = applicationClient.Get(appName, metav1.GetOptions{})
if err != nil {
return errors.Errorf("Unable to get the application, err: %v", err)
}
time.Sleep(2 * time.Second)
if count == 30 {
failFlag = true
break
}

} else {
break
}
}
if failFlag == true {
return errors.Errorf("Application fails to roll back")
}
log.Info("[RollBack]: Application Pod roll back to initial number of replicas")

return nil
}

func int32Ptr(i int32) *int32 { return &i }
2 changes: 1 addition & 1 deletion chaoslib/litmus/pod_cpu_hog/pod-cpu-hog.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ func ExperimentCPU(experimentsDetails *experimentTypes.ExperimentDetails, client
}
// updating the chaosresult after stopped
failStep := "CPU hog Chaos injection stopped!"
types.SetResultAfterCompletion(resultDetails, "Fail", "Stopped", failStep)
types.SetResultAfterCompletion(resultDetails, "Stopped", "Stopped", failStep)
result.ChaosResult(chaosDetails, clients, resultDetails, "EOT")

// generating summary event in chaosengine
Expand Down
2 changes: 1 addition & 1 deletion chaoslib/litmus/pod_memory_hog/pod-memory-hog.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func ExperimentMemory(experimentsDetails *experimentTypes.ExperimentDetails, cli
}
// updating the chaosresult after stopped
failStep := "Memory hog Chaos injection stopped!"
types.SetResultAfterCompletion(resultDetails, "Fail", "Stopped", failStep)
types.SetResultAfterCompletion(resultDetails, "Stopped", "Stopped", failStep)
result.ChaosResult(chaosDetails, clients, resultDetails, "EOT")

// generating summary event in chaosengine
Expand Down
14 changes: 14 additions & 0 deletions experiments/generic/pod-autoscaler/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
## Experiment Metadata

<table>
<tr>
<th> Name </th>
<th> Description </th>
<th> Documentation Link </th>
</tr>
<tr>
<td> Pod Autoscaler</td>
<td> Scale the deployment replicas to check the autoscaling capability. </td>
<td> <a href="https://docs.litmuschaos.io/docs/pod-autoscaler/"> Here </a> </td>
</tr>
</table>
134 changes: 134 additions & 0 deletions experiments/generic/pod-autoscaler/pod-autoscaler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package main

import (
"github.com/litmuschaos/litmus-go/chaoslib/litmus/pod_autoscaler"
clients "github.com/litmuschaos/litmus-go/pkg/clients"
"github.com/litmuschaos/litmus-go/pkg/events"
experimentEnv "github.com/litmuschaos/litmus-go/pkg/generic/pod-autoscaler/environment"
experimentTypes "github.com/litmuschaos/litmus-go/pkg/generic/pod-autoscaler/types"
"github.com/litmuschaos/litmus-go/pkg/log"
"github.com/litmuschaos/litmus-go/pkg/result"
"github.com/litmuschaos/litmus-go/pkg/status"
"github.com/litmuschaos/litmus-go/pkg/types"
"github.com/sirupsen/logrus"
)

func init() {
// Log as JSON instead of the default ASCII formatter.
logrus.SetFormatter(&logrus.TextFormatter{
FullTimestamp: true,
DisableSorting: true,
DisableLevelTruncation: true,
})
}

func main() {

var err error
experimentsDetails := experimentTypes.ExperimentDetails{}
resultDetails := types.ResultDetails{}
eventsDetails := types.EventDetails{}
clients := clients.ClientSets{}
chaosDetails := types.ChaosDetails{}

//Getting kubeConfig and Generate ClientSets
if err := clients.GenerateClientSetFromKubeConfig(); err != nil {
log.Fatalf("Unable to Get the kubeconfig due to %v", err)
}

//Fetching all the ENV passed from the runner pod
log.Infof("[PreReq]: Getting the ENV for the %v experiment", experimentsDetails.ExperimentName)
experimentEnv.GetENV(&experimentsDetails, "pod-autoscaler")

// Intialise the chaos attributes
experimentEnv.InitialiseChaosVariables(&chaosDetails, &experimentsDetails)

// Intialise Chaos Result Parameters
types.SetResultAttributes(&resultDetails, chaosDetails)

//Updating the chaos result in the beginning of experiment
log.Infof("[PreReq]: Updating the chaos result of %v experiment (SOT)", experimentsDetails.ExperimentName)
err = result.ChaosResult(&chaosDetails, clients, &resultDetails, "SOT")
if err != nil {
log.Errorf("Unable to Create the Chaos Result due to %v", err)
failStep := "Updating the chaos result of pod-delete experiment (SOT)"
result.RecordAfterFailure(&chaosDetails, &resultDetails, failStep, clients, &eventsDetails)
return
}

// Set the chaos result uid
result.SetResultUID(&resultDetails, clients, &chaosDetails)

//DISPLAY THE APP INFORMATION
log.InfoWithValues("The application informations are as follows", logrus.Fields{
"Namespace": experimentsDetails.AppNS,
"Label": experimentsDetails.AppLabel,
"Ramp Time": experimentsDetails.RampTime,
})

//PRE-CHAOS APPLICATION STATUS CHECK
log.Info("[Status]: Verify that the AUT (Application Under Test) is running (pre-chaos)")
err = status.CheckApplicationStatus(experimentsDetails.AppNS, experimentsDetails.AppLabel, experimentsDetails.Timeout, experimentsDetails.Delay, clients)
if err != nil {
log.Errorf("Application status check failed due to %v\n", err)
failStep := "Verify that the AUT (Application Under Test) is running (pre-chaos)"
types.SetResultAfterCompletion(&resultDetails, "Fail", "Completed", failStep)
result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
return
}
if experimentsDetails.EngineName != "" {
types.SetEngineEventAttributes(&eventsDetails, types.PreChaosCheck, "AUT is Running successfully", "Normal", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
}

// Including the litmus lib for pod-autoscaler
if experimentsDetails.ChaosLib == "litmus" {
err = pod_autoscaler.PreparePodAutoscaler(&experimentsDetails, clients, &resultDetails, &eventsDetails, &chaosDetails)
if err != nil {
log.Errorf("Chaos injection failed due to %v\n", err)
failStep := "Including the litmus lib for pod-autoscaler"
types.SetResultAfterCompletion(&resultDetails, "Fail", "Completed", failStep)
result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
return
}
log.Info("[Confirmation]: The application pod autoscaler completed successfully")
resultDetails.Verdict = "Pass"
} else {
log.Error("[Invalid]: Please Provide the correct LIB")
failStep := "Including the litmus lib for pod-autoscaler"
types.SetResultAfterCompletion(&resultDetails, "Fail", "Completed", failStep)
result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
return
}

//POST-CHAOS APPLICATION STATUS CHECK
log.Info("[Status]: Verify that the AUT (Application Under Test) is running (post-chaos)")
err = status.CheckApplicationStatus(experimentsDetails.AppNS, experimentsDetails.AppLabel, experimentsDetails.Timeout, experimentsDetails.Delay, clients)
if err != nil {
log.Errorf("Application status check failed due to %v\n", err)
failStep := "Verify that the AUT (Application Under Test) is running (post-chaos)"
types.SetResultAfterCompletion(&resultDetails, "Fail", "Completed", failStep)
result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
return
}
if experimentsDetails.EngineName != "" {
types.SetEngineEventAttributes(&eventsDetails, types.PostChaosCheck, "AUT is Running successfully", "Normal", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
}

//Updating the chaosResult in the end of experiment
log.Infof("[The End]: Updating the chaos result of %v experiment (EOT)", experimentsDetails.ExperimentName)
err = result.ChaosResult(&chaosDetails, clients, &resultDetails, "EOT")
if err != nil {
log.Fatalf("Unable to Update the Chaos Result due to %v\n", err)
}
if experimentsDetails.EngineName != "" {
msg := experimentsDetails.ExperimentName + " experiment has been " + resultDetails.Verdict + "ed"
types.SetEngineEventAttributes(&eventsDetails, types.Summary, msg, "Normal", &chaosDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosEngine")
}

msg := experimentsDetails.ExperimentName + " experiment has been " + resultDetails.Verdict + "ed"
types.SetResultEventAttributes(&eventsDetails, types.Summary, msg, "Normal", &resultDetails)
events.GenerateEvents(&eventsDetails, clients, &chaosDetails, "ChaosResult")
}
Loading

0 comments on commit 9b866d7

Please sign in to comment.