chore(exp): Add pod autoscaler experiment used to check the scalability of the application pod #65

Merged · 5 commits · Aug 15, 2020
Changes from 4 commits
2 changes: 2 additions & 0 deletions build/generate_go_binary
@@ -26,5 +26,7 @@ go build -o build/_output/node-cpu-hog ./experiments/generic/node-cpu-hog
go build -o build/_output/container-kill ./experiments/generic/container-kill
# Building go binaries for disk_fill experiment
go build -o build/_output/disk-fill ./experiments/generic/disk-fill
# Building go binaries for pod-autoscaler experiment
go build -o build/_output/pod-autoscaler ./experiments/generic/pod-autoscaler
# Building go binaries for container_kill helper
go build -o build/_output/container-killer ./chaoslib/litmus/container_kill/helper
267 changes: 267 additions & 0 deletions chaoslib/litmus/pod_autoscaler/pod-autoscaler.go
@@ -0,0 +1,267 @@
package pod_autoscaler

import (
"os"
"os/signal"
"strconv"
"syscall"
"time"

clients "github.com/litmuschaos/litmus-go/pkg/clients"
"github.com/litmuschaos/litmus-go/pkg/events"
experimentTypes "github.com/litmuschaos/litmus-go/pkg/generic/pod-autoscaler/types"
"github.com/litmuschaos/litmus-go/pkg/log"
"github.com/litmuschaos/litmus-go/pkg/result"
"github.com/litmuschaos/litmus-go/pkg/types"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
retries "k8s.io/client-go/util/retry"

"github.com/pkg/errors"
)

var err error

//PreparePodAutoscaler contains the preparation steps before chaos injection
func PreparePodAutoscaler(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

    appName, replicaCount, err := GetApplicationDetails(experimentsDetails, clients)
    if err != nil {
        return errors.Errorf("Unable to get the replicaCount of the application, err: %v", err)
    }

    //Waiting for the ramp time before chaos injection
    if experimentsDetails.RampTime != 0 {
        log.Infof("[Ramp]: Waiting for the %vs ramp time before injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
        waitForRampTime(experimentsDetails)
    }
    if err != nil {
        return errors.Errorf("Unable to get the serviceAccountName, err: %v", err)
    }

    err = PodAutoscalerChaos(experimentsDetails, clients, replicaCount, appName, resultDetails, eventsDetails, chaosDetails)
    if err != nil {
        return errors.Errorf("Unable to perform autoscaling, due to %v", err)
    }

    err = AutoscalerReovery(experimentsDetails, clients, replicaCount, appName)
    if err != nil {
        return errors.Errorf("Unable to perform autoscaling, due to %v", err)
    }

    //Waiting for the ramp time after chaos injection
    if experimentsDetails.RampTime != 0 {
        log.Infof("[Ramp]: Waiting for the %vs ramp time after injecting chaos", strconv.Itoa(experimentsDetails.RampTime))
        waitForRampTime(experimentsDetails)
    }
    return nil
}

//waitForRampTime waits for the given ramp time duration (in seconds)
func waitForRampTime(experimentsDetails *experimentTypes.ExperimentDetails) {
    time.Sleep(time.Duration(experimentsDetails.RampTime) * time.Second)
}

//GetApplicationDetails is used to get the application name and replica count of the application
func GetApplicationDetails(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets) (string, int, error) {

    var appReplica int
    var appName string
    applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.ChaosNamespace)

    // Get Deployment replica count
    applicationList, err := applicationClient.List(metav1.ListOptions{})
    if err != nil {
        return "", 0, errors.Errorf("Unable to get application, err: %v", err)
    }
    for _, app := range applicationList.Items {
        appReplica = int(*app.Spec.Replicas)
        appName = app.Name
    }
    return appName, appReplica, nil
}

//PodAutoscalerChaos scales up the application pod replicas
func PodAutoscalerChaos(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, replicaCount int, appName string, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

    applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.ChaosNamespace)

    replicas := int32(experimentsDetails.Replicas)
    // Scale Application
    retryErr := retries.RetryOnConflict(retries.DefaultRetry, func() error {
        // Retrieve the latest version of Deployment before attempting update
        // RetryOnConflict uses exponential backoff to avoid exhausting the apiserver
        result, err := applicationClient.Get(appName, metav1.GetOptions{})
        if err != nil {
            return errors.Errorf("Failed to get latest version of Application Deployment: %v", err)
        }

        result.Spec.Replicas = int32Ptr(replicas) // modify replica count
        _, updateErr := applicationClient.Update(result)
        return updateErr
    })
    if retryErr != nil {
        return errors.Errorf("Unable to scale the application, due to: %v", retryErr)
    }
    log.Info("Application Started Scaling")

    err = ApplicationPodStatusCheck(experimentsDetails, appName, clients, replicaCount, resultDetails, eventsDetails, chaosDetails)
    if err != nil {
        return errors.Errorf("Status Check failed, err: %v", err)
    }

    return nil
}

// ApplicationPodStatusCheck checks the status of the application pod
func ApplicationPodStatusCheck(experimentsDetails *experimentTypes.ExperimentDetails, appName string, clients clients.ClientSets, replicaCount int, resultDetails *types.ResultDetails, eventsDetails *types.EventDetails, chaosDetails *types.ChaosDetails) error {

    //ChaosStartTimeStamp contains the start timestamp, when the chaos injection begins
    ChaosStartTimeStamp := time.Now().Unix()
    var endTime <-chan time.Time
    timeDelay := time.Duration(experimentsDetails.ChaosDuration) * time.Second
    failFlag := false
    applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS)
    applicationDeploy, err := applicationClient.Get(appName, metav1.GetOptions{})
    if err != nil {
        return errors.Errorf("Unable to get the application, err: %v", err)
    }
    for count := 0; count < int(experimentsDetails.ChaosDuration/2); count++ {

        if int(applicationDeploy.Status.AvailableReplicas) != experimentsDetails.Replicas {

            log.Infof("Application Pod Available Count is: %s", strconv.Itoa(int(applicationDeploy.Status.AvailableReplicas)))
            applicationDeploy, err = applicationClient.Get(appName, metav1.GetOptions{})
            if err != nil {
                return errors.Errorf("Unable to get the application, err: %v", err)
            }

            time.Sleep(2 * time.Second)
            //ChaosCurrentTimeStamp contains the current timestamp
            ChaosCurrentTimeStamp := time.Now().Unix()

            //ChaosDiffTimeStamp contains the difference of current timestamp and start timestamp
            //It is helpful to track the total chaos duration
            chaosDiffTimeStamp := ChaosCurrentTimeStamp - ChaosStartTimeStamp
            if int(chaosDiffTimeStamp) >= experimentsDetails.ChaosDuration {
                failFlag = true
                break
            }

            // signChan channel is used to transmit signal notifications.
            signChan := make(chan os.Signal, 1)
            // Catch and relay certain signal(s) to signChan channel.
            signal.Notify(signChan, os.Interrupt, syscall.SIGTERM, syscall.SIGKILL)
        loop:
            for {
                endTime = time.After(timeDelay)

                select {
                case <-signChan:
                    err = AutoscalerReovery(experimentsDetails, clients, replicaCount, appName)
                    if err != nil {
                        return errors.Errorf("Unable to perform autoscaling, due to %v", err)
                    }
                    // updating the chaosresult after the experiment is stopped
                    failStep := "Pod autoscaler chaos injection stopped!"
                    types.SetResultAfterCompletion(resultDetails, "Fail", "Stopped", failStep)
                    result.ChaosResult(chaosDetails, clients, resultDetails, "EOT")

                    // generating summary event in chaosengine
                    msg := experimentsDetails.ExperimentName + " experiment has been aborted"
                    types.SetEngineEventAttributes(eventsDetails, types.Summary, msg, "Warning", chaosDetails)
                    events.GenerateEvents(eventsDetails, clients, chaosDetails, "ChaosEngine")

                    // generating summary event in chaosresult
                    types.SetResultEventAttributes(eventsDetails, types.Summary, msg, "Warning", resultDetails)
                    events.GenerateEvents(eventsDetails, clients, chaosDetails, "ChaosResult")

                    os.Exit(1)
                case <-endTime:
                    log.Infof("[Chaos]: Time is up for experiment: %v", experimentsDetails.ExperimentName)
                    endTime = nil
                    break loop
                }
            }

            err = AutoscalerReovery(experimentsDetails, clients, replicaCount, appName)
            if err != nil {
                return errors.Errorf("Unable to perform autoscaling, due to %v", err)
            }
        } else {
            break
        }
    }
    if failFlag {
        err = AutoscalerReovery(experimentsDetails, clients, replicaCount, appName)
        if err != nil {
            return errors.Errorf("Unable to perform autoscaling, due to %v", err)
        }
        return errors.Errorf("Application pod failed to come into running state within the chaos duration of %d sec", experimentsDetails.ChaosDuration)
    }
    // Keeping a wait time of 10s after all pods come into running state
    // This is optional and used just for viewing the pod status
    time.Sleep(10 * time.Second)

    return nil
}

//AutoscalerReovery scales the application back to the initial number of replicas
func AutoscalerReovery(experimentsDetails *experimentTypes.ExperimentDetails, clients clients.ClientSets, replicaCount int, appName string) error {

    applicationClient := clients.KubeClient.AppsV1().Deployments(experimentsDetails.ChaosNamespace)

    // Scale back to initial number of replicas
    retryErr := retries.RetryOnConflict(retries.DefaultRetry, func() error {
        // Retrieve the latest version of Deployment before attempting update
        // RetryOnConflict uses exponential backoff to avoid exhausting the apiserver
        result, err := applicationClient.Get(appName, metav1.GetOptions{})
        if err != nil {
            return errors.Errorf("Failed to get latest version of Application Deployment: %v", err)
        }

        result.Spec.Replicas = int32Ptr(int32(replicaCount)) // modify replica count
        _, updateErr := applicationClient.Update(result)
        return updateErr
    })
    if retryErr != nil {
        return errors.Errorf("Unable to scale the application back, due to: %v", retryErr)
    }
    log.Info("[Info]: Application Started Scaling back")

    applicationDeploy, err := clients.KubeClient.AppsV1().Deployments(experimentsDetails.AppNS).Get(appName, metav1.GetOptions{})
    if err != nil {
        return errors.Errorf("Unable to get the application, err: %v", err)
    }
    failFlag := false
    // Check for 30 retries with 2s of delay
    for count := 0; count < 30; count++ {

        if int(applicationDeploy.Status.AvailableReplicas) != replicaCount {

            applicationDeploy, err = applicationClient.Get(appName, metav1.GetOptions{})
            if err != nil {
                return errors.Errorf("Unable to get the application, err: %v", err)
            }
            time.Sleep(2 * time.Second)
            // mark failure on the last retry
            if count == 29 {
                failFlag = true
                break
            }

        } else {
            break
        }
    }
    if failFlag {
        return errors.Errorf("Application failed to roll back to the initial number of replicas")
    }
    log.Info("[RollBack]: Application Pod rolled back to initial number of replicas")

    return nil
}

func int32Ptr(i int32) *int32 { return &i }
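
For context, the chaoslib above reads its tunables from an ExperimentDetails struct in pkg/generic/pod-autoscaler/types. The sketch below lists only the fields the code actually touches, with types inferred from usage; the real struct in the PR may define additional fields (for example the chaos lib name or app label).

```go
// Sketch of the fields used by pod-autoscaler.go above (inferred from usage,
// not the verbatim struct shipped in pkg/generic/pod-autoscaler/types).
package types

type ExperimentDetails struct {
    ExperimentName string // experiment name, e.g. "pod-autoscaler"
    ChaosNamespace string // namespace in which the chaos resources run
    AppNS          string // namespace of the application under test
    ChaosDuration  int    // TOTAL_CHAOS_DURATION, in seconds
    RampTime       int    // RAMP_TIME, in seconds
    Replicas       int    // REPLICA_COUNT to scale the deployment to
}
```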
69 changes: 69 additions & 0 deletions experiments/generic/pod-autoscaler/README.md
@@ -0,0 +1,69 @@
## Experiment CR for the pod-autoscaler experiment

```yaml
apiVersion: litmuschaos.io/v1alpha1
description:
  message: |
    Scale the deployment replicas to check the autoscaling capability
kind: ChaosExperiment
metadata:
  name: pod-autoscaler
  version: 0.1.0
spec:
  definition:
    scope: Namespaced
    permissions:
      - apiGroups:
          - ""
          - "apps"
          - "batch"
          - "litmuschaos.io"
        resources:
          - "deployments"
          - "jobs"
          - "pods"
          - "pods/log"
          - "events"
          - "configmaps"
          - "chaosengines"
          - "chaosexperiments"
          - "chaosresults"
        verbs:
          - "create"
          - "list"
          - "get"
          - "patch"
          - "update"
          - "delete"
      - apiGroups:
          - ""
        resources:
          - "nodes"
        verbs:
          - "get"
          - "list"
    image: "litmuschaos/go-runner:ci"
    imagePullPolicy: Always
    args:
      - -c
      - ./experiments/pod-autoscaler
    command:
      - /bin/bash
    env:

      - name: TOTAL_CHAOS_DURATION
        value: '60'

      - name: REPLICA_COUNT
        value: '5'

      # Period to wait before and after injection of chaos in sec
      - name: RAMP_TIME
        value: ''

      - name: LIB
        value: 'litmus'
    labels:
      name: pod-autoscaler
```
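
To run the experiment, a ChaosEngine resource typically references the ChaosExperiment above and overrides its env values. A minimal illustrative sketch follows; the engine name, namespace, app labels, and service account are placeholders and not part of this PR.

```yaml
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: nginx-chaos          # hypothetical engine name
  namespace: default         # hypothetical namespace
spec:
  appinfo:
    appns: 'default'         # namespace of the application under test
    applabel: 'app=nginx'    # placeholder label selector
    appkind: 'deployment'
  chaosServiceAccount: pod-autoscaler-sa   # placeholder service account
  experiments:
    - name: pod-autoscaler
      spec:
        components:
          env:
            - name: TOTAL_CHAOS_DURATION
              value: '60'
            - name: REPLICA_COUNT
              value: '5'
```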