From 4b771b219044239ceb9100b6133ddf98c87f265b Mon Sep 17 00:00:00 2001
From: Shubham Chaudhary
Date: Thu, 27 Apr 2023 20:08:37 +0530
Subject: [PATCH] chore(onchaos): Busy wait for onchaos mode (#659)

The Continuous and OnChaos trigger loops now look up their ProbeDetails
entry once via getProbeByName and set HasProbeExecutedOnce on it after
every probe run (also when the run failed and the loop is left).

checkForErrorInContinuousProbe polls that flag, sleeping
chaosDetails.Delay seconds between checks, and returns a timeout error
after chaosDetails.Timeout seconds. Once the flag is set it returns the
recorded IsProbeFailedWithError.

ResultDetails.ProbeDetails becomes []*ProbeDetails so that the trigger
loops and the post-chaos check operate on the same entry.

Failure logs name the probe type of the loop that emitted them
(cmd, http, k8s, prom).

Signed-off-by: Shubham Chaudhary
---
 pkg/probe/cmdprobe.go  | 64 ++++++++++++++++++++----------------------
 pkg/probe/httpprobe.go | 32 ++++++++++-----------
 pkg/probe/k8sprobe.go  | 32 ++++++++++-----------
 pkg/probe/probe.go     | 39 +++++++++++++++++++------
 pkg/probe/promProbe.go | 35 +++++++++++------------
 pkg/types/types.go     |  3 +-
 6 files changed, 109 insertions(+), 96 deletions(-)

diff --git a/pkg/probe/cmdprobe.go b/pkg/probe/cmdprobe.go
index 83fb97fcf..60a30079d 100644
--- a/pkg/probe/cmdprobe.go
+++ b/pkg/probe/cmdprobe.go
@@ -301,6 +301,7 @@ func getRunID() string {
 // triggerInlineContinuousCmdProbe trigger the inline continuous cmd probes
 func triggerInlineContinuousCmdProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
 		log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds)
@@ -314,15 +315,13 @@ loop:
 		err = triggerInlineCmdProbe(probe, chaosresult)
 		// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
 		if err != nil {
-			for index := range chaosresult.ProbeDetails {
-				if chaosresult.ProbeDetails[index].Name == probe.Name {
-					chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-					log.Errorf("The %v cmd probe has been Failed, err: %v", probe.Name, err)
-					isExperimentFailed = true
-					break loop
-				}
-			}
+			probeDetails.IsProbeFailedWithError = err
+			log.Errorf("%v cmd probe has Failed, err: %v", probe.Name, err)
+			isExperimentFailed = true
+			probeDetails.HasProbeExecutedOnce = true
+			break loop
 		}
+		probeDetails.HasProbeExecutedOnce = true
 		// waiting for the probe polling interval
 		time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 	}
@@ -339,6 +338,7 @@ loop:
 // triggerInlineOnChaosCmdProbe trigger the inline onchaos cmd probes
 func triggerInlineOnChaosCmdProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	duration := chaosDetails.ChaosDuration
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
@@ -363,15 +363,13 @@ loop:
 		default:
 			// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
 			if err = triggerInlineCmdProbe(probe, chaosresult); err != nil {
-				for index := range chaosresult.ProbeDetails {
-					if chaosresult.ProbeDetails[index].Name == probe.Name {
-						chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-						log.Errorf("The %v cmd probe has been Failed, err: %v", probe.Name, err)
-						isExperimentFailed = true
-						break loop
-					}
-				}
+				probeDetails.IsProbeFailedWithError = err
+				log.Errorf("%v cmd probe has Failed, err: %v", probe.Name, err)
+				isExperimentFailed = true
+				probeDetails.HasProbeExecutedOnce = true
+				break loop
 			}
+			probeDetails.HasProbeExecutedOnce = true
 			// waiting for the probe polling interval
 			time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 		}
@@ -390,6 +388,7 @@ loop:
 func triggerSourceOnChaosCmdProbe(probe v1alpha1.ProbeAttributes, execCommandDetails litmusexec.PodDetails, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	duration := chaosDetails.ChaosDuration
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
@@ -412,15 +411,13 @@ loop:
 		default:
 			// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
 			if err = triggerSourceCmdProbe(probe, execCommandDetails, clients, chaosresult); err != nil {
-				for index := range chaosresult.ProbeDetails {
-					if chaosresult.ProbeDetails[index].Name == probe.Name {
-						chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-						log.Errorf("The %v cmd probe has been Failed, err: %v", probe.Name, err)
-						isExperimentFailed = true
-						break loop
-					}
-				}
+				probeDetails.IsProbeFailedWithError = err
+				log.Errorf("%v cmd probe has Failed, err: %v", probe.Name, err)
+				isExperimentFailed = true
+				probeDetails.HasProbeExecutedOnce = true
+				break loop
 			}
+			probeDetails.HasProbeExecutedOnce = true
 			// waiting for the probe polling interval
 			time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 		}
@@ -440,6 +437,7 @@ loop:
 func triggerSourceContinuousCmdProbe(probe v1alpha1.ProbeAttributes, execCommandDetails litmusexec.PodDetails, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
 		log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds)
@@ -453,15 +451,13 @@ loop:
 		err = triggerSourceCmdProbe(probe, execCommandDetails, clients, chaosresult)
 		// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
 		if err != nil {
-			for index := range chaosresult.ProbeDetails {
-				if chaosresult.ProbeDetails[index].Name == probe.Name {
-					chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-					log.Errorf("The %v cmd probe has been Failed, err: %v", probe.Name, err)
-					isExperimentFailed = true
-					break loop
-				}
-			}
+			probeDetails.IsProbeFailedWithError = err
+			log.Errorf("%v cmd probe has Failed, err: %v", probe.Name, err)
+			isExperimentFailed = true
+			probeDetails.HasProbeExecutedOnce = true
+			break loop
 		}
+		probeDetails.HasProbeExecutedOnce = true
 		// waiting for the probe polling interval
 		time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 	}
@@ -648,14 +644,14 @@ func postChaosCmdProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resu
 	case "Continuous", "OnChaos":
 		if reflect.DeepEqual(probe.CmdProbeInputs.Source, v1alpha1.SourceDetails{}) {
 			// it will check for the error, It will detect the error if any error encountered in probe during chaos
-			err = checkForErrorInContinuousProbe(resultDetails, probe.Name)
+			err = checkForErrorInContinuousProbe(resultDetails, chaosDetails.Timeout, chaosDetails.Delay, probe.Name)
 			// failing the probe, if the success condition doesn't met after the retry & timeout combinations
 			if err = markedVerdictInEnd(err, resultDetails, probe, "PostChaos"); err != nil {
 				return err
 			}
 		} else {
 			// it will check for the error, It will detect the error if any error encountered in probe during chaos
-			err = checkForErrorInContinuousProbe(resultDetails, probe.Name)
+			err = checkForErrorInContinuousProbe(resultDetails, chaosDetails.Timeout, chaosDetails.Delay, probe.Name)
 
 			// failing the probe, if the success condition doesn't met after the retry & timeout combinations
 			if err = markedVerdictInEnd(err, resultDetails, probe, "PostChaos"); err != nil {
diff --git a/pkg/probe/httpprobe.go b/pkg/probe/httpprobe.go
index 3b4ec3bfa..b8562aa78 100644
--- a/pkg/probe/httpprobe.go
+++ b/pkg/probe/httpprobe.go
@@ -198,6 +198,7 @@ func getHTTPBody(httpBody v1alpha1.PostMethod) (string, error) {
 // triggerContinuousHTTPProbe trigger the continuous http probes
 func triggerContinuousHTTPProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
 		log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds)
@@ -211,15 +212,13 @@ loop:
 		err = triggerHTTPProbe(probe, chaosresult)
 		// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
 		if err != nil {
-			for index := range chaosresult.ProbeDetails {
-				if chaosresult.ProbeDetails[index].Name == probe.Name {
-					chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-					log.Errorf("The %v http probe has been Failed, err: %v", probe.Name, err)
-					isExperimentFailed = true
-					break loop
-				}
-			}
+			probeDetails.IsProbeFailedWithError = err
+			log.Errorf("%v http probe has Failed, err: %v", probe.Name, err)
+			isExperimentFailed = true
+			probeDetails.HasProbeExecutedOnce = true
+			break loop
 		}
+		probeDetails.HasProbeExecutedOnce = true
 		// waiting for the probe polling interval
 		time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 	}
@@ -307,7 +306,7 @@ func postChaosHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Res
 		}
 	case "Continuous", "OnChaos":
 		// it will check for the error, It will detect the error if any error encountered in probe during chaos
-		err = checkForErrorInContinuousProbe(resultDetails, probe.Name)
+		err = checkForErrorInContinuousProbe(resultDetails, chaosDetails.Timeout, chaosDetails.Delay, probe.Name)
 		// failing the probe, if the success condition doesn't met after the retry & timeout combinations
 		if err = markedVerdictInEnd(err, resultDetails, probe, "PostChaos"); err != nil {
 			return err
@@ -320,6 +319,7 @@ func postChaosHTTPProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Res
 func triggerOnChaosHTTPProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	duration := chaosDetails.ChaosDuration
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
@@ -343,15 +343,13 @@ loop:
 			err = triggerHTTPProbe(probe, chaosresult)
 			// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
 			if err != nil {
-				for index := range chaosresult.ProbeDetails {
-					if chaosresult.ProbeDetails[index].Name == probe.Name {
-						chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-						isExperimentFailed = true
-						break loop
-					}
-				}
+				probeDetails.IsProbeFailedWithError = err
+				log.Errorf("%v http probe has Failed, err: %v", probe.Name, err)
+				isExperimentFailed = true
+				probeDetails.HasProbeExecutedOnce = true
+				break loop
 			}
-
+			probeDetails.HasProbeExecutedOnce = true
 			// waiting for the probe polling interval
 			time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 		}
diff --git a/pkg/probe/k8sprobe.go b/pkg/probe/k8sprobe.go
index 443bbc809..41c8eefef 100644
--- a/pkg/probe/k8sprobe.go
+++ b/pkg/probe/k8sprobe.go
@@ -115,6 +115,7 @@ func triggerK8sProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets,
 // triggerContinuousK8sProbe trigger the continuous k8s probes
 func triggerContinuousK8sProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
 		log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds)
@@ -128,15 +129,13 @@ loop:
 		err = triggerK8sProbe(probe, clients, chaosresult)
 		// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
 		if err != nil {
-			for index := range chaosresult.ProbeDetails {
-				if chaosresult.ProbeDetails[index].Name == probe.Name {
-					chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-					log.Errorf("the %v k8s probe has been Failed, err: %v", probe.Name, err)
-					isExperimentFailed = true
-					break loop
-				}
-			}
+			probeDetails.IsProbeFailedWithError = err
+			log.Errorf("%v k8s probe has Failed, err: %v", probe.Name, err)
+			isExperimentFailed = true
+			probeDetails.HasProbeExecutedOnce = true
+			break loop
 		}
+		probeDetails.HasProbeExecutedOnce = true
 		// waiting for the probe polling interval
 		time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 	}
@@ -255,7 +254,7 @@ func postChaosK8sProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resu
 		}
 	case "continuous", "onchaos":
 		// it will check for the error, It will detect the error if any error encountered in probe during chaos
-		err = checkForErrorInContinuousProbe(resultDetails, probe.Name)
+		err = checkForErrorInContinuousProbe(resultDetails, chaosDetails.Timeout, chaosDetails.Delay, probe.Name)
 		// failing the probe, if the success condition doesn't met after the retry & timeout combinations
 		if err = markedVerdictInEnd(err, resultDetails, probe, "PostChaos"); err != nil {
 			return err
@@ -287,6 +286,7 @@ func onChaosK8sProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Result
 func triggerOnChaosK8sProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	duration := chaosDetails.ChaosDuration
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
@@ -310,15 +310,13 @@ loop:
 			err = triggerK8sProbe(probe, clients, chaosresult)
 			// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
 			if err != nil {
-				for index := range chaosresult.ProbeDetails {
-					if chaosresult.ProbeDetails[index].Name == probe.Name {
-						chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-						log.Errorf("The %v k8s probe has been Failed, err: %v", probe.Name, err)
-						isExperimentFailed = true
-						break loop
-					}
-				}
+				probeDetails.IsProbeFailedWithError = err
+				log.Errorf("%v k8s probe has Failed, err: %v", probe.Name, err)
+				isExperimentFailed = true
+				probeDetails.HasProbeExecutedOnce = true
+				break loop
 			}
+			probeDetails.HasProbeExecutedOnce = true
 			// waiting for the probe polling interval
 			time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 		}
diff --git a/pkg/probe/probe.go b/pkg/probe/probe.go
index 0ea009067..92785e96e 100644
--- a/pkg/probe/probe.go
+++ b/pkg/probe/probe.go
@@ -139,7 +139,7 @@ func getProbesFromEngine(chaosDetails *types.ChaosDetails, clients clients.Clien
 // it fetch the probe details from the chaosengine and set into the chaosresult
 func InitializeProbesInChaosResultDetails(chaosDetails *types.ChaosDetails, clients clients.ClientSets, chaosresult *types.ResultDetails) error {
 
-	var probeDetails []types.ProbeDetails
+	var probeDetails []*types.ProbeDetails
 	// get the probes from the chaosengine
 	probes, err := getProbesFromEngine(chaosDetails, clients)
 	if err != nil {
@@ -148,7 +148,7 @@ func InitializeProbesInChaosResultDetails(chaosDetails *types.ChaosDetails, clie
 
 	// set the probe details for k8s probe
 	for _, probe := range probes {
-		tempProbe := types.ProbeDetails{}
+		tempProbe := &types.ProbeDetails{}
 		tempProbe.Name = probe.Name
 		tempProbe.Type = probe.Type
 		tempProbe.Mode = probe.Mode
@@ -252,14 +252,25 @@ func getDescription(mode, phase string) string {
 }
 
 //CheckForErrorInContinuousProbe check for the error in the continuous probes
-func checkForErrorInContinuousProbe(resultDetails *types.ResultDetails, probeName string) error {
-
-	for index, probe := range resultDetails.ProbeDetails {
-		if probe.Name == probeName {
-			return resultDetails.ProbeDetails[index].IsProbeFailedWithError
+func checkForErrorInContinuousProbe(resultDetails *types.ResultDetails, timeout, delay int, probeName string) error {
+	probe := getProbeByName(probeName, resultDetails.ProbeDetails)
+	timeoutChan := time.After(time.Duration(timeout) * time.Second)
+
+loop:
+	for {
+		select {
+		case <-timeoutChan:
+			return errors.Errorf("%v probe execution timed out", probeName)
+		default:
+			if probe.HasProbeExecutedOnce {
+				break loop
+			}
+			log.Infof("[Probe]: Waiting for %s probe to finish or timeout", probeName)
+			time.Sleep(time.Duration(delay) * time.Second)
 		}
 	}
-	return nil
+
+	return probe.IsProbeFailedWithError
 }
 
 // ParseCommand parse the templated command and replace the templated value by actual value
@@ -282,7 +293,7 @@ func parseCommand(templatedCommand string, resultDetails *types.ResultDetails) (
 // stopChaosEngine update the probe status and patch the chaosengine to stop state
 func stopChaosEngine(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) error {
 	// it will check for the error, It will detect the error if any error encountered in probe during chaos
-	err = checkForErrorInContinuousProbe(chaosresult, probe.Name)
+	err = checkForErrorInContinuousProbe(chaosresult, chaosDetails.Timeout, chaosDetails.Delay, probe.Name)
 	// failing the probe, if the success condition doesn't met after the retry & timeout combinations
 	markedVerdictInEnd(err, chaosresult, probe, "PostChaos")
 	//patch chaosengine's state to stop
@@ -332,3 +343,13 @@ func getProbeVerdict(resultDetails *types.ResultDetails, name, probeType string)
 	}
 	return v1alpha1.ProbeVerdictNA
 }
+
+// getProbeByName returns the probe details of a probe given its name
+func getProbeByName(name string, probeDetails []*types.ProbeDetails) *types.ProbeDetails {
+	for _, p := range probeDetails {
+		if p.Name == name {
+			return p
+		}
+	}
+	return nil
+}
diff --git a/pkg/probe/promProbe.go b/pkg/probe/promProbe.go
index f34f70478..7dfee0c6d 100644
--- a/pkg/probe/promProbe.go
+++ b/pkg/probe/promProbe.go
@@ -128,7 +128,7 @@ func postChaosPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Res
 	case "continuous", "onchaos":
 
 		// it will check for the error, It will detect the error if any error encountered in probe during chaos
-		err = checkForErrorInContinuousProbe(resultDetails, probe.Name)
+		err = checkForErrorInContinuousProbe(resultDetails, chaosDetails.Timeout, chaosDetails.Delay, probe.Name)
 
 		// failing the probe, if the success condition doesn't met after the retry & timeout combinations
 		if err = markedVerdictInEnd(err, resultDetails, probe, "PostChaos"); err != nil {
@@ -217,6 +217,7 @@ func triggerPromProbe(probe v1alpha1.ProbeAttributes, resultDetails *types.Resul
 func triggerContinuousPromProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
 		log.Infof("[Wait]: Waiting for %vs before probe execution", probe.RunProperties.InitialDelaySeconds)
@@ -230,15 +231,13 @@ loop:
 		err = triggerPromProbe(probe, chaosresult)
 		// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
 		if err != nil {
-			for index := range chaosresult.ProbeDetails {
-				if chaosresult.ProbeDetails[index].Name == probe.Name {
-					chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-					log.Errorf("The %v prom probe has been Failed, err: %v", probe.Name, err)
-					isExperimentFailed = true
-					break loop
-				}
-			}
+			probeDetails.IsProbeFailedWithError = err
+			log.Errorf("%v prom probe has Failed, err: %v", probe.Name, err)
+			isExperimentFailed = true
+			probeDetails.HasProbeExecutedOnce = true
+			break loop
 		}
+		probeDetails.HasProbeExecutedOnce = true
 		// waiting for the probe polling interval
 		time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 	}
@@ -256,6 +255,7 @@ loop:
 func triggerOnChaosPromProbe(probe v1alpha1.ProbeAttributes, clients clients.ClientSets, chaosresult *types.ResultDetails, chaosDetails *types.ChaosDetails) {
 
 	var isExperimentFailed bool
+	probeDetails := getProbeByName(probe.Name, chaosresult.ProbeDetails)
 	duration := chaosDetails.ChaosDuration
 	// waiting for initial delay
 	if probe.RunProperties.InitialDelaySeconds != 0 {
@@ -277,16 +277,15 @@ loop:
 			break loop
 		default:
 			// record the error inside the probeDetails, we are maintaining a dedicated variable for the err, inside probeDetails
-			if err = triggerPromProbe(probe, chaosresult); err != nil {
-				for index := range chaosresult.ProbeDetails {
-					if chaosresult.ProbeDetails[index].Name == probe.Name {
-						chaosresult.ProbeDetails[index].IsProbeFailedWithError = err
-						log.Errorf("The %v prom probe has been Failed, err: %v", probe.Name, err)
-						isExperimentFailed = true
-						break loop
-					}
-				}
+			err = triggerPromProbe(probe, chaosresult)
+			if err != nil {
+				probeDetails.IsProbeFailedWithError = err
+				log.Errorf("%v prom probe has Failed, err: %v", probe.Name, err)
+				isExperimentFailed = true
+				probeDetails.HasProbeExecutedOnce = true
+				break loop
 			}
+			probeDetails.HasProbeExecutedOnce = true
 			// waiting for the probe polling interval
 			time.Sleep(time.Duration(probe.RunProperties.ProbePollingInterval) * time.Second)
 		}
diff --git a/pkg/types/types.go b/pkg/types/types.go
index 1e9f62aec..85494c89a 100644
--- a/pkg/types/types.go
+++ b/pkg/types/types.go
@@ -35,7 +35,7 @@ type ResultDetails struct {
 	FailStep         string
 	Phase            v1alpha1.ResultPhase
 	ResultUID        clientTypes.UID
-	ProbeDetails     []ProbeDetails
+	ProbeDetails     []*ProbeDetails
 	PassedProbeCount int
 	ProbeArtifacts   map[string]ProbeArtifact
 }
@@ -57,6 +57,7 @@ type ProbeDetails struct {
 	Mode                   string
 	Status                 v1alpha1.ProbeStatus
 	IsProbeFailedWithError error
+	HasProbeExecutedOnce   bool
 	RunID                  string
 	RunCount               int
 }