Skip to content

Commit

Permalink
PTX-19576 Fix enabled/disabled Telemetry logic validation
Browse files Browse the repository at this point in the history
Signed-off-by: nikolaypopov <nikolay.popov86@gmail.com>
  • Loading branch information
nikolaypopov committed Aug 14, 2023
1 parent 72b8d40 commit 2494973
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 25 deletions.
75 changes: 52 additions & 23 deletions pkg/util/test/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,7 @@ func ValidateStorageCluster(
}

// Validate components
if err = validateComponents(pxImageList, liveCluster, timeout, interval); err != nil {
if err = validateComponents(pxImageList, clusterSpec, liveCluster, timeout, interval); err != nil {
return err
}

Expand Down Expand Up @@ -1401,7 +1401,7 @@ func IsK3sCluster() bool {
return false
}

func validateComponents(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
func validateComponents(pxImageList map[string]string, originalClusterSpec, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
// Validate PVC Controller components and images
if err := ValidatePvcController(pxImageList, cluster, timeout, interval); err != nil {
return err
Expand All @@ -1423,7 +1423,7 @@ func validateComponents(pxImageList map[string]string, cluster *corev1.StorageCl
}

// Validate Monitoring
if err := ValidateMonitoring(pxImageList, cluster, timeout, interval); err != nil {
if err := ValidateMonitoring(pxImageList, originalClusterSpec, cluster, timeout, interval); err != nil {
return err
}

Expand Down Expand Up @@ -3056,15 +3056,15 @@ func validateStorkSecurityEnvVar(cluster *corev1.StorageCluster, storkDeployment
}

// ValidateMonitoring validates all PX Monitoring components
func ValidateMonitoring(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
func ValidateMonitoring(pxImageList map[string]string, originalClusterSpec, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
if err := ValidatePrometheus(pxImageList, cluster, timeout, interval); err != nil {
return err
}

// Increasing timeout for Telemetry components as they take quite long time to initialize
defaultTelemetryRetryInterval := 30 * time.Second
defaultTelemetryTimeout := 30 * time.Minute
if err := ValidateTelemetry(pxImageList, cluster, defaultTelemetryTimeout, defaultTelemetryRetryInterval); err != nil {
if err := ValidateTelemetry(pxImageList, originalClusterSpec, cluster, defaultTelemetryTimeout, defaultTelemetryRetryInterval); err != nil {
return err
}

Expand Down Expand Up @@ -3291,7 +3291,7 @@ func ValidateTelemetryV1Disabled(cluster *corev1.StorageCluster, timeout, interv
}

// ValidateTelemetry validates telemetry component is installed/uninstalled as expected
func ValidateTelemetry(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
func ValidateTelemetry(pxImageList map[string]string, originalClusterSpec, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
logrus.Info("Validate Telemetry components")
logrus.Info("Check PX and PX Operator versions to determine which telemetry to validate against..")
pxVersion := GetPortworxVersion(cluster)
Expand All @@ -3308,15 +3308,15 @@ func ValidateTelemetry(pxImageList map[string]string, cluster *corev1.StorageClu
}

if pxVersion.GreaterThanOrEqual(minimumPxVersionCCMGO) && opVersion.GreaterThanOrEqual(opVer1_10) {
return ValidateTelemetryV2(pxImageList, cluster, timeout, interval)
return ValidateTelemetryV2(pxImageList, originalClusterSpec, cluster, timeout, interval)
}
return ValidateTelemetryV1(pxImageList, cluster, timeout, interval)
return ValidateTelemetryV1(pxImageList, originalClusterSpec, cluster, timeout, interval)
}

// ValidateTelemetryV1 validates old version of ccm-java telemetry
func ValidateTelemetryV1(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
func ValidateTelemetryV1(pxImageList map[string]string, originalClusterSpec, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
logrus.Info("Validating Telemetry (ccm-java)")
if shouldTelemetryBeEnabled(cluster) {
if shouldTelemetryBeEnabled(originalClusterSpec, cluster) {
if err := ValidateTelemetryV1Enabled(pxImageList, cluster, timeout, interval); err != nil {
return fmt.Errorf("failed to validate Telemetry enabled, Err: %v", err)
}
Expand All @@ -3330,9 +3330,9 @@ func ValidateTelemetryV1(pxImageList map[string]string, cluster *corev1.StorageC
}

// ValidateTelemetryV2 validates new version of ccm-go telemetry
func ValidateTelemetryV2(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
func ValidateTelemetryV2(pxImageList map[string]string, originalClusterSpec, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
logrus.Info("Validating Telemetry (ccm-go)")
if shouldTelemetryBeEnabled(cluster) {
if shouldTelemetryBeEnabled(originalClusterSpec, cluster) {
if err := ValidateTelemetryV2Enabled(pxImageList, cluster, timeout, interval); err != nil {
return fmt.Errorf("failed to validate Telemetry enabled, Err: %v", err)
}
Expand All @@ -3346,29 +3346,54 @@ func ValidateTelemetryV2(pxImageList map[string]string, cluster *corev1.StorageC
}

// shouldTelemetryBeEnabled validates if Telemetry should be auto enabled/disabled by default
func shouldTelemetryBeEnabled(cluster *corev1.StorageCluster) bool {
func shouldTelemetryBeEnabled(originalClusterSpec, cluster *corev1.StorageCluster) bool {
logrus.Info("Checking if Telemetry should be enabled or disabled")
var shouldTelemetryBeEnabled bool
var telemetryEnabledInTheSpec bool

logrus.Info("Check PX and PX Operator versions to determine which Telemetry version to validate against..")
pxVersion := GetPortworxVersion(cluster)
logrus.Infof("PX Version: [%s]", pxVersion.String())
opVersion, _ := GetPxOperatorVersion()
logrus.Infof("PX Operator version: [%s]", opVersion.String())

// Telemetry is disabled explicitly then leave it as is
if cluster.Spec.Monitoring != nil &&
cluster.Spec.Monitoring.Telemetry != nil &&
!cluster.Spec.Monitoring.Telemetry.Enabled {
logrus.Debug("Telemetry is explicitly disabled in StorageCluster")
return false
} else {
logrus.Debug("Telemetry is explicitly enabled in StorageCluster")
shouldTelemetryBeEnabled = true
// Check if Telemetry is enabled or disabled in the original spec
if originalClusterSpec.Spec.Monitoring != nil && originalClusterSpec.Spec.Monitoring.Telemetry != nil {
if originalClusterSpec.Spec.Monitoring.Telemetry.Enabled {
logrus.Debug("Telemetry is explicitly enabled in StorageCluster spec")
telemetryEnabledInTheSpec = true
} else {
logrus.Debug("Telemetry is explicitly disabled in StorageCluster spec")
telemetryEnabledInTheSpec = false
}
}

// Telemetry is not supported in those cases, set to disabled
// Get PX PROXY env vars from StorageCluster, if any
proxyType, proxy := GetPxProxyEnvVarValue(cluster)

// Validate conditions for when we are expecting Telemetry to be enabled/disabled
if cluster.Spec.Monitoring != nil && cluster.Spec.Monitoring.Telemetry != nil {
liveTelemetryEnabled := cluster.Spec.Monitoring.Telemetry.Enabled

// If Telemetry is disabled in both live and original spec, expect it to be disabled
if !telemetryEnabledInTheSpec == false && !liveTelemetryEnabled {
logrus.Debug("Telemetry is explicitly disabled in live StorageCluster and original spec")
return false
}

// If Telemetry is enabled in both live and original spec, expect it to be enabled
if telemetryEnabledInTheSpec && liveTelemetryEnabled {
logrus.Debug("Telemetry is explicitly enabled in live StorageCluster and original spec")
shouldTelemetryBeEnabled = true
}

// If Telemetry is enabled in the original spec and disabled in the live spec, but the PX PROXY is present, expect it to be enabled
if telemetryEnabledInTheSpec && len(proxy) > 0 && !liveTelemetryEnabled {
logrus.Warnf("Telemetry is explicitly enabled in the original spec, but it seems to be disabled in live StorageCluster, expecting it to be enabled and working as PROXY was provided [%s]", proxy)
shouldTelemetryBeEnabled = true
}
}

if pxVersion.LessThan(minimumPxVersionCCMJAVA) {
// PX version is lower than 2.8
logrus.Warnf("Telemetry is not supported on Portworx version: [%s]", pxVersion.String())
Expand Down Expand Up @@ -3410,6 +3435,10 @@ func shouldTelemetryBeEnabled(cluster *corev1.StorageCluster) bool {
if errors.IsNotFound(err) {
logrus.Debugf("Telemetry secret [%s] was not found, will try to reach to Pure1 to see if Telemetry should be auto enabled by default", TelemetryCertName)
if canAccess := CanAccessArcusRegisterEndpoint(cluster, proxy); !canAccess {
if shouldTelemetryBeEnabled {
logrus.Warnf("Not able to reach Pure1, but it should have been able to reach it due to PX PROXY was passed, please check your PROXY server")
return true
}
logrus.Warnf("Telemetry be disabled due to cannot reach to Pure1")
return false
}
Expand Down
2 changes: 1 addition & 1 deletion test/integration_test/basic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,7 @@ func BasicTelemetryRegression(tc *types.TestCase) func(*testing.T) {
logrus.Infof("Validate Telemetry is enabled by default, PX version [%s], operator version [%s]", pxVersion, opVersion)
require.True(t, telemetryEnabled, "failed to validate default Telemetry status: expected enabled [true], actual enabled [%v]", telemetryEnabled)

err = testutil.ValidateMonitoring(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateComponentTimeout, ci_utils.DefaultValidateComponentRetryInterval)
err = testutil.ValidateMonitoring(ci_utils.PxSpecImages, cluster, cluster, ci_utils.DefaultValidateComponentTimeout, ci_utils.DefaultValidateComponentRetryInterval)
require.NoError(t, err)
} else {
// Validate Telemetry is not enabled by default
Expand Down
2 changes: 1 addition & 1 deletion test/integration_test/utils/storagecluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ func UpdateAndValidateMonitoring(cluster *corev1.StorageCluster, f func(*corev1.
latestLiveCluster, err := UpdateStorageCluster(newCluster)
require.NoError(t, err)

err = testutil.ValidateMonitoring(pxSpecImages, latestLiveCluster, DefaultValidateComponentTimeout, DefaultValidateComponentRetryInterval)
err = testutil.ValidateMonitoring(pxSpecImages, latestLiveCluster, latestLiveCluster, DefaultValidateComponentTimeout, DefaultValidateComponentRetryInterval)
require.NoError(t, err)

return latestLiveCluster
Expand Down

0 comments on commit 2494973

Please sign in to comment.