From 3e9250ec2673ddf81bae5041e38ca80f9620edf9 Mon Sep 17 00:00:00 2001
From: nikolaypopov
Date: Thu, 3 Aug 2023 00:17:04 +0000
Subject: [PATCH] PTX-19346 Adding workaround for the known Telemetry port
 issue for specific Operator upgrades

Signed-off-by: nikolaypopov
---
 pkg/util/test/util.go                      | 47 +++++++++++++++++-----
 test/integration_test/basic_test.go        | 33 ++++++++++++++-
 test/integration_test/marketplace_test.go  | 32 ++++++++++++++-
 test/integration_test/utils/px_operator.go |  4 ++
 4 files changed, 103 insertions(+), 13 deletions(-)

diff --git a/pkg/util/test/util.go b/pkg/util/test/util.go
index 3c1f6f821..659d8bf1c 100644
--- a/pkg/util/test/util.go
+++ b/pkg/util/test/util.go
@@ -3308,16 +3308,41 @@ func ValidateTelemetry(pxImageList map[string]string, cluster *corev1.StorageClu
 	}
 
 	if pxVersion.GreaterThanOrEqual(minimumPxVersionCCMGO) && opVersion.GreaterThanOrEqual(opVer1_10) {
-		if shouldTelemetryBeEnabled(cluster) {
-			return ValidateTelemetryV2Enabled(pxImageList, cluster, timeout, interval)
+		return ValidateTelemetryV2(pxImageList, cluster, timeout, interval)
+	}
+	return ValidateTelemetryV1(pxImageList, cluster, timeout, interval)
+}
+
+// ValidateTelemetryV1 validates old version of ccm-java telemetry
+func ValidateTelemetryV1(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
+	logrus.Info("Validating Telemetry (ccm-java)")
+	if shouldTelemetryBeEnabled(cluster) {
+		if err := ValidateTelemetryV1Enabled(pxImageList, cluster, timeout, interval); err != nil {
+			return fmt.Errorf("failed to validate Telemetry enabled, Err: %v", err)
 		}
-		return ValidateTelemetryV2Disabled(cluster, timeout, interval)
-	} else {
-		if shouldTelemetryBeEnabled(cluster) {
-			return ValidateTelemetryV1Enabled(pxImageList, cluster, timeout, interval)
+		return nil
+	}
+
+	if err := ValidateTelemetryV1Disabled(cluster, timeout, interval); err != nil {
+		return fmt.Errorf("failed to validate Telemetry disabled, Err: %v", err)
+	}
+	return nil
+}
+
+// ValidateTelemetryV2 validates new version of ccm-go telemetry
+func ValidateTelemetryV2(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
+	logrus.Info("Validating Telemetry (ccm-go)")
+	if shouldTelemetryBeEnabled(cluster) {
+		if err := ValidateTelemetryV2Enabled(pxImageList, cluster, timeout, interval); err != nil {
+			return fmt.Errorf("failed to validate Telemetry enabled, Err: %v", err)
 		}
-		return ValidateTelemetryV1Disabled(cluster, timeout, interval)
+		return nil
 	}
+
+	if err := ValidateTelemetryV2Disabled(cluster, timeout, interval); err != nil {
+		return fmt.Errorf("failed to validate Telemetry disabled, Err: %v", err)
+	}
+	return nil
 }
 
 // shouldTelemetryBeEnabled validates if Telemetry should be auto enabled/disabled by default
@@ -4041,7 +4066,7 @@ func validatePxTelemetryPhonehomeV2(pxImageList map[string]string, cluster *core
 	// Validate px-telemetry-phonehome daemonset, pods and container images
 	logrus.Info("Validate px-telemetry-phonehome daemonset and images")
 	if err := appops.Instance().ValidateDaemonSet("px-telemetry-phonehome", cluster.Namespace, timeout); err != nil {
-		return err
+		return fmt.Errorf("failed to validate [px-telemetry-phonehome] daemonset in [%s] namespace, Err: %v", cluster.Namespace, err)
 	}
 
 	telemetryPhonehomeDs, err := appops.Instance().GetDaemonSet("px-telemetry-phonehome", cluster.Namespace)
@@ -4094,7 +4119,7 @@ func validatePxTelemetryMetricsCollectorV2(pxImageList map[string]string, cluste
 		},
 	}
 	if err := appops.Instance().ValidateDeployment(metricsCollectorDep, timeout, interval); err != nil {
-		return err
+		return fmt.Errorf("failed to validate [%s] deployment in [%s] namespace, Err: %v", metricsCollectorDep.Name, metricsCollectorDep.Namespace, err)
 	}
 
 	pods, err := appops.Instance().GetDeploymentPods(metricsCollectorDep)
@@ -4142,7 +4167,7 @@ func validatePxTelemetryRegistrationV2(pxImageList map[string]string, cluster *c
 		},
 	}
 	if err := appops.Instance().ValidateDeployment(registrationServiceDep, timeout, interval); err != nil {
-		return err
+		return fmt.Errorf("failed to validate [%s] deployment in [%s] namespace, Err: %v", registrationServiceDep.Name, registrationServiceDep.Namespace, err)
 	}
 
 	pods, err := appops.Instance().GetDeploymentPods(registrationServiceDep)
@@ -4287,7 +4312,7 @@ func ValidateTelemetryV1Enabled(pxImageList map[string]string, cluster *corev1.S
 			},
 		}
 		if err := appops.Instance().ValidateDeployment(&dep, timeout, interval); err != nil {
-			return nil, true, err
+			return nil, true, fmt.Errorf("failed to validate [%s] deployment in [%s] namespace, Err: %v", dep.Name, dep.Namespace, err)
 		}
 
 		deployment, err := appops.Instance().GetDeployment(dep.Name, dep.Namespace)
diff --git a/test/integration_test/basic_test.go b/test/integration_test/basic_test.go
index b6883c422..186c4f3ab 100644
--- a/test/integration_test/basic_test.go
+++ b/test/integration_test/basic_test.go
@@ -5,6 +5,7 @@ package integrationtest
 
 import (
 	"fmt"
+	"strings"
 	"testing"
 	"time"
 
@@ -447,6 +448,8 @@ func BasicUpgradeOperator(tc *types.TestCase) func(*testing.T) {
 			require.NoError(t, err)
 			pxOperatorImage, err := ci_utils.GetPxOperatorImage(pxOperatorDeployment)
 			require.NoError(t, err)
+			lastHopVersion, err := ci_utils.GetPXOperatorVersion(pxOperatorDeployment)
+			require.NoError(t, err)
 			if len(lastHopImage) == 0 {
 				// This is initially deployed PX Operator version
 				lastHopImage = pxOperatorImage
@@ -483,9 +486,37 @@ func BasicUpgradeOperator(tc *types.TestCase) func(*testing.T) {
 			require.Equal(t, pxOperatorImage, hopImage)
 
 			// Validate StorageCluster
+			telemetryErr := "failed to validate Telemetry"
 			err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
-			require.NoError(t, err)
+			actualTelemetryErr := err
+
+			// NOTE: This is a workaround for a known Telemetry port issue, where restart of PX pods is required for them to set port from 9024 to new expected port 9029
+			if actualTelemetryErr != nil {
+				if strings.Contains(actualTelemetryErr.Error(), telemetryErr) {
+					logrus.Warnf("Got Telemetry error: %v", actualTelemetryErr)
+					logrus.Info("Checking if Telemetry error is expected and needs a workaround to get it to work after the upgrade of PX Operator..")
+					currentHopVersion, err := ci_utils.GetPXOperatorVersion(pxOperatorDeployment)
+					require.NoError(t, err)
+					if lastHopVersion.LessThanOrEqual(ci_utils.PxOperatorVer23_5_1) && currentHopVersion.GreaterThanOrEqual(ci_utils.PxOperatorVer23_5_1) {
+						logrus.Warnf("PX Operator upgraded from [%s] to [%s], before upgrade PX Operator version was less than 23.5.1, will need to delete PX pods due to known Telemetry port issue, will perform workaround..", lastHopVersion.String(), currentHopVersion.String())
+						// If error is Telemetry related, bounce PX pods
+						logrus.Info("Deleting portworx pods..")
+						err = coreops.Instance().DeletePodsByLabels(cluster.Namespace, map[string]string{"name": "portworx"}, 120*time.Second)
+						require.NoError(t, err)
 
+						// Validate StorageCluster again
+						logrus.Info("Re-validating PX components..")
+						err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
+						require.NoError(t, err)
+					} else {
+						logrus.Warnf("Previous PX Operator version before upgrade [%s] should have not caused Telemetry issue when upgrading to PX Operator [%s]", lastHopVersion.String(), currentHopVersion.String())
+						require.NoError(t, fmt.Errorf("Telemetry error is not expected here, Err: %v", actualTelemetryErr))
+					}
+				} else {
+					// Any validation failure that is not Telemetry related must still fail the test
+					require.NoError(t, actualTelemetryErr)
+				}
+			}
 			lastHopImage = hopImage
 		}
 
diff --git a/test/integration_test/marketplace_test.go b/test/integration_test/marketplace_test.go
index 4274ea1d5..646d8c207 100644
--- a/test/integration_test/marketplace_test.go
+++ b/test/integration_test/marketplace_test.go
@@ -14,6 +14,7 @@ import (
 	testutil "github.com/libopenstorage/operator/pkg/util/test"
 	"github.com/libopenstorage/operator/test/integration_test/types"
 	ci_utils "github.com/libopenstorage/operator/test/integration_test/utils"
+	coreops "github.com/portworx/sched-ops/k8s/core"
 	"github.com/sirupsen/logrus"
 	"github.com/stretchr/testify/require"
 )
@@ -159,8 +160,37 @@ func BasicUpgradeOperatorViaOcpMarketplace(tc *types.TestCase) func(*testing.T)
 			time.Sleep(15 * time.Second)
 
 			// Validate StorageCluster
+			telemetryErr := "failed to validate Telemetry"
 			err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
-			require.NoError(t, err)
+			actualTelemetryErr := err
+
+			// NOTE: This is a workaround for a known Telemetry port issue, where restart of PX pods is required for them to set port from 9024 to new expected port 9029
+			if actualTelemetryErr != nil {
+				if strings.Contains(actualTelemetryErr.Error(), telemetryErr) {
+					logrus.Warnf("Got Telemetry error: %v", actualTelemetryErr)
+					logrus.Info("Checking if Telemetry error is expected and needs a workaround to get it to work after the upgrade of PX Operator..")
+					pxOperatorVersionAfterUpgrade, err := ci_utils.GetPXOperatorVersion(opDep)
+					require.NoError(t, err)
+					if pxOperatorCurrentVersion.LessThanOrEqual(ci_utils.PxOperatorVer23_5_1) && pxOperatorVersionAfterUpgrade.GreaterThanOrEqual(ci_utils.PxOperatorVer23_5_1) {
+						logrus.Warnf("PX Operator upgraded from [%s] to [%s], before upgrade version was less than 23.5.1, will need to delete PX pods due to known Telemetry port issue, will perform workaround..", pxOperatorCurrentVersion.String(), pxOperatorVersionAfterUpgrade.String())
+						// If error is Telemetry related, bounce PX pods
+						logrus.Info("Deleting portworx pods..")
+						err = coreops.Instance().DeletePodsByLabels(cluster.Namespace, map[string]string{"name": "portworx"}, 120*time.Second)
+						require.NoError(t, err)
+
+						// Validate StorageCluster again
+						logrus.Info("Re-validating PX components..")
+						err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
+						require.NoError(t, err)
+					} else {
+						logrus.Warnf("Previous PX Operator version before upgrade [%s] should have not caused Telemetry issue when upgrading to PX Operator [%s]", pxOperatorCurrentVersion.String(), pxOperatorVersionAfterUpgrade.String())
+						require.NoError(t, fmt.Errorf("Telemetry error is not expected here, Err: %v", actualTelemetryErr))
+					}
+				} else {
+					// Any validation failure that is not Telemetry related must still fail the test
+					require.NoError(t, actualTelemetryErr)
+				}
+			}
 		}
 
 		// Delete and validate StorageCluster deletion
diff --git a/test/integration_test/utils/px_operator.go b/test/integration_test/utils/px_operator.go
index 29d77576b..246f429ab 100644
--- a/test/integration_test/utils/px_operator.go
+++ b/test/integration_test/utils/px_operator.go
@@ -28,6 +28,10 @@ var (
 	PxOperatorVer1_8_1, _ = version.NewVersion("1.8.1-")
 	// PxOperatorVer23_3 portworx-operator 23.3 minimum version
 	PxOperatorVer23_3, _ = version.NewVersion("23.3-")
+	// PxOperatorVer23_5_0 portworx-operator 23.5.0 version to check for known Telemetry port issue
+	PxOperatorVer23_5_0, _ = version.NewVersion("23.5.0")
+	// PxOperatorVer23_5_1 portworx-operator 23.5.1 version
+	PxOperatorVer23_5_1, _ = version.NewVersion("23.5.1")
 	// PxOperatorVer23_8 portworx-operator 23.8 minimum version
 	PxOperatorVer23_8, _ = version.NewVersion("23.8-")
 )