Skip to content

Commit

Permalink
PTX-19346 Adding workaround for the known Telemetry port issue for specific Operator upgrades
Browse files Browse the repository at this point in the history

Signed-off-by: nikolaypopov <nikolay.popov86@gmail.com>
  • Loading branch information
nikolaypopov committed Aug 4, 2023
1 parent e27dec4 commit 3e9250e
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 13 deletions.
47 changes: 36 additions & 11 deletions pkg/util/test/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -3308,16 +3308,41 @@ func ValidateTelemetry(pxImageList map[string]string, cluster *corev1.StorageClu
}

if pxVersion.GreaterThanOrEqual(minimumPxVersionCCMGO) && opVersion.GreaterThanOrEqual(opVer1_10) {
if shouldTelemetryBeEnabled(cluster) {
return ValidateTelemetryV2Enabled(pxImageList, cluster, timeout, interval)
return ValidateTelemetryV2(pxImageList, cluster, timeout, interval)
}
return ValidateTelemetryV1(pxImageList, cluster, timeout, interval)
}

// ValidateTelemetryV1 validates old version of ccm-java telemetry
func ValidateTelemetryV1(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
logrus.Info("Validating Telemetry (ccm-java)")
if shouldTelemetryBeEnabled(cluster) {
if err := ValidateTelemetryV1Enabled(pxImageList, cluster, timeout, interval); err != nil {
return fmt.Errorf("failed to validate Telemetry enabled, Err: %v", err)
}
return ValidateTelemetryV2Disabled(cluster, timeout, interval)
} else {
if shouldTelemetryBeEnabled(cluster) {
return ValidateTelemetryV1Enabled(pxImageList, cluster, timeout, interval)
return nil
}

if err := ValidateTelemetryV1Disabled(cluster, timeout, interval); err != nil {
return fmt.Errorf("failed to validate Telemetry disabled, Err: %v", err)
}
return nil
}

// ValidateTelemetryV2 validates new version of ccm-go telemetry.
//
// When shouldTelemetryBeEnabled reports true for the cluster, the ccm-go
// components are validated as enabled via ValidateTelemetryV2Enabled; otherwise
// they are validated as disabled via ValidateTelemetryV2Disabled. pxImageList,
// timeout and interval are passed through unchanged to those validators.
//
// Any failure is returned wrapped with a "failed to validate Telemetry ..."
// prefix. The operator upgrade integration tests match on that prefix to decide
// whether the Telemetry port workaround applies, so keep the prefix stable.
func ValidateTelemetryV2(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
	logrus.Info("Validating Telemetry (ccm-go)")
	if shouldTelemetryBeEnabled(cluster) {
		if err := ValidateTelemetryV2Enabled(pxImageList, cluster, timeout, interval); err != nil {
			return fmt.Errorf("failed to validate Telemetry enabled, Err: %v", err)
		}
		// The enabled path ends here; it must not fall through to a
		// ccm-java (V1) disabled check.
		return nil
	}

	if err := ValidateTelemetryV2Disabled(cluster, timeout, interval); err != nil {
		return fmt.Errorf("failed to validate Telemetry disabled, Err: %v", err)
	}
	return nil
}

// shouldTelemetryBeEnabled validates if Telemetry should be auto enabled/disabled by default
Expand Down Expand Up @@ -4041,7 +4066,7 @@ func validatePxTelemetryPhonehomeV2(pxImageList map[string]string, cluster *core
// Validate px-telemetry-phonehome daemonset, pods and container images
logrus.Info("Validate px-telemetry-phonehome daemonset and images")
if err := appops.Instance().ValidateDaemonSet("px-telemetry-phonehome", cluster.Namespace, timeout); err != nil {
return err
return fmt.Errorf("failed to validate [px-telemetry-phonehome] daemonset in [%s] namespace, Err: %v", cluster.Namespace, err)
}

telemetryPhonehomeDs, err := appops.Instance().GetDaemonSet("px-telemetry-phonehome", cluster.Namespace)
Expand Down Expand Up @@ -4094,7 +4119,7 @@ func validatePxTelemetryMetricsCollectorV2(pxImageList map[string]string, cluste
},
}
if err := appops.Instance().ValidateDeployment(metricsCollectorDep, timeout, interval); err != nil {
return err
return fmt.Errorf("failed to validate [%s] deployment in [%s] namespace, Err: %v", metricsCollectorDep.Name, metricsCollectorDep.Namespace, err)
}

pods, err := appops.Instance().GetDeploymentPods(metricsCollectorDep)
Expand Down Expand Up @@ -4142,7 +4167,7 @@ func validatePxTelemetryRegistrationV2(pxImageList map[string]string, cluster *c
},
}
if err := appops.Instance().ValidateDeployment(registrationServiceDep, timeout, interval); err != nil {
return err
return fmt.Errorf("failed to validate [%s] deployment in [%s] namespace, Err: %v", registrationServiceDep.Name, registrationServiceDep.Namespace, err)
}

pods, err := appops.Instance().GetDeploymentPods(registrationServiceDep)
Expand Down Expand Up @@ -4287,7 +4312,7 @@ func ValidateTelemetryV1Enabled(pxImageList map[string]string, cluster *corev1.S
},
}
if err := appops.Instance().ValidateDeployment(&dep, timeout, interval); err != nil {
return nil, true, err
return nil, true, fmt.Errorf("failed to validate [%s] deployment in [%s] namespace, Err: %v", dep.Name, dep.Namespace, err)
}

deployment, err := appops.Instance().GetDeployment(dep.Name, dep.Namespace)
Expand Down
31 changes: 30 additions & 1 deletion test/integration_test/basic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package integrationtest

import (
"fmt"
"strings"
"testing"
"time"

Expand Down Expand Up @@ -447,6 +448,8 @@ func BasicUpgradeOperator(tc *types.TestCase) func(*testing.T) {
require.NoError(t, err)
pxOperatorImage, err := ci_utils.GetPxOperatorImage(pxOperatorDeployment)
require.NoError(t, err)
lastHopVersion, err := ci_utils.GetPXOperatorVersion(pxOperatorDeployment)
require.NoError(t, err)
if len(lastHopImage) == 0 {
// This is initially deployed PX Operator version
lastHopImage = pxOperatorImage
Expand Down Expand Up @@ -483,9 +486,35 @@ func BasicUpgradeOperator(tc *types.TestCase) func(*testing.T) {
require.Equal(t, pxOperatorImage, hopImage)

// Validate StorageCluster
// NOTE: This is a workaround for a known Telemetry port issue, where restart of PX pods is required for them to set port from 9024 to new expected port 9029
telemetryErr := "failed to validate Telemetry"
err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
require.NoError(t, err)
actualTelemetryErr := err

// NOTE: This is a workaround for a known Telemetry port issue, where restart of PX pods is required for them to set port from 9024 to new expected port 9029
if actualTelemetryErr != nil {
if strings.Contains(actualTelemetryErr.Error(), telemetryErr) {
logrus.Warnf("Got Telemetry error: %v", actualTelemetryErr)
logrus.Info("Checking if Telemetry error is expected and needs a workaround to get it to work after the upgrade of PX Operator..")
currentHopVersion, err := ci_utils.GetPXOperatorVersion(pxOperatorDeployment)
require.NoError(t, err)
if lastHopVersion.LessThanOrEqual(ci_utils.PxOperatorVer23_5_1) && currentHopVersion.GreaterThanOrEqual(ci_utils.PxOperatorVer23_5_1) {
logrus.Warnf("PX Operator upgraded from [%s] to [%s], before upgrade PX Operator version was less than 23.5.1, will need to delete PX pods due to known Telemetry port issue, will perform workaround..", lastHopVersion.String(), currentHopVersion.String())
// If error is Telemetry related, bounce PX pods
logrus.Info("Deleting portworx pods..")
err = coreops.Instance().DeletePodsByLabels(cluster.Namespace, map[string]string{"name": "portworx"}, 120*time.Second)
require.NoError(t, err)

// Validate StorageCluster again
logrus.Info("Re-validating PX components..")
err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
require.NoError(t, err)
} else {
logrus.Warnf("Previous PX Operator version before upgrade [%s] should have not caused Telemetry issue when upgrading to PX Operator [%s]", lastHopVersion.String(), currentHopVersion.String())
require.NoError(t, fmt.Errorf("Telemetry error is not expected here, Err: %v", actualTelemetryErr))
}
}
}
lastHopImage = hopImage
}

Expand Down
29 changes: 28 additions & 1 deletion test/integration_test/marketplace_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
testutil "github.com/libopenstorage/operator/pkg/util/test"
"github.com/libopenstorage/operator/test/integration_test/types"
ci_utils "github.com/libopenstorage/operator/test/integration_test/utils"
coreops "github.com/portworx/sched-ops/k8s/core"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -159,8 +160,34 @@ func BasicUpgradeOperatorViaOcpMarketplace(tc *types.TestCase) func(*testing.T)
time.Sleep(15 * time.Second)

// Validate StorageCluster
telemetryErr := "failed to validate Telemetry"
err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
require.NoError(t, err)
actualTelemetryErr := err

// NOTE: This is a workaround for a known Telemetry port issue, where restart of PX pods is required for them to set port from 9024 to new expected port 9029
if actualTelemetryErr != nil {
if strings.Contains(actualTelemetryErr.Error(), telemetryErr) {
logrus.Warnf("Got Telemetry error: %v", actualTelemetryErr)
logrus.Info("Checking if Telemetry error is exected and needs a workaround to get it to work after the upgrade of PX Operator..")
pxOperatorVersionAfterUpgrade, err := ci_utils.GetPXOperatorVersion(opDep)
require.NoError(t, err)
if pxOperatorVersionAfterUpgrade.LessThanOrEqual(ci_utils.PxOperatorVer23_5_1) && pxOperatorCurrentVersion.GreaterThanOrEqual(ci_utils.PxOperatorVer23_5_1) {
logrus.Warnf("PX Operator upgraded from [%s] to [%s], before upgrade version was less than 23.5.1, will need to delete PX pods due to known Telemetry port issue, will perform workaround..", pxOperatorCurrentVersion.String(), pxOperatorVersionAfterUpgrade.String())
// If error is Telemetry related, bounce PX pods
logrus.Info("Deleting portworx pods..")
err = coreops.Instance().DeletePodsByLabels(cluster.Namespace, map[string]string{"name": "portworx"}, 120*time.Second)
require.NoError(t, err)

// Validate StorageCluster again
logrus.Info("Re-validating PX components..")
err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
require.NoError(t, err)
} else {
logrus.Warnf("Previous PX Operator version before upgrade [%s] should have not caused Telemetry issue when upgrading to PX Operator [%s]", pxOperatorCurrentVersion.String(), pxOperatorVersionAfterUpgrade.String())
require.NoError(t, fmt.Errorf("Telemetry error is not expected here, Err: %v", actualTelemetryErr))
}
}
}
}

// Delete and validate StorageCluster deletion
Expand Down
4 changes: 4 additions & 0 deletions test/integration_test/utils/px_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ var (
PxOperatorVer1_8_1, _ = version.NewVersion("1.8.1-")
// PxOperatorVer23_3 portworx-operator 23.3 minimum version
PxOperatorVer23_3, _ = version.NewVersion("23.3-")
// PxOperatorVer23_5_0 portworx-operator 23.5.0 version to check for known Telemetry port issue
PxOperatorVer23_5_0, _ = version.NewVersion("23.5.0")
// PxOperatorVer23_5_1 portworx-operator 23.5.1 version
PxOperatorVer23_5_1, _ = version.NewVersion("23.5.1")
// PxOperatorVer23_8 portworx-operator 23.8 minimum version
PxOperatorVer23_8, _ = version.NewVersion("23.8-")
)
Expand Down

0 comments on commit 3e9250e

Please sign in to comment.