Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PTX-19346 Adding workaround for the known Telemetry port issue for specific Operator upgrades #1179

Merged
merged 2 commits into from
Aug 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 36 additions & 11 deletions pkg/util/test/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -3308,16 +3308,41 @@ func ValidateTelemetry(pxImageList map[string]string, cluster *corev1.StorageClu
}

if pxVersion.GreaterThanOrEqual(minimumPxVersionCCMGO) && opVersion.GreaterThanOrEqual(opVer1_10) {
if shouldTelemetryBeEnabled(cluster) {
return ValidateTelemetryV2Enabled(pxImageList, cluster, timeout, interval)
nikolaypopov marked this conversation as resolved.
Show resolved Hide resolved
return ValidateTelemetryV2(pxImageList, cluster, timeout, interval)
}
return ValidateTelemetryV1(pxImageList, cluster, timeout, interval)
}

// ValidateTelemetryV1 validates old version of ccm-java telemetry
func ValidateTelemetryV1(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
logrus.Info("Validating Telemetry (ccm-java)")
if shouldTelemetryBeEnabled(cluster) {
if err := ValidateTelemetryV1Enabled(pxImageList, cluster, timeout, interval); err != nil {
return fmt.Errorf("failed to validate Telemetry enabled, Err: %v", err)
}
return ValidateTelemetryV2Disabled(cluster, timeout, interval)
} else {
if shouldTelemetryBeEnabled(cluster) {
return ValidateTelemetryV1Enabled(pxImageList, cluster, timeout, interval)
return nil
}

if err := ValidateTelemetryV1Disabled(cluster, timeout, interval); err != nil {
return fmt.Errorf("failed to validate Telemetry disabled, Err: %v", err)
}
return nil
}

// ValidateTelemetryV2 validates the new (ccm-go) version of telemetry.
//
// It logs that ccm-go validation is starting and then branches on
// shouldTelemetryBeEnabled(cluster):
//   - enabled: runs ValidateTelemetryV2Enabled with pxImageList, cluster,
//     timeout and interval; a failure is returned wrapped with the
//     "failed to validate Telemetry enabled" prefix.
//   - otherwise: runs ValidateTelemetryV2Disabled with cluster, timeout and
//     interval; a failure is returned wrapped with the
//     "failed to validate Telemetry disabled" prefix.
//
// The wrapping uses %v, so the underlying error is flattened into the
// message text rather than kept as a wrapped error value.
func ValidateTelemetryV2(pxImageList map[string]string, cluster *corev1.StorageCluster, timeout, interval time.Duration) error {
logrus.Info("Validating Telemetry (ccm-go)")
if shouldTelemetryBeEnabled(cluster) {
if err := ValidateTelemetryV2Enabled(pxImageList, cluster, timeout, interval); err != nil {
return fmt.Errorf("failed to validate Telemetry enabled, Err: %v", err)
}
// NOTE(review): two returns appear back-to-back here, so as written the
// `return nil` is unreachable and the enabled path ends with the result of
// ValidateTelemetryV1Disabled. This text appears to come from a rendered
// diff in which removed and added lines are interleaved, so one of these
// two lines is probably a removed line — confirm which return is intended.
return ValidateTelemetryV1Disabled(cluster, timeout, interval)
return nil
}

// Telemetry is not expected to be enabled: validate the disabled state.
if err := ValidateTelemetryV2Disabled(cluster, timeout, interval); err != nil {
return fmt.Errorf("failed to validate Telemetry disabled, Err: %v", err)
}
return nil
}

// shouldTelemetryBeEnabled validates if Telemetry should be auto enabled/disabled by default
Expand Down Expand Up @@ -4041,7 +4066,7 @@ func validatePxTelemetryPhonehomeV2(pxImageList map[string]string, cluster *core
// Validate px-telemetry-phonehome daemonset, pods and container images
logrus.Info("Validate px-telemetry-phonehome daemonset and images")
if err := appops.Instance().ValidateDaemonSet("px-telemetry-phonehome", cluster.Namespace, timeout); err != nil {
return err
return fmt.Errorf("failed to validate [px-telemetry-phonehome] daemonset in [%s] namespace, Err: %v", cluster.Namespace, err)
}

telemetryPhonehomeDs, err := appops.Instance().GetDaemonSet("px-telemetry-phonehome", cluster.Namespace)
Expand Down Expand Up @@ -4094,7 +4119,7 @@ func validatePxTelemetryMetricsCollectorV2(pxImageList map[string]string, cluste
},
}
if err := appops.Instance().ValidateDeployment(metricsCollectorDep, timeout, interval); err != nil {
return err
return fmt.Errorf("failed to validate [%s] deployment in [%s] namespace, Err: %v", metricsCollectorDep.Name, metricsCollectorDep.Namespace, err)
}

pods, err := appops.Instance().GetDeploymentPods(metricsCollectorDep)
Expand Down Expand Up @@ -4142,7 +4167,7 @@ func validatePxTelemetryRegistrationV2(pxImageList map[string]string, cluster *c
},
}
if err := appops.Instance().ValidateDeployment(registrationServiceDep, timeout, interval); err != nil {
return err
return fmt.Errorf("failed to validate [%s] deployment in [%s] namespace, Err: %v", registrationServiceDep.Name, registrationServiceDep.Namespace, err)
}

pods, err := appops.Instance().GetDeploymentPods(registrationServiceDep)
Expand Down Expand Up @@ -4287,7 +4312,7 @@ func ValidateTelemetryV1Enabled(pxImageList map[string]string, cluster *corev1.S
},
}
if err := appops.Instance().ValidateDeployment(&dep, timeout, interval); err != nil {
return nil, true, err
return nil, true, fmt.Errorf("failed to validate [%s] deployment in [%s] namespace, Err: %v", dep.Name, dep.Namespace, err)
}

deployment, err := appops.Instance().GetDeployment(dep.Name, dep.Namespace)
Expand Down
31 changes: 30 additions & 1 deletion test/integration_test/basic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package integrationtest

import (
"fmt"
"strings"
"testing"
"time"

Expand Down Expand Up @@ -447,6 +448,8 @@ func BasicUpgradeOperator(tc *types.TestCase) func(*testing.T) {
require.NoError(t, err)
pxOperatorImage, err := ci_utils.GetPxOperatorImage(pxOperatorDeployment)
require.NoError(t, err)
lastHopVersion, err := ci_utils.GetPXOperatorVersion(pxOperatorDeployment)
require.NoError(t, err)
if len(lastHopImage) == 0 {
// This is initially deployed PX Operator version
lastHopImage = pxOperatorImage
Expand Down Expand Up @@ -483,9 +486,35 @@ func BasicUpgradeOperator(tc *types.TestCase) func(*testing.T) {
require.Equal(t, pxOperatorImage, hopImage)

// Validate StorageCluster
// NOTE: This is a workaround for a known Telemetry port issue, where restart of PX pods is required for them to set port from 9024 to new expected port 9029
telemetryErr := "failed to validate Telemetry"
err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
require.NoError(t, err)
actualTelemetryErr := err

// NOTE: This is a workaround for a known Telemetry port issue, where restart of PX pods is required for them to set port from 9024 to new expected port 9029
if actualTelemetryErr != nil {
if strings.Contains(actualTelemetryErr.Error(), telemetryErr) {
logrus.Warnf("Got Telemetry error: %v", actualTelemetryErr)
logrus.Info("Checking if Telemetry error is expected and needs a workaround to get it to work after the upgrade of PX Operator..")
currentHopVersion, err := ci_utils.GetPXOperatorVersion(pxOperatorDeployment)
require.NoError(t, err)
if lastHopVersion.LessThanOrEqual(ci_utils.PxOperatorVer23_5_1) && currentHopVersion.GreaterThanOrEqual(ci_utils.PxOperatorVer23_5_1) {
logrus.Warnf("PX Operator upgraded from [%s] to [%s], before upgrade PX Operator version was less than 23.5.1, will need to delete PX pods due to known Telemetry port issue, will perform workaround..", lastHopVersion.String(), currentHopVersion.String())
// If error is Telemetry related, bounce PX pods
logrus.Info("Deleting portworx pods..")
err = coreops.Instance().DeletePodsByLabels(cluster.Namespace, map[string]string{"name": "portworx"}, 120*time.Second)
require.NoError(t, err)

// Validate StorageCluster again
logrus.Info("Re-validating PX components..")
err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
require.NoError(t, err)
} else {
logrus.Warnf("Previous PX Operator version before upgrade [%s] should have not caused Telemetry issue when upgrading to PX Operator [%s]", lastHopVersion.String(), currentHopVersion.String())
require.NoError(t, fmt.Errorf("Telemetry error is not expected here, Err: %v", actualTelemetryErr))
}
}
}
lastHopImage = hopImage
}

Expand Down
29 changes: 28 additions & 1 deletion test/integration_test/marketplace_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
testutil "github.com/libopenstorage/operator/pkg/util/test"
"github.com/libopenstorage/operator/test/integration_test/types"
ci_utils "github.com/libopenstorage/operator/test/integration_test/utils"
coreops "github.com/portworx/sched-ops/k8s/core"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -159,8 +160,34 @@ func BasicUpgradeOperatorViaOcpMarketplace(tc *types.TestCase) func(*testing.T)
time.Sleep(15 * time.Second)

// Validate StorageCluster
telemetryErr := "failed to validate Telemetry"
err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
require.NoError(t, err)
actualTelemetryErr := err

// NOTE: This is a workaround for a known Telemetry port issue, where restart of PX pods is required for them to set port from 9024 to new expected port 9029
if actualTelemetryErr != nil {
if strings.Contains(actualTelemetryErr.Error(), telemetryErr) {
logrus.Warnf("Got Telemetry error: %v", actualTelemetryErr)
logrus.Info("Checking if Telemetry error is exected and needs a workaround to get it to work after the upgrade of PX Operator..")
pxOperatorVersionAfterUpgrade, err := ci_utils.GetPXOperatorVersion(opDep)
require.NoError(t, err)
if pxOperatorVersionAfterUpgrade.LessThanOrEqual(ci_utils.PxOperatorVer23_5_1) && pxOperatorCurrentVersion.GreaterThanOrEqual(ci_utils.PxOperatorVer23_5_1) {
logrus.Warnf("PX Operator upgraded from [%s] to [%s], before upgrade version was less than 23.5.1, will need to delete PX pods due to known Telemetry port issue, will perform workaround..", pxOperatorCurrentVersion.String(), pxOperatorVersionAfterUpgrade.String())
// If error is Telemetry related, bounce PX pods
logrus.Info("Deleting portworx pods..")
err = coreops.Instance().DeletePodsByLabels(cluster.Namespace, map[string]string{"name": "portworx"}, 120*time.Second)
require.NoError(t, err)

// Validate StorageCluster again
logrus.Info("Re-validating PX components..")
err = testutil.ValidateStorageCluster(ci_utils.PxSpecImages, cluster, ci_utils.DefaultValidateDeployTimeout, ci_utils.DefaultValidateDeployRetryInterval, true, "")
require.NoError(t, err)
} else {
logrus.Warnf("Previous PX Operator version before upgrade [%s] should have not caused Telemetry issue when upgrading to PX Operator [%s]", pxOperatorCurrentVersion.String(), pxOperatorVersionAfterUpgrade.String())
require.NoError(t, fmt.Errorf("Telemetry error is not expected here, Err: %v", actualTelemetryErr))
}
}
}
}

// Delete and validate StorageCluster deletion
Expand Down
4 changes: 4 additions & 0 deletions test/integration_test/utils/px_operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ var (
PxOperatorVer1_8_1, _ = version.NewVersion("1.8.1-")
// PxOperatorVer23_3 portworx-operator 23.3 minimum version
PxOperatorVer23_3, _ = version.NewVersion("23.3-")
// PxOperatorVer23_5_0 portworx-operator 23.5.0 version to check for known Telemetry port issue
PxOperatorVer23_5_0, _ = version.NewVersion("23.5.0")
// PxOperatorVer23_5_1 portworx-operator 23.5.1 version
PxOperatorVer23_5_1, _ = version.NewVersion("23.5.1")
// PxOperatorVer23_8 portworx-operator 23.8 minimum version
PxOperatorVer23_8, _ = version.NewVersion("23.8-")
)
Expand Down
Loading