From 9f73c2e27be8c3bcfb7cd80fffd2d26d7f8c13e8 Mon Sep 17 00:00:00 2001 From: Simon Bein Date: Wed, 17 Apr 2024 15:47:01 +0200 Subject: [PATCH] Applications: discover if a release is stuck and rollback if necessary (#13301) * Discover if a release is stuck and rollback if necessary Signed-off-by: Simon Bein * linter + noop stubs Signed-off-by: Simon Bein * add additional logging Signed-off-by: Simon Bein * rename to isStuck Signed-off-by: Simon Bein * remove timeout from message, as we do not explicitly check for it Signed-off-by: Simon Bein * remove printing directive for AppInstall, as we already print it centrally via the logger Signed-off-by: Simon Bein --------- Signed-off-by: Simon Bein --- pkg/applications/fake/fake_installer.go | 29 ++++++++ pkg/applications/helmclient/client.go | 20 +++++ pkg/applications/installer.go | 26 +++++++ pkg/applications/providers/template/helm.go | 73 +++++++++++++++++++ pkg/applications/providers/types.go | 6 ++ .../controller.go | 14 ++++ 6 files changed, 168 insertions(+) diff --git a/pkg/applications/fake/fake_installer.go b/pkg/applications/fake/fake_installer.go index ce7196906f6..01659feab69 100644 --- a/pkg/applications/fake/fake_installer.go +++ b/pkg/applications/fake/fake_installer.go @@ -59,6 +59,15 @@ func (a *ApplicationInstallerRecorder) Delete(ctx context.Context, log *zap.Suga return util.NoStatusUpdate, nil } +func (a *ApplicationInstallerRecorder) IsStuck(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error) { + // NOOP + return false, nil +} + +func (a *ApplicationInstallerRecorder) Rollback(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) error { + return nil +} + // ApplicationInstallerLogger is a fake ApplicationInstaller that just logs actions. it's used for the development of the controller. type ApplicationInstallerLogger struct { } @@ -81,6 +90,16 @@ func (a ApplicationInstallerLogger) Delete(ctx context.Context, log *zap.Sugared return util.NoStatusUpdate, nil } +func (a ApplicationInstallerLogger) IsStuck(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error) { + // NOOP + return false, nil +} + +func (a ApplicationInstallerLogger) Rollback(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) error { + // NOOP + return nil +} + // CustomApplicationInstaller is an applicationInstaller in which every function can be independently mocked. // If a function is not mocked, then default values are returned. type CustomApplicationInstaller struct { @@ -117,3 +136,13 @@ func (c CustomApplicationInstaller) Delete(ctx context.Context, log *zap.Sugared } return util.NoStatusUpdate, nil } + +func (c CustomApplicationInstaller) IsStuck(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error) { + // NOOP + return false, nil +} + +func (c CustomApplicationInstaller) Rollback(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) error { + // NOOP + return nil +} diff --git a/pkg/applications/helmclient/client.go b/pkg/applications/helmclient/client.go index 9a94674f265..dc5bd6361c9 100644 --- a/pkg/applications/helmclient/client.go +++ b/pkg/applications/helmclient/client.go @@ -416,6 +416,26 @@ func (h HelmClient) Uninstall(releaseName string) (*release.UninstallReleaseResp return uninstallReleaseResponse, err } +// GetMetadata wraps helms GetMetadata command to be used with our ActionConfig. +func (h HelmClient) GetMetadata(releaseName string) (*action.Metadata, error) { + client := action.NewGetMetadata(h.actionConfig) + res, err := client.Run(releaseName) + if err != nil { + return nil, fmt.Errorf("Could not retrieve metadata for release %q: %w", releaseName, err) + } + return res, nil +} + +// Rollback wraps helms Rollback command to be used with our ActionConfig. +func (h HelmClient) Rollback(releaseName string) error { + client := action.NewRollback(h.actionConfig) + err := client.Run(releaseName) + if err != nil { + return fmt.Errorf("Could not rollback release %q: %w", releaseName, err) + } + return nil +} + // buildDependencies adds missing repositories and then does a Helm dependency build (i.e. download the chart dependencies // from repositories into "charts" folder). func (h HelmClient) buildDependencies(chartLoc string, auth AuthSettings) (*chart.Chart, error) { diff --git a/pkg/applications/installer.go b/pkg/applications/installer.go index 1b671cbbcbf..52ca93bad1c 100644 --- a/pkg/applications/installer.go +++ b/pkg/applications/installer.go @@ -44,6 +44,12 @@ type ApplicationInstaller interface { // Delete function uninstalls the application on the user-cluster and returns an error if the uninstallation has failed. StatusUpdater is guaranteed to be non nil. This is idempotent. Delete(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) (util.StatusUpdater, error) + + // IsStuck determines if a release is stuck. Its main purpose is to detect inconsistent behavior in upstream Application libraries + IsStuck(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error) + + // Rollback rolls an Application back to the previous release + Rollback(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) error } // ApplicationManager handles the installation / uninstallation of an Application on the user-cluster. @@ -138,3 +144,23 @@ func (a *ApplicationManager) reconcileNamespace(ctx context.Context, log *zap.Su } return nil } + +// IsStuck determines if a release is stuck. Its main purpose is to detect inconsistent behavior in upstream Application libraries. +func (a *ApplicationManager) IsStuck(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error) { + templateProvider, err := providers.NewTemplateProvider(ctx, seedClient, a.Kubeconfig, a.ApplicationCache, log, applicationInstallation, a.SecretNamespace) + if err != nil { + return false, fmt.Errorf("failed to initialize template provider: %w", err) + } + + return templateProvider.IsStuck(applicationInstallation) +} + +// Rollback rolls an Application back to the previous release. +func (a *ApplicationManager) Rollback(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) error { + templateProvider, err := providers.NewTemplateProvider(ctx, seedClient, a.Kubeconfig, a.ApplicationCache, log, applicationInstallation, a.SecretNamespace) + if err != nil { + return fmt.Errorf("failed to initialize template provider: %w", err) + } + + return templateProvider.Rollback(applicationInstallation) +} diff --git a/pkg/applications/providers/template/helm.go b/pkg/applications/providers/template/helm.go index 6a07163b6cf..9d13183928b 100644 --- a/pkg/applications/providers/template/helm.go +++ b/pkg/applications/providers/template/helm.go @@ -208,3 +208,76 @@ func getDeployOpts(appDefinition *appskubermaticv1.ApplicationDefinition, appIns // Fallback to default options. return helmclient.NewDeployOpts(false, 0, false, false) } + +// IsStuck aims to identify if a helm release is stuck. This targets an upstream issue in helm, which has not been resolved. For further details see: +// - https://github.com/helm/helm/issues/7476 +// - https://github.com/helm/helm/issues/4558 +func (h HelmTemplate) IsStuck(applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error) { + // if the release was successful, exit early + if applicationInstallation.Status.Conditions[appskubermaticv1.Ready].Status == "True" { + return false, nil + } + // currently we observe the stuck error exclusively with this message. If it does not exist, exit early + if applicationInstallation.Status.Conditions[appskubermaticv1.Ready].Message != "another operation (install/upgrade/rollback) is in progress" { + return false, nil + } + + helmCacheDir, err := util.CreateHelmTempDir(h.CacheDir) + if err != nil { + return false, fmt.Errorf("failed to create helmCacheDir: %w", err) + } + + defer util.CleanUpHelmTempDir(helmCacheDir, h.Log) + restClientGetter := &genericclioptions.ConfigFlags{ + KubeConfig: &h.Kubeconfig, + Namespace: &applicationInstallation.Spec.Namespace.Name, + } + helmClient, err := helmclient.NewClient( + h.Ctx, + restClientGetter, + helmclient.NewSettings(helmCacheDir), + applicationInstallation.Spec.Namespace.Name, + h.Log) + if err != nil { + return false, fmt.Errorf("failed to create helmClient: %w", err) + } + + // retrieve metadata of the latest release + releaseName := getReleaseName(applicationInstallation) + metadata, err := helmClient.GetMetadata(releaseName) + if err != nil { + return false, fmt.Errorf("failed to retrieve metadata for checking if release %q is stuck: %w", releaseName, err) + } + + // if the status of the release is not still pending, exit early + if metadata.Status != "pending-upgrade" { + return false, nil + } + + return true, nil +} + +// Rollback rolls an Application back to the previous release. +func (h HelmTemplate) Rollback(applicationInstallation *appskubermaticv1.ApplicationInstallation) error { + helmCacheDir, err := util.CreateHelmTempDir(h.CacheDir) + if err != nil { + return fmt.Errorf("failed to create helmCacheDir: %w", err) + } + + defer util.CleanUpHelmTempDir(helmCacheDir, h.Log) + restClientGetter := &genericclioptions.ConfigFlags{ + KubeConfig: &h.Kubeconfig, + Namespace: &applicationInstallation.Spec.Namespace.Name, + } + helmClient, err := helmclient.NewClient( + h.Ctx, + restClientGetter, + helmclient.NewSettings(helmCacheDir), + applicationInstallation.Spec.Namespace.Name, + h.Log) + if err != nil { + return fmt.Errorf("failed to create helmClient: %w", err) + } + + return helmClient.Rollback(getReleaseName(applicationInstallation)) +} diff --git a/pkg/applications/providers/types.go b/pkg/applications/providers/types.go index be7160e3764..72134e16925 100644 --- a/pkg/applications/providers/types.go +++ b/pkg/applications/providers/types.go @@ -59,6 +59,12 @@ type TemplateProvider interface { // Uninstall the application. Uninstall(applicationInstallation *appskubermaticv1.ApplicationInstallation) (util.StatusUpdater, error) + + // Check if a release is stuck + IsStuck(applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error) + + // Rollback the Application to the previous release + Rollback(applicationInstallation *appskubermaticv1.ApplicationInstallation) error } // NewTemplateProvider return the concrete implementation of TemplateProvider according to the templateMethod. diff --git a/pkg/controller/user-cluster-controller-manager/application-installation-controller/controller.go b/pkg/controller/user-cluster-controller-manager/application-installation-controller/controller.go index 72e90236619..6b88d1de63b 100644 --- a/pkg/controller/user-cluster-controller-manager/application-installation-controller/controller.go +++ b/pkg/controller/user-cluster-controller-manager/application-installation-controller/controller.go @@ -242,6 +242,20 @@ func (r *reconciler) handleInstallation(ctx context.Context, log *zap.SugaredLog return nil } + // Because some upstream tools are not completely idempotent, we need a check to make sure a release is not stuck. + // This should be run before we make any changes to the status field, so we can use it in our analysis + stuck, err := r.appInstaller.IsStuck(ctx, log, r.seedClient, r.userClient, appInstallation) + if err != nil { + return fmt.Errorf("failed to check if the previous release is stuck: %w", err) + } + if stuck { + log.Infof("Release for ApplicationInstallation seems to be stuck, attempting rollback now") + if err := r.appInstaller.Rollback(ctx, log, r.seedClient, r.userClient, appInstallation); err != nil { + return fmt.Errorf("failed to rollback release: %w", err) + } + log.Infof("Release for ApplicationInstallation has been rolled back successfully") + } + downloadDest, err := os.MkdirTemp(r.appInstaller.GetAppCache(), appInstallation.Namespace+"-"+appInstallation.Name) if err != nil { return fmt.Errorf("failed to create temporary directory where application source will be downloaded: %w", err)