Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release/v2.25] Applications: discover if a release is stuck and rollback if necessary #13332

29 changes: 29 additions & 0 deletions pkg/applications/fake/fake_installer.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@ func (a *ApplicationInstallerRecorder) Delete(ctx context.Context, log *zap.Suga
return util.NoStatusUpdate, nil
}

// IsStuck is a no-op on the recorder: it always reports "not stuck" and never returns an error.
func (a *ApplicationInstallerRecorder) IsStuck(
	ctx context.Context,
	log *zap.SugaredLogger,
	seedClient, userClient ctrlruntimeclient.Client,
	applicationInstallation *appskubermaticv1.ApplicationInstallation,
) (bool, error) {
	// Nothing is inspected here; none of the arguments are used.
	return false, nil
}

// Rollback is a no-op on the recorder: it performs no action and always returns nil.
func (a *ApplicationInstallerRecorder) Rollback(
	ctx context.Context,
	log *zap.SugaredLogger,
	seedClient, userClient ctrlruntimeclient.Client,
	applicationInstallation *appskubermaticv1.ApplicationInstallation,
) error {
	return nil
}

// ApplicationInstallerLogger is a fake ApplicationInstaller that just logs actions.
// It is used for the development of the controller and carries no state.
type ApplicationInstallerLogger struct{}
Expand All @@ -81,6 +90,16 @@ func (a ApplicationInstallerLogger) Delete(ctx context.Context, log *zap.Sugared
return util.NoStatusUpdate, nil
}

// IsStuck is a no-op on the logging installer: it always reports "not stuck" and never returns an error.
func (a ApplicationInstallerLogger) IsStuck(
	ctx context.Context,
	log *zap.SugaredLogger,
	seedClient, userClient ctrlruntimeclient.Client,
	applicationInstallation *appskubermaticv1.ApplicationInstallation,
) (bool, error) {
	// Intentionally does nothing.
	return false, nil
}

// Rollback is a no-op on the logging installer: it performs no action and always returns nil.
func (a ApplicationInstallerLogger) Rollback(
	ctx context.Context,
	log *zap.SugaredLogger,
	seedClient, userClient ctrlruntimeclient.Client,
	applicationInstallation *appskubermaticv1.ApplicationInstallation,
) error {
	// Intentionally does nothing.
	return nil
}

// CustomApplicationInstaller is an applicationInstaller in which every function can be independently mocked.
// If a function is not mocked, then default values are returned.
type CustomApplicationInstaller struct {
Expand Down Expand Up @@ -117,3 +136,13 @@ func (c CustomApplicationInstaller) Delete(ctx context.Context, log *zap.Sugared
}
return util.NoStatusUpdate, nil
}

// IsStuck always reports "not stuck" with a nil error; no mock hook is consulted here.
func (c CustomApplicationInstaller) IsStuck(
	ctx context.Context,
	log *zap.SugaredLogger,
	seedClient, userClient ctrlruntimeclient.Client,
	applicationInstallation *appskubermaticv1.ApplicationInstallation,
) (bool, error) {
	// Intentionally does nothing.
	return false, nil
}

// Rollback always returns nil; no mock hook is consulted here.
func (c CustomApplicationInstaller) Rollback(
	ctx context.Context,
	log *zap.SugaredLogger,
	seedClient, userClient ctrlruntimeclient.Client,
	applicationInstallation *appskubermaticv1.ApplicationInstallation,
) error {
	// Intentionally does nothing.
	return nil
}
28 changes: 28 additions & 0 deletions pkg/applications/helmclient/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,34 @@ func (h HelmClient) Uninstall(releaseName string) (*release.UninstallReleaseResp
return uninstallReleaseResponse, err
}

// GetMetadata returns the most recent stored revision of the release named
// releaseName, looked up through our ActionConfig.
//
// It first checks that the cluster is reachable and returns that error as-is
// if it is not. A lookup failure is wrapped with the release name. A release
// record without an Info section is also reported as an error, so callers can
// dereference rel.Info on success.
func (h HelmClient) GetMetadata(releaseName string) (*release.Release, error) {
	if err := h.actionConfig.KubeClient.IsReachable(); err != nil {
		return nil, err
	}

	// Helm revisions start at 1 and the storage looks up the exact revision it
	// is given, so asking for revision 0 never matches a stored release. Last
	// returns the newest revision, which is what Helm's own metadata lookup
	// uses when no revision is specified.
	rel, err := h.actionConfig.Releases.Last(releaseName)
	if err != nil {
		return nil, fmt.Errorf("could not retrieve metadata for release %q: %w", releaseName, err)
	}

	if rel.Info == nil {
		return nil, fmt.Errorf("release metadata for %q does not contain release info", releaseName)
	}

	return rel, nil
}

// Rollback wraps Helm's rollback action so that it runs against our ActionConfig.
//
// The action is created with its default settings and run for releaseName.
// Any failure is returned wrapped with the release name; nil is returned on success.
func (h HelmClient) Rollback(releaseName string) error {
	if err := action.NewRollback(h.actionConfig).Run(releaseName); err != nil {
		// Error strings start lowercase so they read correctly when wrapped by callers.
		return fmt.Errorf("could not rollback release %q: %w", releaseName, err)
	}
	return nil
}

// buildDependencies adds missing repositories and then does a Helm dependency build (i.e. download the chart dependencies
// from repositories into "charts" folder).
func (h HelmClient) buildDependencies(chartLoc string, auth AuthSettings) (*chart.Chart, error) {
Expand Down
26 changes: 26 additions & 0 deletions pkg/applications/installer.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ type ApplicationInstaller interface {

// Delete function uninstalls the application on the user-cluster and returns an error if the uninstallation has failed. StatusUpdater is guaranteed to be non nil. This is idempotent.
Delete(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) (util.StatusUpdater, error)

// IsStuck determines if a release is stuck. Its main purpose is to detect inconsistent behavior in upstream Application libraries.
IsStuck(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error)

// Rollback rolls an Application back to the previous release.
Rollback(ctx context.Context, log *zap.SugaredLogger, seedClient ctrlruntimeclient.Client, userClient ctrlruntimeclient.Client, applicationInstallation *appskubermaticv1.ApplicationInstallation) error
}

// ApplicationManager handles the installation / uninstallation of an Application on the user-cluster.
Expand Down Expand Up @@ -138,3 +144,23 @@ func (a *ApplicationManager) reconcileNamespace(ctx context.Context, log *zap.Su
}
return nil
}

// IsStuck reports whether the release behind applicationInstallation is stuck.
// It builds the template provider for the installation and delegates the check to it;
// a failure to build the provider is returned as a wrapped error with a false result.
func (a *ApplicationManager) IsStuck(
	ctx context.Context,
	log *zap.SugaredLogger,
	seedClient, userClient ctrlruntimeclient.Client,
	applicationInstallation *appskubermaticv1.ApplicationInstallation,
) (bool, error) {
	provider, providerErr := providers.NewTemplateProvider(
		ctx,
		seedClient,
		a.Kubeconfig,
		a.ApplicationCache,
		log,
		applicationInstallation,
		a.SecretNamespace,
	)
	if providerErr != nil {
		return false, fmt.Errorf("failed to initialize template provider: %w", providerErr)
	}

	return provider.IsStuck(applicationInstallation)
}

// Rollback builds the template provider for applicationInstallation and delegates the rollback to it.
// A failure to build the provider is returned as a wrapped error.
func (a *ApplicationManager) Rollback(
	ctx context.Context,
	log *zap.SugaredLogger,
	seedClient, userClient ctrlruntimeclient.Client,
	applicationInstallation *appskubermaticv1.ApplicationInstallation,
) error {
	provider, providerErr := providers.NewTemplateProvider(
		ctx,
		seedClient,
		a.Kubeconfig,
		a.ApplicationCache,
		log,
		applicationInstallation,
		a.SecretNamespace,
	)
	if providerErr != nil {
		return fmt.Errorf("failed to initialize template provider: %w", providerErr)
	}

	return provider.Rollback(applicationInstallation)
}
73 changes: 73 additions & 0 deletions pkg/applications/providers/template/helm.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,76 @@ func getDeployOpts(appDefinition *appskubermaticv1.ApplicationDefinition, appIns
// Fallback to default options.
return helmclient.NewDeployOpts(false, 0, false, false)
}

// IsStuck tries to detect a helm release that is stuck. It works around an unresolved upstream helm issue, see:
// - https://github.com/helm/helm/issues/7476
// - https://github.com/helm/helm/issues/4558
//
// A release is reported as stuck only when all of the following hold:
//   - the Ready condition of the ApplicationInstallation is not "True",
//   - the Ready condition carries the "another operation ... is in progress" message,
//   - the release metadata fetched from the cluster has status "pending-upgrade".
//
// Errors while preparing the helm client or fetching the metadata are returned wrapped, with a false result.
func (h HelmTemplate) IsStuck(applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error) {
	ready := applicationInstallation.Status.Conditions[appskubermaticv1.Ready]

	// A successful release, or one that failed with any other message, is not
	// the stuck case we are looking for; bail out before touching the cluster.
	if ready.Status == "True" || ready.Message != "another operation (install/upgrade/rollback) is in progress" {
		return false, nil
	}

	tmpDir, err := util.CreateHelmTempDir(h.CacheDir)
	if err != nil {
		return false, fmt.Errorf("failed to create helmCacheDir: %w", err)
	}
	defer util.CleanUpHelmTempDir(tmpDir, h.Log)

	targetNamespace := &applicationInstallation.Spec.Namespace.Name
	getter := &genericclioptions.ConfigFlags{
		KubeConfig: &h.Kubeconfig,
		Namespace:  targetNamespace,
	}

	client, err := helmclient.NewClient(h.Ctx, getter, helmclient.NewSettings(tmpDir), *targetNamespace, h.Log)
	if err != nil {
		return false, fmt.Errorf("failed to create helmClient: %w", err)
	}

	// Look at the release metadata as returned by the helm client.
	name := getReleaseName(applicationInstallation)
	meta, err := client.GetMetadata(name)
	if err != nil {
		return false, fmt.Errorf("failed to retrieve metadata for checking if release %q is stuck: %w", name, err)
	}

	// Only a release that is still pending an upgrade counts as stuck.
	return meta.Info.Status == "pending-upgrade", nil
}

// Rollback asks helm to roll back the release that belongs to applicationInstallation.
//
// It creates a temporary helm cache directory (cleaned up on return), builds a helm client
// scoped to the installation's namespace, and runs the client's Rollback for the release name.
// Setup failures are returned wrapped; the rollback result is returned as-is.
func (h HelmTemplate) Rollback(applicationInstallation *appskubermaticv1.ApplicationInstallation) error {
	tmpDir, err := util.CreateHelmTempDir(h.CacheDir)
	if err != nil {
		return fmt.Errorf("failed to create helmCacheDir: %w", err)
	}
	defer util.CleanUpHelmTempDir(tmpDir, h.Log)

	targetNamespace := &applicationInstallation.Spec.Namespace.Name
	getter := &genericclioptions.ConfigFlags{
		KubeConfig: &h.Kubeconfig,
		Namespace:  targetNamespace,
	}

	client, err := helmclient.NewClient(h.Ctx, getter, helmclient.NewSettings(tmpDir), *targetNamespace, h.Log)
	if err != nil {
		return fmt.Errorf("failed to create helmClient: %w", err)
	}

	name := getReleaseName(applicationInstallation)
	return client.Rollback(name)
}
6 changes: 6 additions & 0 deletions pkg/applications/providers/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ type TemplateProvider interface {

// Uninstall the application.
Uninstall(applicationInstallation *appskubermaticv1.ApplicationInstallation) (util.StatusUpdater, error)

// IsStuck checks if a release is stuck.
IsStuck(applicationInstallation *appskubermaticv1.ApplicationInstallation) (bool, error)

// Rollback rolls the Application back to the previous release.
Rollback(applicationInstallation *appskubermaticv1.ApplicationInstallation) error
}

// NewTemplateProvider return the concrete implementation of TemplateProvider according to the templateMethod.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,20 @@ func (r *reconciler) handleInstallation(ctx context.Context, log *zap.SugaredLog
return nil
}

// Because some upstream tools are not completely idempotent, we need a check to make sure a release is not stuck.
// This should be run before we make any changes to the status field, so we can use it in our analysis
stuck, err := r.appInstaller.IsStuck(ctx, log, r.seedClient, r.userClient, appInstallation)
if err != nil {
return fmt.Errorf("failed to check if the previous release is stuck: %w", err)
}
if stuck {
log.Infof("Release for ApplicationInstallation seems to be stuck, attempting rollback now")
if err := r.appInstaller.Rollback(ctx, log, r.seedClient, r.userClient, appInstallation); err != nil {
return fmt.Errorf("failed to rollback release: %w", err)
}
log.Infof("Release for ApplicationInstallation has been rolled back successfully")
}

downloadDest, err := os.MkdirTemp(r.appInstaller.GetAppCache(), appInstallation.Namespace+"-"+appInstallation.Name)
if err != nil {
return fmt.Errorf("failed to create temporary directory where application source will be downloaded: %w", err)
Expand Down