kubeadm etcd modifying recovery steps #56500

Merged
merged 1 commit into from Nov 29, 2017
cmd/kubeadm/app/cmd/upgrade/apply.go (2 changes: 1 addition & 1 deletion)
@@ -64,7 +64,7 @@ func NewCmdApply(parentFlags *cmdUpgradeFlags) *cobra.Command {
flags := &applyFlags{
parent: parentFlags,
imagePullTimeout: 15 * time.Minute,
etcdUpgrade: false,
etcdUpgrade: true,
}

cmd := &cobra.Command{
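(For context on the apply.go change above: the etcdUpgrade field is the default behind the command's etcd upgrade flag, so flipping it to true makes the etcd upgrade opt-out rather than opt-in. Below is a minimal, self-contained sketch of how such a default is typically surfaced as a cobra flag; the flag name --etcd-upgrade, the help text, and the command wiring here are assumptions for illustration, not code copied from kubeadm.)

// Sketch only: registering a boolean default like etcdUpgrade as a cobra flag.
package main

import (
	"fmt"

	"github.com/spf13/cobra"
)

type applyFlags struct {
	etcdUpgrade bool
}

func newCmdApply() *cobra.Command {
	flags := &applyFlags{
		etcdUpgrade: true, // the new default from this PR: upgrade etcd unless asked not to
	}
	cmd := &cobra.Command{
		Use: "apply",
		Run: func(cmd *cobra.Command, args []string) {
			fmt.Printf("etcd upgrade enabled: %v\n", flags.etcdUpgrade)
		},
	}
	// Hypothetical flag name; a user would opt out with --etcd-upgrade=false.
	cmd.Flags().BoolVar(&flags.etcdUpgrade, "etcd-upgrade", flags.etcdUpgrade, "Perform the upgrade of etcd")
	return cmd
}

func main() {
	if err := newCmdApply().Execute(); err != nil {
		fmt.Println(err)
	}
}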
cmd/kubeadm/app/phases/upgrade/staticpods.go (71 changes: 60 additions & 11 deletions)
@@ -127,6 +127,12 @@ func (spm *KubeStaticPodPathManager) BackupEtcdDir() string {
}

func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticPodPathManager, cfg *kubeadmapi.MasterConfiguration, beforePodHash string, recoverManifests map[string]string) error {
Member:
nit: change the cfg parameter to just nodeName string, as that is the only thing used from cfg.
I'd prefer it if you parameterized the rollback function to call here instead of baking in this recoverEtcd logic... can you do that please?
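(Purely to illustrate the suggestion above, a minimal sketch of what parameterizing the rollback could look like, written as if it lived in this same package; the rollbackFn type and etcdRollback helper are hypothetical names, not part of this PR, while rollbackOldManifests and StaticPodPathManager are taken from the code below.)

// Hypothetical sketch: the caller injects the rollback behaviour, so
// upgradeComponent itself never has to special-case etcd.
type rollbackFn func(recoverManifests map[string]string, origErr error, pathMgr StaticPodPathManager) error

// etcdRollback reuses the PR's rollbackOldManifests, asking it to restore the etcd manifest too.
func etcdRollback(recoverManifests map[string]string, origErr error, pathMgr StaticPodPathManager) error {
	return rollbackOldManifests(recoverManifests, origErr, pathMgr, true)
}

// Inside upgradeComponent, every failure path would then call the injected function, e.g.:
//
//	if err := pathMgr.MoveFile(currentManifestPath, backupManifestPath); err != nil {
//		return rollback(recoverManifests, err, pathMgr)
//	}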

Contributor Author:
Here is the thing: if a separate function is used for the etcd manifest rollback, then:

  1. We just end up duplicating the code.
  2. upgradeComponent would need to be changed, and checked every time a rollback is called, to decide which rollback to invoke, normal or etcd-specific. I think that would bring more confusion than the gain from separating them. Please let me know if you still want me to change it.

Member:
ok, no need to change it

// Special treatment is required for the etcd case: rollbackOldManifests should roll back the
// etcd manifest only when the component being upgraded is etcd
recoverEtcd := false
if component == constants.Etcd {
recoverEtcd = true
}
// The old manifest is here; in the /etc/kubernetes/manifests/
currentManifestPath := pathMgr.RealManifestPath(component)
// The new, upgraded manifest will be written here
@@ -140,12 +146,12 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP

// Move the old manifest into the old-manifests directory
if err := pathMgr.MoveFile(currentManifestPath, backupManifestPath); err != nil {
return rollbackOldManifests(recoverManifests, err, pathMgr)
return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
}

// Move the new manifest into the manifests directory
if err := pathMgr.MoveFile(newManifestPath, currentManifestPath); err != nil {
return rollbackOldManifests(recoverManifests, err, pathMgr)
return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
}

fmt.Printf("[upgrade/staticpods] Moved upgraded manifest to %q and backed up old manifest to %q\n", currentManifestPath, backupManifestPath)
@@ -156,12 +162,12 @@ func upgradeComponent(component string, waiter apiclient.Waiter, pathMgr StaticP
// If we don't do this, there is a case where we remove the Static Pod manifest, kubelet is slow to react, kubeadm checks the
// API endpoint below of the OLD Static Pod component and proceeds quickly enough, which might lead to unexpected results.
if err := waiter.WaitForStaticPodControlPlaneHashChange(cfg.NodeName, component, beforePodHash); err != nil {
return rollbackOldManifests(recoverManifests, err, pathMgr)
return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
}

// Wait for the static pod component to come up and register itself as a mirror pod
if err := waiter.WaitForPodsWithLabel("component=" + component); err != nil {
return rollbackOldManifests(recoverManifests, err, pathMgr)
return rollbackOldManifests(recoverManifests, err, pathMgr, recoverEtcd)
}

fmt.Printf("[upgrade/staticpods] Component %q upgraded successfully!\n", component)
@@ -212,20 +218,59 @@ func performEtcdStaticPodUpgrade(waiter apiclient.Waiter, pathMgr StaticPodPathM
return true, fmt.Errorf("fail to get etcd pod's hash: %v", err)
}

// Write the updated etcd static Pod manifest into the temporary directory
// Write the updated etcd static Pod manifest into the temporary directory; at this point no etcd change
// has occurred in any aspect.
if err := etcdphase.CreateLocalEtcdStaticPodManifestFile(pathMgr.TempManifestDir(), cfg); err != nil {
return true, rollbackEtcdData(cfg, fmt.Errorf("error creating local etcd static pod manifest file: %v", err), pathMgr)
return true, fmt.Errorf("error creating local etcd static pod manifest file: %v", err)
}

// Perform the etcd upgrade using the function that is common to all control plane components
if err := upgradeComponent(constants.Etcd, waiter, pathMgr, cfg, beforeEtcdPodHash, recoverManifests); err != nil {
return true, rollbackEtcdData(cfg, err, pathMgr)
// Since the etcd component upgrade failed, the old manifest has been restored;
// now we need to check the health of the etcd cluster to see whether it came back up with the old manifest
if _, err := etcdCluster.GetEtcdClusterStatus(); err != nil {
// At this point we know that the etcd cluster is dead and it is safe to copy the backup data store and to roll back the old etcd manifest
if err := rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr); err != nil {
// Even copying back the data store failed; no options for recovery left, bailing out
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
// Old datastore has been copied, rolling back old manifests
if err := rollbackOldManifests(recoverManifests, err, pathMgr, true); err != nil {
// Rolling back to old manifests failed, no options for recovery left, bailing out
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
// Since the rollback of the old etcd manifest was successful, check the status of the etcd cluster again
if _, err := etcdCluster.GetEtcdClusterStatus(); err != nil {
// Nothing else left to try to recover etcd cluster
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}

return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, rolled the state back to pre-upgrade state", err)
}
// The etcd cluster came back up with the old manifest, so only the upgrade failure itself needs to be reported
return true, fmt.Errorf("fatal error when trying to upgrade the etcd cluster: %v, rolled the state back to pre-upgrade state", err)
}

// Checking health state of etcd after the upgrade
etcdStatus, err = etcdCluster.GetEtcdClusterStatus()
if err != nil {
return true, rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr)
if _, err = etcdCluster.GetEtcdClusterStatus(); err != nil {
// Despite the fact that upgradeComponent was successful, there is something wrong with the etcd cluster
// The first step is to restore the backup of the data store
if err := rollbackEtcdData(cfg, fmt.Errorf("etcd cluster is not healthy after upgrade: %v rolling back", err), pathMgr); err != nil {
// Even copying back the data store failed; no options for recovery left, bailing out
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
// Old datastore has been copied, rolling back old manifests
if err := rollbackOldManifests(recoverManifests, err, pathMgr, true); err != nil {
// Rolling back to old manifests failed, no options for recovery left, bailing out
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}
// Since the rollback of the old etcd manifest was successful, check the status of the etcd cluster again
if _, err := etcdCluster.GetEtcdClusterStatus(); err != nil {
// Nothing else left to try to recover etcd cluster
return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
}

return true, fmt.Errorf("fatal error upgrading local etcd cluster: %v, rolled the state back to pre-upgrade state", err)
}

return false, nil
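(The two recovery paths above follow the same ladder: restore the backed-up etcd data directory, restore the backed-up manifests including etcd, then re-check cluster health, bailing out with a pointer to the backup directory if any step fails. Purely as a restatement of that sequence, here is a sketch with the steps pulled into one helper; the helper name, the etcdStatusChecker stand-in, and the exact wiring are assumptions, not code from this PR.)

// etcdStatusChecker stands in for the etcdCluster value used above; only the
// single method called in this file is assumed here.
type etcdStatusChecker interface {
	GetEtcdClusterStatus() (interface{}, error)
}

// recoverEtcdToPreUpgradeState is a hypothetical helper mirroring the recovery
// steps performed twice in performEtcdStaticPodUpgrade.
func recoverEtcdToPreUpgradeState(cfg *kubeadmapi.MasterConfiguration, pathMgr StaticPodPathManager, etcdCluster etcdStatusChecker, recoverManifests map[string]string, backupEtcdDir string, cause error) error {
	fatal := func(err error) error {
		return fmt.Errorf("fatal error upgrading local etcd cluster: %v, the backup of etcd database is stored here:(%s)", err, backupEtcdDir)
	}
	// 1. Copy the backed-up etcd data directory back into place.
	if err := rollbackEtcdData(cfg, cause, pathMgr); err != nil {
		return fatal(err)
	}
	// 2. Restore the backed-up manifests, explicitly including the etcd manifest.
	if err := rollbackOldManifests(recoverManifests, cause, pathMgr, true); err != nil {
		return fatal(err)
	}
	// 3. Confirm that etcd came back up with the old data and manifest.
	if _, err := etcdCluster.GetEtcdClusterStatus(); err != nil {
		return fatal(err)
	}
	return fmt.Errorf("fatal error upgrading local etcd cluster: %v, rolled the state back to the pre-upgrade state", cause)
}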
@@ -276,9 +321,13 @@ func StaticPodControlPlane(waiter apiclient.Waiter, pathMgr StaticPodPathManager
}

// rollbackOldManifests rolls back the backed-up manifests if something went wrong
func rollbackOldManifests(oldManifests map[string]string, origErr error, pathMgr StaticPodPathManager) error {
func rollbackOldManifests(oldManifests map[string]string, origErr error, pathMgr StaticPodPathManager, restoreEtcd bool) error {
errs := []error{origErr}
for component, backupPath := range oldManifests {
// Will restore the etcd manifest only if it was explicitly requested by setting restoreEtcd to true
if component == constants.Etcd && !restoreEtcd {
continue
}
// Where we should put back the backed up manifest
realManifestPath := pathMgr.RealManifestPath(component)
