diff --git a/pkg/autohealing/cloudprovider/openstack/provider.go b/pkg/autohealing/cloudprovider/openstack/provider.go
index c731cdedca..1fbe86fd4e 100644
--- a/pkg/autohealing/cloudprovider/openstack/provider.go
+++ b/pkg/autohealing/cloudprovider/openstack/provider.go
@@ -250,6 +250,67 @@ func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID strin
 	return rootVolumeID, err
 }
 
+// firstTimeRepair handles the first-time repair of a node:
+// 1) If the node is in error for the first time, reboot and uncordon it.
+// 2) If the node is not in error for the first time, check whether the last reboot happened within
+//    provider.Config.RebuildDelayAfterReboot. If the node was found broken before but was rebooted longer
+//    ago than that, processed stays false, meaning the node needs to be rebuilt; otherwise it counts as processed.
+// The returned bool indicates whether the node has been processed from a first-time-repair point of view.
+func (provider OpenStackCloudProvider) firstTimeRepair(n healthcheck.NodeInfo, serverID string, firstTimeRebootNodes map[string]healthcheck.NodeInfo) (bool, error) {
+	firstTimeUnhealthy := true
+	for id := range unHealthyNodes {
+		log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
+		if id == serverID {
+			log.Infof("it is NOT the first time that node %s is being repaired", serverID)
+			firstTimeUnhealthy = false
+			break
+		}
+	}
+
+	processed := false
+	if firstTimeUnhealthy {
+		log.Infof("rebooting node %s to repair it", serverID)
+
+		if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
+			// Usually this means the node is already being rebooted
+			log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
+			if strings.Contains(res.Err.Error(), "reboot_started") {
+				// The node has already been restarted
+				firstTimeRebootNodes[serverID] = n
+				processed = true
+			}
+		} else {
+			// Uncordon the node
+			if n.IsWorker {
+				nodeName := n.KubeNode.Name
+				newNode := n.KubeNode.DeepCopy()
+				newNode.Spec.Unschedulable = false
+				if _, err := provider.KubeClient.CoreV1().Nodes().Update(context.TODO(), newNode, metav1.UpdateOptions{}); err != nil {
+					log.Errorf("Failed to uncordon node %s, error: %v", nodeName, err)
+				} else {
+					log.Infof("Node %s is uncordoned", nodeName)
+				}
+			}
+
+			n.RebootAt = time.Now()
+			firstTimeRebootNodes[serverID] = n
+			unHealthyNodes[serverID] = n
+			log.Infof("Node %s has been rebooted at %s", serverID, n.RebootAt)
+		}
+		processed = true
+	} else {
+		// If the node was rebooted within RebuildDelayAfterReboot, ignore it
+		log.Infof("Node %s is unhealthy again; it was rebooted at %s", serverID, unHealthyNodes[serverID].RebootAt)
+		if unHealthyNodes[serverID].RebootAt.After(time.Now().Add(-1 * provider.Config.RebuildDelayAfterReboot)) {
+			log.Infof("Node %s is unhealthy again, but the repair is deferred because it may still be rebooting", serverID)
+			firstTimeRebootNodes[serverID] = n
+			processed = true
+		}
+	}
+
+	return processed, nil
+}
+
 // Repair For master nodes: detach etcd and docker volumes, find the root
 // volume, then shutdown the VM, marks the both the VM and the root
 // volume (heat resource) as "unhealthy" then trigger Heat stack update
@@ -275,6 +336,8 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
 		masters = nodes
 	}
 
+	firstTimeRebootNodes := make(map[string]healthcheck.NodeInfo)
+
 	err := provider.UpdateHealthStatus(masters, workers)
 	if err != nil {
 		return fmt.Errorf("failed to update the helath status of cluster %s, error: %v", clusterName, err)
@@ -295,21 +358,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
 		}
 		serverID := machineID.String()
 
-		var firsttimeUnhealthy = true
-		for id := range unHealthyNodes {
-			log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
-			if id == serverID {
-				firsttimeUnhealthy = false
-				break
-			}
-		}
-
-		if firsttimeUnhealthy == true {
-			unHealthyNodes[serverID] = n
-			log.Infof("rebooting node %s to repair it", serverID)
-			if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
-				log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
-			}
+		if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
+			log.Warningf("Failed to handle first-time repair for node %s, error: %v", serverID, err)
+		} else if processed {
+			log.Infof("Node %s has been processed", serverID)
 			continue
 		}
 
@@ -374,21 +426,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
 		}
 		serverID := machineID.String()
 
-		var firsttimeUnhealthy = true
-		for id := range unHealthyNodes {
-			log.Infof("comparing server ID %s with known broken ID %s", serverID, id)
-			if id == serverID {
-				firsttimeUnhealthy = false
-				break
-			}
-		}
-
-		if firsttimeUnhealthy == true {
-			unHealthyNodes[serverID] = n
-			log.Infof("rebooting node %s to repair it", serverID)
-			if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
-				log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
-			}
+		if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
+			log.Warningf("Failed to handle first-time repair for node %s, error: %v", serverID, err)
+		} else if processed {
+			log.Infof("Node %s has been processed", serverID)
 			continue
 		}
 
@@ -404,8 +445,14 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
 			}
 		}
 
-		if err := provider.waitForServerPoweredOff(serverID, 30*time.Second); err != nil {
+		if err := provider.waitForServerPoweredOff(serverID, 180*time.Second); err != nil {
 			log.Warningf("Failed to shutdown the server %s, error: %v", serverID, err)
+			// If the server has not powered off after 180s, force delete it to avoid the
+			// stack update failure later.
+			res := servers.ForceDelete(provider.Nova, serverID)
+			if res.Err != nil {
+				log.Warningf("Failed to force delete the server %s, error: %v", serverID, res.Err)
+			}
 		}
 
 		log.Infof("Marking Nova VM %s(Heat resource %s) unhealthy for Heat stack %s", serverID, allMapping[serverID].ResourceID, cluster.StackID)
@@ -428,6 +475,11 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
 
 	// Remove the broken nodes from the cluster
 	for _, n := range nodes {
+		serverID := uuid.Parse(n.KubeNode.Status.NodeInfo.MachineID).String()
+		if _, found := firstTimeRebootNodes[serverID]; found {
+			log.Infof("Skip deleting node %s because it was repaired by reboot", serverID)
+			continue
+		}
 		if err := provider.KubeClient.CoreV1().Nodes().Delete(context.TODO(), n.KubeNode.Name, metav1.DeleteOptions{}); err != nil {
 			log.Errorf("Failed to remove the node %s from cluster, error: %v", n.KubeNode.Name, err)
 		}
diff --git a/pkg/autohealing/config/config.go b/pkg/autohealing/config/config.go
index 10c3b5b043..131d827b0f 100644
--- a/pkg/autohealing/config/config.go
+++ b/pkg/autohealing/config/config.go
@@ -56,6 +56,9 @@ type Config struct {
 
 	// (Optional) How long after new node added that the node will be checked for health status. Default: 10m
 	CheckDelayAfterAdd time.Duration `mapstructure:"check-delay-after-add"`
+
+	// (Optional) How long to wait before rebuilding a node after it has been rebooted. Default: 5m
+	RebuildDelayAfterReboot time.Duration `mapstructure:"rebuild-delay-after-reboot"`
 }
 
 type healthCheck struct {
@@ -83,12 +86,13 @@ type kubeConfig struct {
 // NewConfig defines the default values for Config
 func NewConfig() Config {
 	return Config{
-		DryRun:               false,
-		CloudProvider:        "openstack",
-		MonitorInterval:      30 * time.Second,
-		MasterMonitorEnabled: true,
-		WorkerMonitorEnabled: true,
-		LeaderElect:          true,
-		CheckDelayAfterAdd:   10 * time.Minute,
+		DryRun:                  false,
+		CloudProvider:           "openstack",
+		MonitorInterval:         60 * time.Second,
+		MasterMonitorEnabled:    true,
+		WorkerMonitorEnabled:    true,
+		LeaderElect:             true,
+		CheckDelayAfterAdd:      10 * time.Minute,
+		RebuildDelayAfterReboot: 5 * time.Minute,
 	}
 }
diff --git a/pkg/autohealing/controller/controller.go b/pkg/autohealing/controller/controller.go
index d2db36d7a1..32c96e6b77 100644
--- a/pkg/autohealing/controller/controller.go
+++ b/pkg/autohealing/controller/controller.go
@@ -316,6 +316,10 @@ func (c *Controller) repairNodes(unhealthyNodes []healthcheck.NodeInfo) {
 		newNode := node.KubeNode.DeepCopy()
 		newNode.Spec.Unschedulable = true
 
+		// Skip cordoning master nodes
+		if !node.IsWorker {
+			continue
+		}
 		if _, err := c.kubeClient.CoreV1().Nodes().Update(context.TODO(), newNode, metav1.UpdateOptions{}); err != nil {
 			log.Errorf("Failed to cordon node %s, error: %v", nodeName, err)
 		} else {
diff --git a/pkg/autohealing/healthcheck/healthcheck.go b/pkg/autohealing/healthcheck/healthcheck.go
index 7f6202d034..b8ecc565fa 100644
--- a/pkg/autohealing/healthcheck/healthcheck.go
+++ b/pkg/autohealing/healthcheck/healthcheck.go
@@ -17,6 +17,8 @@ limitations under the License.
 package healthcheck
 
 import (
+	"time"
+
 	apiv1 "k8s.io/api/core/v1"
 	log "k8s.io/klog/v2"
 )
@@ -32,6 +34,8 @@ type NodeInfo struct {
 	KubeNode    apiv1.Node
 	IsWorker    bool
 	FailedCheck string
+	FoundAt     time.Time
+	RebootAt    time.Time
 }
 
 type HealthCheck interface {
@@ -81,6 +85,7 @@ func CheckNodes(checkers []HealthCheck, nodes []NodeInfo, controller NodeControl
 		for _, checker := range checkers {
 			if !checker.Check(node, controller) {
 				node.FailedCheck = checker.GetName()
+				node.FoundAt = time.Now()
 				unhealthyNodes = append(unhealthyNodes, node)
 				break
 			}