Fix duplicated repair action (#1530) (#1556)
There are several changes in this commit:

1) Fix the duplicated repair actions
2) Change default health check period from 30s to 60s

(cherry picked from commit db60e00)
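The core of change 1) is that a node which turns unhealthy is soft-rebooted once, and only rebuilt if it is still unhealthy after the reboot has had time to take effect. Below is a minimal, self-contained sketch of that decision logic (simplified names, not the actual provider code; the real implementation is firstTimeRepair in provider.go further down):

package main

import (
	"fmt"
	"time"
)

// seen remembers when each unhealthy node was last soft-rebooted,
// mirroring the role of the unHealthyNodes map in the provider.
var seen = map[string]time.Time{}

// shouldRebuild reports whether a node should be rebuilt now, or skipped
// because it was just rebooted and may still be recovering.
func shouldRebuild(serverID string, rebuildDelay time.Duration, now time.Time) bool {
	rebootAt, known := seen[serverID]
	if !known {
		// First time unhealthy: record it and soft-reboot instead of rebuilding.
		seen[serverID] = now
		return false
	}
	// Unhealthy again: rebuild only if the reboot happened longer ago than the delay.
	return now.Sub(rebootAt) > rebuildDelay
}

func main() {
	t0 := time.Now()
	fmt.Println(shouldRebuild("node-a", 5*time.Minute, t0))                    // false: first offence, reboot it
	fmt.Println(shouldRebuild("node-a", 5*time.Minute, t0.Add(2*time.Minute))) // false: reboot may still be in progress
	fmt.Println(shouldRebuild("node-a", 5*time.Minute, t0.Add(10*time.Minute))) // true: reboot did not help, rebuild
}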
openstacker committed Jun 2, 2021
1 parent 0dbca73 commit 2adf3ac
Showing 4 changed files with 103 additions and 38 deletions.
114 changes: 83 additions & 31 deletions pkg/autohealing/cloudprovider/openstack/provider.go
@@ -250,6 +250,67 @@ func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID strin
return rootVolumeID, err
}

// firstTimeRepair handles the first-time repair for a node.
// 1) If it is the first time the node is in error, reboot and uncordon it.
// 2) If it is not the first time, check whether the last reboot happened within provider.Config.RebuildDelayAfterReboot.
// That is, if the node was found broken before but the reboot was longer ago than that delay, the processed
// return value stays false, meaning the node needs to be rebuilt; otherwise the node is considered processed.
// The bool return value indicates whether the node has been handled from a first-time-repair point of view.
func (provider OpenStackCloudProvider) firstTimeRepair(n healthcheck.NodeInfo, serverID string, firstTimeRebootNodes map[string]healthcheck.NodeInfo) (bool, error) {
var firstTimeUnhealthy = true
for id := range unHealthyNodes {
log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
if id == serverID {
log.Infof("it is NOT the first time that node %s is being repaired", serverID)
firstTimeUnhealthy = false
break
}
}

var processed = false
if firstTimeUnhealthy {
log.Infof("rebooting node %s to repair it", serverID)

if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
// Usually it means the node is being rebooted
log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
if strings.Contains(res.Err.Error(), "reboot_started") {
// The node has been restarted
firstTimeRebootNodes[serverID] = n
processed = true
}
} else {
// Uncordon the node
if n.IsWorker {
nodeName := n.KubeNode.Name
newNode := n.KubeNode.DeepCopy()
newNode.Spec.Unschedulable = false
if _, err := provider.KubeClient.CoreV1().Nodes().Update(context.TODO(), newNode, metav1.UpdateOptions{}); err != nil {
log.Errorf("Failed to cordon node %s, error: %v", nodeName, err)
} else {
log.Infof("Node %s is cordoned", nodeName)
}
}

n.RebootAt = time.Now()
firstTimeRebootNodes[serverID] = n
unHealthyNodes[serverID] = n
log.Infof("Node %s has been rebooted at %s", serverID, n.RebootAt)
}
processed = true
} else {
// If the node was rebooted within the last RebuildDelayAfterReboot, ignore it
log.Infof("Node %s is unhealthy again; it was rebooted at %s", serverID, unHealthyNodes[serverID].RebootAt)
if unHealthyNodes[serverID].RebootAt.After(time.Now().Add(-1 * provider.Config.RebuildDelayAfterReboot)) {
log.Infof("Node %s is unhealthy again, but the repair is deferred because it may still be rebooting", serverID)
firstTimeRebootNodes[serverID] = n
processed = true
}
}

return processed, nil
}

// Repair For master nodes: detach etcd and docker volumes, find the root
// volume, then shut down the VM, mark both the VM and the root
// volume (Heat resource) as "unhealthy", then trigger Heat stack update
@@ -275,6 +336,8 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
masters = nodes
}

firstTimeRebootNodes := make(map[string]healthcheck.NodeInfo)

err := provider.UpdateHealthStatus(masters, workers)
if err != nil {
return fmt.Errorf("failed to update the helath status of cluster %s, error: %v", clusterName, err)
@@ -295,21 +358,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
}
serverID := machineID.String()

var firsttimeUnhealthy = true
for id := range unHealthyNodes {
log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
if id == serverID {
firsttimeUnhealthy = false
break
}
}

if firsttimeUnhealthy == true {
unHealthyNodes[serverID] = n
log.Infof("rebooting node %s to repair it", serverID)
if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
}
if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
log.Warningf("Failed to process if the node %s is in first time repair , error: %v", serverID, err)
} else if processed == true {
log.Infof("Node %s has been processed", serverID)
continue
}

@@ -374,21 +426,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
}
serverID := machineID.String()

var firsttimeUnhealthy = true
for id := range unHealthyNodes {
log.Infof("comparing server ID %s with known broken ID %s", serverID, id)
if id == serverID {
firsttimeUnhealthy = false
break
}
}

if firsttimeUnhealthy == true {
unHealthyNodes[serverID] = n
log.Infof("rebooting node %s to repair it", serverID)
if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
}
if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
log.Warningf("Failed to process if the node %s is in first time repair , error: %v", serverID, err)
} else if processed == true {
log.Infof("Node %s has been processed", serverID)
continue
}

@@ -404,8 +445,14 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
}
}

if err := provider.waitForServerPoweredOff(serverID, 30*time.Second); err != nil {
if err := provider.waitForServerPoweredOff(serverID, 180*time.Second); err != nil {
log.Warningf("Failed to shutdown the server %s, error: %v", serverID, err)
// If the server fails to shut down within 180s, force delete it to avoid
// a stack update failure later.
res := servers.ForceDelete(provider.Nova, serverID)
if res.Err != nil {
log.Warningf("Failed to delete the server %s, error: %v", serverID, err)
}
}

log.Infof("Marking Nova VM %s(Heat resource %s) unhealthy for Heat stack %s", serverID, allMapping[serverID].ResourceID, cluster.StackID)
@@ -428,6 +475,11 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro

// Remove the broken nodes from the cluster
for _, n := range nodes {
serverID := uuid.Parse(n.KubeNode.Status.NodeInfo.MachineID).String()
if _, found := firstTimeRebootNodes[serverID]; found {
log.Infof("Skip node delete for %s because it's repaired by reboot", serverID)
continue
}
if err := provider.KubeClient.CoreV1().Nodes().Delete(context.TODO(), n.KubeNode.Name, metav1.DeleteOptions{}); err != nil {
log.Errorf("Failed to remove the node %s from cluster, error: %v", n.KubeNode.Name, err)
}
18 changes: 11 additions & 7 deletions pkg/autohealing/config/config.go
@@ -56,6 +56,9 @@ type Config struct {

// (Optional) How long after new node added that the node will be checked for health status. Default: 10m
CheckDelayAfterAdd time.Duration `mapstructure:"check-delay-after-add"`

// (Optional) How long to wait after a node has been rebooted before rebuilding it. Default: 5m
RebuildDelayAfterReboot time.Duration `mapstructure:"rebuild-delay-after-reboot"`
}

type healthCheck struct {
@@ -83,12 +86,13 @@ type kubeConfig struct {
// NewConfig defines the default values for Config
func NewConfig() Config {
return Config{
DryRun: false,
CloudProvider: "openstack",
MonitorInterval: 30 * time.Second,
MasterMonitorEnabled: true,
WorkerMonitorEnabled: true,
LeaderElect: true,
CheckDelayAfterAdd: 10 * time.Minute,
DryRun: false,
CloudProvider: "openstack",
MonitorInterval: 60 * time.Second,
MasterMonitorEnabled: true,
WorkerMonitorEnabled: true,
LeaderElect: true,
CheckDelayAfterAdd: 10 * time.Minute,
RebuildDelayAfterReboot: 5 * time.Minute,
}
}
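Taken together with the provider change, the new defaults mean health checks run every 60 seconds and a soft-rebooted node gets about five minutes to recover before a rebuild is attempted. A small usage sketch of the new defaults follows (assuming the package import path k8s.io/cloud-provider-openstack/pkg/autohealing/config; verify against your checkout):

package main

import (
	"fmt"
	"time"

	// Assumed import path for the autohealing config package shown above.
	"k8s.io/cloud-provider-openstack/pkg/autohealing/config"
)

func main() {
	cfg := config.NewConfig()

	fmt.Println(cfg.MonitorInterval)         // 1m0s, raised from 30s by this commit
	fmt.Println(cfg.RebuildDelayAfterReboot) // 5m0s, new option added by this commit

	// Operators can widen the window for slow-booting flavors, e.g. via the
	// rebuild-delay-after-reboot setting; shown here as a direct field override.
	cfg.RebuildDelayAfterReboot = 10 * time.Minute
	fmt.Println(cfg.RebuildDelayAfterReboot) // 10m0s
}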
4 changes: 4 additions & 0 deletions pkg/autohealing/controller/controller.go
@@ -316,6 +316,10 @@ func (c *Controller) repairNodes(unhealthyNodes []healthcheck.NodeInfo) {
newNode := node.KubeNode.DeepCopy()
newNode.Spec.Unschedulable = true

// Skip cordoning master nodes
if !node.IsWorker {
continue
}
if _, err := c.kubeClient.CoreV1().Nodes().Update(context.TODO(), newNode, metav1.UpdateOptions{}); err != nil {
log.Errorf("Failed to cordon node %s, error: %v", nodeName, err)
} else {
5 changes: 5 additions & 0 deletions pkg/autohealing/healthcheck/healthcheck.go
@@ -17,6 +17,8 @@ limitations under the License.
package healthcheck

import (
"time"

apiv1 "k8s.io/api/core/v1"
log "k8s.io/klog/v2"
)
@@ -32,6 +34,8 @@ type NodeInfo struct {
KubeNode apiv1.Node
IsWorker bool
FailedCheck string
FoundAt time.Time
RebootAt time.Time
}

type HealthCheck interface {
@@ -81,6 +85,7 @@ func CheckNodes(checkers []HealthCheck, nodes []NodeInfo, controller NodeControl
for _, checker := range checkers {
if !checker.Check(node, controller) {
node.FailedCheck = checker.GetName()
node.FoundAt = time.Now()
unhealthyNodes = append(unhealthyNodes, node)
break
}
