Skip to content

Commit

Permalink
Watch external objects for machine before deleting
Browse files Browse the repository at this point in the history
This fixes a race condition in the machine controller when the
controller is restarted after a machine has been marked for deletion but
before the infra machine or bootstrap config are deleted. In that case,
the controller doesn't have watches on those external objects, so the
reconciliation is not triggered once they are deleted. This means that
Machines stay in Deleting state for the resync period, the default is 10
minutes.
  • Loading branch information
g-gaston committed Feb 19, 2024
1 parent fc43a06 commit a64cd41
Show file tree
Hide file tree
Showing 3 changed files with 241 additions and 35 deletions.
24 changes: 16 additions & 8 deletions internal/controllers/machine/machine_controller.go
Expand Up @@ -333,7 +333,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
if err != nil {
switch err {
case errNoControlPlaneNodes, errLastControlPlaneNode, errNilNodeRef, errClusterIsBeingDeleted, errControlPlaneIsBeingDeleted:
var nodeName = ""
nodeName := ""
if m.Status.NodeRef != nil {
nodeName = m.Status.NodeRef.Name
}
Expand Down Expand Up @@ -427,7 +427,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
}

infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, m)
infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, cluster, m)
if err != nil {
return ctrl.Result{}, err
}
Expand All @@ -436,7 +436,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
return ctrl.Result{}, nil
}

bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, m)
bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, cluster, m)
if err != nil {
return ctrl.Result{}, err
}
Expand Down Expand Up @@ -723,8 +723,8 @@ func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster,
return nil
}

func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, m, m.Spec.Bootstrap.ConfigRef)
func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, m.Spec.Bootstrap.ConfigRef)
if err != nil {
return false, err
}
Expand All @@ -743,8 +743,8 @@ func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.
return false, nil
}

func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, m, &m.Spec.InfrastructureRef)
func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
obj, err := r.reconcileDeleteExternal(ctx, cluster, m, &m.Spec.InfrastructureRef)
if err != nil {
return false, err
}
Expand All @@ -764,7 +764,7 @@ func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clust
}

// reconcileDeleteExternal tries to delete external references.
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
if ref == nil {
return nil, nil
}
Expand All @@ -777,6 +777,14 @@ func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.M
}

if obj != nil {
// reconcileExternal ensures that we set the object's OwnerReferences correctly and watch the object.
// The machine delete logic depends on reconciling the machine when the external objects are deleted.
// This avoids a race condition where the machine is deleted before the external objects are ever reconciled
// by this controller.
if _, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref); err != nil {
return nil, err
}

// Issue a delete request.
if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
return obj, errors.Wrapf(err,
Expand Down
56 changes: 35 additions & 21 deletions internal/controllers/machine/machine_controller_phases.go
Expand Up @@ -43,9 +43,7 @@ import (
"sigs.k8s.io/cluster-api/util/patch"
)

var (
externalReadyWait = 30 * time.Second
)
var externalReadyWait = 30 * time.Second

func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {
originalPhase := m.Status.Phase
Expand Down Expand Up @@ -89,12 +87,44 @@ func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) {

// reconcileExternal handles generic unstructured objects referenced by a Machine.
func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
log := ctrl.LoggerFrom(ctx)

if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil {
return external.ReconcileOutput{}, err
}

result, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref)
if err != nil {
return external.ReconcileOutput{}, err
}
if result.RequeueAfter > 0 || result.Paused {
return result, nil
}

obj := result.Result

// Set failure reason and message, if any.
failureReason, failureMessage, err := external.FailuresFrom(obj)
if err != nil {
return external.ReconcileOutput{}, err
}
if failureReason != "" {
machineStatusError := capierrors.MachineStatusError(failureReason)
m.Status.FailureReason = &machineStatusError
}
if failureMessage != "" {
m.Status.FailureMessage = pointer.String(
fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s",
obj.GroupVersionKind(), obj.GetName(), failureMessage),
)
}

return external.ReconcileOutput{Result: obj}, nil
}

// ensureExternalOwnershipAndWatch ensures that only the Machine owns the external object,
// adds a watch to the external object if one does not already exist and adds the necessary labels.
func (r *Reconciler) ensureExternalOwnershipAndWatch(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) {
log := ctrl.LoggerFrom(ctx)

obj, err := external.Get(ctx, r.UnstructuredCachingClient, ref, m.Namespace)
if err != nil {
if apierrors.IsNotFound(errors.Cause(err)) {
Expand Down Expand Up @@ -146,22 +176,6 @@ func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.C
return external.ReconcileOutput{}, err
}

// Set failure reason and message, if any.
failureReason, failureMessage, err := external.FailuresFrom(obj)
if err != nil {
return external.ReconcileOutput{}, err
}
if failureReason != "" {
machineStatusError := capierrors.MachineStatusError(failureReason)
m.Status.FailureReason = &machineStatusError
}
if failureMessage != "" {
m.Status.FailureMessage = pointer.String(
fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s",
obj.GroupVersionKind(), obj.GetName(), failureMessage),
)
}

return external.ReconcileOutput{Result: obj}, nil
}

Expand Down

0 comments on commit a64cd41

Please sign in to comment.