diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index 31663a5de504..47c5b4002785 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -333,7 +333,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu if err != nil { switch err { case errNoControlPlaneNodes, errLastControlPlaneNode, errNilNodeRef, errClusterIsBeingDeleted, errControlPlaneIsBeingDeleted: - var nodeName = "" + nodeName := "" if m.Status.NodeRef != nil { nodeName = m.Status.NodeRef.Name } @@ -427,7 +427,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine") } - infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, m) + infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, cluster, m) if err != nil { return ctrl.Result{}, err } @@ -436,7 +436,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu return ctrl.Result{}, nil } - bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, m) + bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, cluster, m) if err != nil { return ctrl.Result{}, err } @@ -723,8 +723,8 @@ func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster, return nil } -func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.Machine) (bool, error) { - obj, err := r.reconcileDeleteExternal(ctx, m, m.Spec.Bootstrap.ConfigRef) +func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) { + obj, err := r.reconcileDeleteExternal(ctx, cluster, m, m.Spec.Bootstrap.ConfigRef) if err != nil { return false, err } @@ -743,8 +743,8 @@ func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1. 
return false, nil } -func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clusterv1.Machine) (bool, error) { - obj, err := r.reconcileDeleteExternal(ctx, m, &m.Spec.InfrastructureRef) +func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) { + obj, err := r.reconcileDeleteExternal(ctx, cluster, m, &m.Spec.InfrastructureRef) if err != nil { return false, err } @@ -764,7 +764,7 @@ func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clust } // reconcileDeleteExternal tries to delete external references. -func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) { +func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) { if ref == nil { return nil, nil } @@ -777,6 +777,14 @@ func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.M } if obj != nil { + // reconcileExternal ensures that we set the object's OwnerReferences correctly and watch the object. + // The machine delete logic depends on reconciling the machine when the external objects are deleted. + // This avoids a race condition where the machine is deleted before the external objects are ever reconciled + // by this controller. + if _, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref); err != nil { + return nil, err + } + // Issue a delete request. 
if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { return obj, errors.Wrapf(err, diff --git a/internal/controllers/machine/machine_controller_phases.go b/internal/controllers/machine/machine_controller_phases.go index 5e34f8904444..8c197b4c1fe6 100644 --- a/internal/controllers/machine/machine_controller_phases.go +++ b/internal/controllers/machine/machine_controller_phases.go @@ -43,9 +43,7 @@ import ( "sigs.k8s.io/cluster-api/util/patch" ) -var ( - externalReadyWait = 30 * time.Second -) +var externalReadyWait = 30 * time.Second func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) { originalPhase := m.Status.Phase @@ -89,12 +87,44 @@ func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) { // reconcileExternal handles generic unstructured objects referenced by a Machine. func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) { - log := ctrl.LoggerFrom(ctx) - if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil { return external.ReconcileOutput{}, err } + result, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref) + if err != nil { + return external.ReconcileOutput{}, err + } + if result.RequeueAfter > 0 || result.Paused { + return result, nil + } + + obj := result.Result + + // Set failure reason and message, if any. 
+ failureReason, failureMessage, err := external.FailuresFrom(obj) + if err != nil { + return external.ReconcileOutput{}, err + } + if failureReason != "" { + machineStatusError := capierrors.MachineStatusError(failureReason) + m.Status.FailureReason = &machineStatusError + } + if failureMessage != "" { + m.Status.FailureMessage = pointer.String( + fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s", + obj.GroupVersionKind(), obj.GetName(), failureMessage), + ) + } + + return external.ReconcileOutput{Result: obj}, nil +} + +// ensureExternalOwnershipAndWatch ensures that only the Machine owns the external object, +// adds a watch to the external object if one does not already exist and adds the necessary labels. +func (r *Reconciler) ensureExternalOwnershipAndWatch(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) { + log := ctrl.LoggerFrom(ctx) + obj, err := external.Get(ctx, r.UnstructuredCachingClient, ref, m.Namespace) if err != nil { if apierrors.IsNotFound(errors.Cause(err)) { @@ -146,22 +176,6 @@ func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.C return external.ReconcileOutput{}, err } - // Set failure reason and message, if any. 
- failureReason, failureMessage, err := external.FailuresFrom(obj) - if err != nil { - return external.ReconcileOutput{}, err - } - if failureReason != "" { - machineStatusError := capierrors.MachineStatusError(failureReason) - m.Status.FailureReason = &machineStatusError - } - if failureMessage != "" { - m.Status.FailureMessage = pointer.String( - fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s", - obj.GroupVersionKind(), obj.GetName(), failureMessage), - ) - } - return external.ReconcileOutput{Result: obj}, nil } diff --git a/internal/controllers/machine/machine_controller_test.go b/internal/controllers/machine/machine_controller_test.go index 62ecfcba8245..01a128cdc8f6 100644 --- a/internal/controllers/machine/machine_controller_test.go +++ b/internal/controllers/machine/machine_controller_test.go @@ -35,6 +35,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -150,7 +151,8 @@ func TestWatches(t *testing.T) { Kind: "GenericBootstrapConfig", Name: "bootstrap-config-machinereconcile", }, - }}, + }, + }, } g.Expect(env.Create(ctx, machine)).To(Succeed()) @@ -183,6 +185,185 @@ func TestWatches(t *testing.T) { }, timeout).Should(BeTrue()) } +func TestWatchesDelete(t *testing.T) { + g := NewWithT(t) + ns, err := env.CreateNamespace(ctx, "test-machine-watches-delete") + g.Expect(err).ToNot(HaveOccurred()) + + infraMachine := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "kind": "GenericInfrastructureMachine", + "apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1", + "metadata": map[string]interface{}{ + "name": "infra-config1", + "namespace": ns.Name, + }, + "spec": map[string]interface{}{ + "providerID": "test://id-1", + }, + "status": map[string]interface{}{ + 
"ready": true, + "addresses": []interface{}{ + map[string]interface{}{ + "type": "InternalIP", + "address": "10.0.0.1", + }, + }, + }, + }, + } + infraMachineFinalizer := "test.infrastructure.cluster.x-k8s.io" + controllerutil.AddFinalizer(infraMachine, infraMachineFinalizer) + + defaultBootstrap := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "kind": "GenericBootstrapConfig", + "apiVersion": "bootstrap.cluster.x-k8s.io/v1beta1", + "metadata": map[string]interface{}{ + "name": "bootstrap-config-machinereconcile", + "namespace": ns.Name, + }, + "spec": map[string]interface{}{}, + "status": map[string]interface{}{}, + }, + } + bootstrapFinalizer := "test.bootstrap.cluster.x-k8s.io" + controllerutil.AddFinalizer(defaultBootstrap, bootstrapFinalizer) + + testCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "machine-reconcile-", + Namespace: ns.Name, + }, + Spec: clusterv1.ClusterSpec{ + // we create the cluster in paused state so we don't reconcile + // the machine immediately after creation. + // This avoids going through reconcileExternal, which adds watches + // for the provider machine and the bootstrap config objects. 
+ Paused: true, + }, + } + + g.Expect(env.Create(ctx, testCluster)).To(Succeed()) + g.Expect(env.CreateKubeconfigSecret(ctx, testCluster)).To(Succeed()) + g.Expect(env.Create(ctx, defaultBootstrap)).To(Succeed()) + g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) + + defer func(do ...client.Object) { + g.Expect(env.Cleanup(ctx, do...)).To(Succeed()) + }(ns, testCluster, defaultBootstrap) + + // Patch infra machine ready + patchHelper, err := patch.NewHelper(infraMachine, env) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(unstructured.SetNestedField(infraMachine.Object, true, "status", "ready")).To(Succeed()) + g.Expect(patchHelper.Patch(ctx, infraMachine, patch.WithStatusObservedGeneration{})).To(Succeed()) + + // Patch bootstrap ready + patchHelper, err = patch.NewHelper(defaultBootstrap, env) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(unstructured.SetNestedField(defaultBootstrap.Object, true, "status", "ready")).To(Succeed()) + g.Expect(unstructured.SetNestedField(defaultBootstrap.Object, "secretData", "status", "dataSecretName")).To(Succeed()) + g.Expect(patchHelper.Patch(ctx, defaultBootstrap, patch.WithStatusObservedGeneration{})).To(Succeed()) + + machine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "machine-created-", + Namespace: ns.Name, + Labels: map[string]string{ + clusterv1.MachineControlPlaneLabel: "", + }, + }, + Spec: clusterv1.MachineSpec{ + ClusterName: testCluster.Name, + InfrastructureRef: corev1.ObjectReference{ + APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", + Kind: "GenericInfrastructureMachine", + Name: "infra-config1", + }, + Bootstrap: clusterv1.Bootstrap{ + ConfigRef: &corev1.ObjectReference{ + APIVersion: "bootstrap.cluster.x-k8s.io/v1beta1", + Kind: "GenericBootstrapConfig", + Name: "bootstrap-config-machinereconcile", + }, + }, + }, + } + // We create the machine with a finalizer so the machine is not deleted immediately. 
+ controllerutil.AddFinalizer(machine, clusterv1.MachineFinalizer) + + g.Expect(env.Create(ctx, machine)).To(Succeed()) + defer func() { + g.Expect(env.Cleanup(ctx, machine)).To(Succeed()) + }() + + // We mark the machine for deletion + g.Expect(env.Delete(ctx, machine)).To(Succeed()) + + // We unpause the cluster so the machine can be reconciled. + testCluster.Spec.Paused = false + g.Expect(env.Update(ctx, testCluster)).To(Succeed()) + + // Wait for reconciliation to happen. + // The first reconciliation should add the cluster name label. + key := client.ObjectKey{Name: machine.Name, Namespace: machine.Namespace} + g.Eventually(func() bool { + if err := env.Get(ctx, key, machine); err != nil { + return false + } + return machine.Labels[clusterv1.ClusterNameLabel] == testCluster.Name + }, timeout).Should(BeTrue()) + + // Deleting the machine should mark the infra machine for deletion + infraMachineKey := client.ObjectKey{Name: infraMachine.GetName(), Namespace: infraMachine.GetNamespace()} + g.Eventually(func() bool { + if err := env.Get(ctx, infraMachineKey, infraMachine); err != nil { + return false + } + return infraMachine.GetDeletionTimestamp() != nil + }, timeout).Should(BeTrue(), "infra machine should be marked for deletion") + + // We wait a bit and remove the finalizer, simulating the infra machine controller. 
+ time.Sleep(2 * time.Second) + infraMachine.SetFinalizers([]string{}) + g.Expect(env.Update(ctx, infraMachine)).To(Succeed()) + + // This should delete the infra machine + g.Eventually(func() bool { + err := env.Get(ctx, infraMachineKey, infraMachine) + return apierrors.IsNotFound(err) + }, timeout).Should(BeTrue(), "infra machine should be deleted") + + // If the watch on infra machine works, deleting of the infra machine will trigger another + // reconcile, which will mark the bootstrap config for deletion + bootstrapKey := client.ObjectKey{Name: defaultBootstrap.GetName(), Namespace: defaultBootstrap.GetNamespace()} + g.Eventually(func() bool { + if err := env.Get(ctx, bootstrapKey, defaultBootstrap); err != nil { + return false + } + return defaultBootstrap.GetDeletionTimestamp() != nil + }, timeout).Should(BeTrue(), "bootstrap config should be marked for deletion") + + // We wait a bit and remove the finalizer, simulating the bootstrap config controller. + time.Sleep(2 * time.Second) + defaultBootstrap.SetFinalizers([]string{}) + g.Expect(env.Update(ctx, defaultBootstrap)).To(Succeed()) + + // This should delete the bootstrap config. 
+ g.Eventually(func() bool { + err := env.Get(ctx, bootstrapKey, defaultBootstrap) + return apierrors.IsNotFound(err) + }, timeout).Should(BeTrue(), "bootstrap config should be deleted") + + // If the watch on bootstrap config works, the deleting of the bootstrap config will trigger another + // reconcile, which will remove the finalizer and delete the machine + g.Eventually(func() bool { + err := env.Get(ctx, key, machine) + return apierrors.IsNotFound(err) + }, timeout).Should(BeTrue(), "machine should be deleted") +} + func TestMachine_Reconcile(t *testing.T) { g := NewWithT(t) @@ -250,7 +431,8 @@ func TestMachine_Reconcile(t *testing.T) { Kind: "GenericBootstrapConfig", Name: "bootstrap-config-machinereconcile", }, - }}, + }, + }, Status: clusterv1.MachineStatus{ NodeRef: &corev1.ObjectReference{ Name: "test", @@ -1076,13 +1258,13 @@ func TestReconcileDeleteExternal(t *testing.T) { UnstructuredCachingClient: c, } - obj, err := r.reconcileDeleteExternal(ctx, machine, machine.Spec.Bootstrap.ConfigRef) - g.Expect(obj).To(BeComparableTo(tc.expected)) + obj, err := r.reconcileDeleteExternal(ctx, testCluster, machine, machine.Spec.Bootstrap.ConfigRef) if tc.expectError { g.Expect(err).To(HaveOccurred()) } else { g.Expect(err).ToNot(HaveOccurred()) } + g.Expect(obj).To(BeComparableTo(tc.expected)) }) } } @@ -1889,7 +2071,8 @@ func TestNodeToMachine(t *testing.T) { Kind: "GenericBootstrapConfig", Name: "bootstrap-config-machinereconcile", }, - }}, + }, + }, } g.Expect(env.Create(ctx, expectedMachine)).To(Succeed()) @@ -1928,7 +2111,8 @@ func TestNodeToMachine(t *testing.T) { Kind: "GenericBootstrapConfig", Name: "bootstrap-config-machinereconcile", }, - }}, + }, + }, } g.Expect(env.Create(ctx, randomMachine)).To(Succeed())