Add Machine and KCP conditions to KCP controller
Sedef committed Sep 25, 2020
1 parent 0cf9f80 commit e7226fb
Showing 17 changed files with 466 additions and 162 deletions.
54 changes: 52 additions & 2 deletions api/v1alpha3/condition_consts.go
@@ -109,7 +109,8 @@ const (
// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
MachineHasFailureReason = "MachineHasFailure"

// NodeNotFoundReason is the reason used when a machine's node has previously been observed but is now gone.
// NodeNotFoundReason (Severity=Error) documents that a machine's node, which was previously observed, is now gone.
// NB. provisioned --> NodeRef != ""
NodeNotFoundReason = "NodeNotFound"

// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
@@ -120,10 +121,59 @@ const (
)

const (
// MachineOwnerRemediatedCondition is set on machines that have failed a healthcheck by the MachineHealthCheck controller.
// MachineOwnerRemediatedCondition is set on machines that have failed a healthcheck by the Machine's owner controller.
// MachineOwnerRemediatedCondition is set to False after a health check fails, but should be changed to True by the owning controller after remediation succeeds.
MachineOwnerRemediatedCondition ConditionType = "OwnerRemediated"

// WaitingForRemediationReason is the reason used when a machine fails a health check and remediation is needed.
WaitingForRemediationReason = "WaitingForRemediation"
)

// Common Pod-related Condition Reasons used by Pod-related Conditions such as MachineAPIServerPodHealthyCondition etc.
const (
// PodProvisioningReason (Severity=Info) documents a pod waiting to be provisioned, i.e., the Pod is in the "Pending" phase and
// the PodScheduled and Initialized conditions are not yet set to True.
PodProvisioningReason = "PodProvisioning"

// PodMissingReason (Severity=Warning) documents a pod that does not exist.
PodMissingReason = "PodMissing"

// PodFailedReason (Severity=Error) documents if
// i) a pod failed during provisioning, i.e., the Pod is in the "Pending" phase and
// the PodScheduled and Initialized conditions are set to True, but the ContainersReady or Ready condition is False
// (i.e., at least one of the containers is in a waiting state, e.g. CrashLoopBackOff or ImagePullBackOff), or
// ii) a pod has at least one container that is terminated with a failure and hence the Pod is in the "Failed" phase.
PodFailedReason = "PodFailed"
)

// Conditions that are only for control-plane machines. KubeadmControlPlane is the owner of these conditions.

const (
// MachineAPIServerPodHealthyCondition reports a machine's kube-apiserver's health status.
// Set to true if kube-apiserver pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
MachineAPIServerPodHealthyCondition ConditionType = "APIServerPodHealthy"

// MachineControllerManagerHealthyCondition reports a machine's kube-controller-manager's health status.
// Set to true if kube-controller-manager pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
MachineControllerManagerHealthyCondition ConditionType = "ControllerManagerPodHealthy"

// MachineSchedulerPodHealthyCondition reports a machine's kube-scheduler's health status.
// Set to true if kube-scheduler pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
MachineSchedulerPodHealthyCondition ConditionType = "SchedulerPodHealthy"

// MachineEtcdPodHealthyCondition reports a machine's etcd pod's health status.
// Set to true if etcd pod is in "Running" phase, otherwise uses Pod-related Condition Reasons.
MachineEtcdPodHealthyCondition ConditionType = "EtcdPodHealthy"
)

const (
// MachineEtcdMemberHealthyCondition documents if the machine has a healthy etcd member.
// If not true, Pod-related Condition Reasons can be used as reasons.
MachineEtcdMemberHealthyCondition ConditionType = "EtcdMemberHealthy"

// EtcdMemberUnhealthyReason (Severity=Error) documents that a Machine's etcd member is unhealthy for a number of reasons:
// i) the etcd member has alarms,
// ii) creating an etcd client fails or operations performed with the created etcd client fail, or
// iii) quorum is lost.
EtcdMemberUnhealthyReason = "EtcdMemberUnhealthy"
)
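
The new Machine-level conditions and the shared Pod-related reasons are ordinary cluster-api condition types, so they can be set with the existing util/conditions helpers. Below is a minimal sketch of how a controller might record them, assuming the v1alpha3 API as of this commit; the message string is illustrative:

package main

import (
    "fmt"

    clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
    "sigs.k8s.io/cluster-api/util/conditions"
)

func main() {
    machine := &clusterv1.Machine{}

    // kube-apiserver pod is still coming up: condition is False with an Info severity.
    conditions.MarkFalse(machine,
        clusterv1.MachineAPIServerPodHealthyCondition,
        clusterv1.PodProvisioningReason,
        clusterv1.ConditionSeverityInfo,
        "waiting for the pod to be scheduled and initialized")

    // etcd member joined and reports no alarms: condition is True.
    conditions.MarkTrue(machine, clusterv1.MachineEtcdMemberHealthyCondition)

    for _, c := range machine.Status.Conditions {
        fmt.Printf("%s=%s reason=%q severity=%q\n", c.Type, c.Status, c.Reason, c.Severity)
    }
}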
11 changes: 11 additions & 0 deletions controlplane/kubeadm/api/v1alpha3/condition_consts.go
@@ -66,3 +66,14 @@ const (
// ScalingDownReason (Severity=Info) documents a KubeadmControlPlane that is decreasing the number of replicas.
ScalingDownReason = "ScalingDown"
)

const (
// EtcdClusterHealthy documents the overall etcd cluster's health for the KCP-managed etcd.
EtcdClusterHealthy clusterv1.ConditionType = "EtcdClusterHealthy"

// EtcdClusterUnhealthyReason (Severity=Warning) is set when the etcd cluster is unhealthy due to
// i) the etcd cluster having lost its quorum,
// ii) the etcd cluster having alarms armed, or
// iii) the etcd pods not matching the etcd members.
EtcdClusterUnhealthyReason = "EtcdClusterUnhealthy"
)
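
On the KCP side, EtcdClusterHealthy is a condition on the KubeadmControlPlane object itself, and the controller folds it into the Ready summary (see the SetSummary change in controller.go below). A rough sketch of both operations, assuming the v1alpha3 types and the util/conditions helpers; the failure message is made up:

package main

import (
    "fmt"

    clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
    controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
    "sigs.k8s.io/cluster-api/util/conditions"
)

func main() {
    kcp := &controlplanev1.KubeadmControlPlane{}

    // Flag the managed etcd cluster as unhealthy (Severity=Warning), e.g. on quorum loss.
    conditions.MarkFalse(kcp,
        controlplanev1.EtcdClusterHealthy,
        controlplanev1.EtcdClusterUnhealthyReason,
        clusterv1.ConditionSeverityWarning,
        "etcd cluster has lost its quorum")

    // Roll the etcd condition into the KCP Ready condition, as the controller does.
    conditions.SetSummary(kcp,
        conditions.WithConditions(controlplanev1.EtcdClusterHealthy),
    )

    ready := conditions.Get(kcp, clusterv1.ReadyCondition)
    fmt.Printf("Ready=%s reason=%s severity=%s\n", ready.Status, ready.Reason, ready.Severity)
}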
78 changes: 73 additions & 5 deletions controlplane/kubeadm/controllers/controller.go
@@ -212,6 +212,27 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(req ctrl.Request) (res ctrl.Re
return r.reconcile(ctx, cluster, kcp)
}

// setSingleMachineConditions updates the machine's conditions according to the health tracker.
func setSingleMachineConditions(machine *clusterv1.Machine, controlPlane *internal.ControlPlane) {
for condType, condition := range controlPlane.MachineConditions[machine.Name] {
doesConditionExist := false
for _, mCondition := range machine.Status.Conditions {
// If the condition already exists, change the condition.
if mCondition.Type == condType {
conditions.Set(machine, condition)
doesConditionExist = true
}
}
if !doesConditionExist {
if machine.Status.Conditions == nil {
machine.Status.Conditions = clusterv1.Conditions{}
}
conditions.Set(machine, condition)
}

}
}

func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kcp *controlplanev1.KubeadmControlPlane) error {
// Always update the readyCondition by summarizing the state of other conditions.
conditions.SetSummary(kcp,
@@ -221,6 +242,7 @@ func patchKubeadmControlPlane(ctx context.Context, patchHelper *patch.Helper, kc
controlplanev1.MachinesReadyCondition,
controlplanev1.AvailableCondition,
controlplanev1.CertificatesAvailableCondition,
controlplanev1.EtcdClusterHealthy,
),
)

@@ -305,6 +327,14 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef())

// If the control plane is initialized, reconcile health.
if ownedMachines.Len() != 0 {
// reconcileControlPlaneHealth returns an error if there is a machine being deleted
if result, err := r.reconcileControlPlaneHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
return result, err
}
}

// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
needRollout := controlPlane.MachinesNeedingRollout()
switch {
@@ -442,21 +472,59 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M
return nil
}

// reconcileHealth performs health checks for control plane components and etcd
func patchControlPlaneMachine(ctx context.Context, patchHelper *patch.Helper, machine *clusterv1.Machine) error {
// Patch the object, ignoring conflicts on the conditions owned by this controller.

// TODO: Is it okay to own these conditions or just patch?
// return patchHelper.Patch(ctx, machine)

return patchHelper.Patch(
ctx,
machine,
patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
clusterv1.MachineAPIServerPodHealthyCondition,
clusterv1.MachineControllerManagerHealthyCondition,
clusterv1.MachineEtcdMemberHealthyCondition,
clusterv1.MachineEtcdPodHealthyCondition,
clusterv1.MachineSchedulerPodHealthyCondition,
}},
)
}

// reconcileControlPlaneHealth performs health checks for control plane components and etcd.
// It removes any etcd members that do not have a corresponding node.
// Also, as a final step, it checks whether any machines are being deleted.
func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name)

for _, m := range controlPlane.Machines {
// Initialize the patch helper.
patchHelper, err := patch.NewHelper(m, r.Client)
if err != nil {
logger.Error(err, "Failed to configure the patch helper")
return ctrl.Result{Requeue: true}, nil
}

machine := m
defer func() {
setSingleMachineConditions(machine, controlPlane)
// Always attempt to Patch the Machine conditions after each health reconciliation.
if err := patchControlPlaneMachine(ctx, patchHelper, machine); err != nil {
logger.Error(err, "Failed to patch KubeadmControlPlane Machine")
}
}()
}

// Do a health check of the Control Plane components
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil {
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass control plane health check to continue reconciliation: %v", err)
return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil
return ctrl.Result{}, errors.Wrap(err, "failed to pass control-plane health check")
}

// If KCP should manage etcd, ensure etcd is healthy.
if controlPlane.IsEtcdManaged() {
if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil {
errList := []error{errors.Wrap(err, "failed to pass etcd health check")}
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)
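
The per-machine handling above follows a snapshot/mutate/patch pattern: a patch.Helper is created before the health checks mutate any conditions, and the deferred patch claims ownership of the new control-plane machine conditions. A stripped-down sketch of that pattern, assuming a controller-runtime client is available; the function name and the condition it marks are illustrative:

package example

import (
    "context"

    clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
    "sigs.k8s.io/cluster-api/util/conditions"
    "sigs.k8s.io/cluster-api/util/patch"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

// markEtcdMemberHealthy snapshots the machine, marks its etcd member condition,
// and patches only the conditions this controller owns.
func markEtcdMemberHealthy(ctx context.Context, c client.Client, machine *clusterv1.Machine) error {
    // Snapshot the object before mutating it.
    patchHelper, err := patch.NewHelper(machine, c)
    if err != nil {
        return err
    }

    // Mutate: record the health check result as a condition.
    conditions.MarkTrue(machine, clusterv1.MachineEtcdMemberHealthyCondition)

    // Patch, ignoring conflicts on the conditions owned by this controller.
    return patchHelper.Patch(ctx, machine, patch.WithOwnedConditions{
        Conditions: []clusterv1.ConditionType{
            clusterv1.MachineEtcdMemberHealthyCondition,
        },
    })
}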
4 changes: 2 additions & 2 deletions controlplane/kubeadm/controllers/controller_test.go
@@ -572,7 +572,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
g := NewWithT(t)

cluster, kcp, tmpl := createClusterWithControlPlane()
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain2.example.com"
cluster.Spec.ControlPlaneEndpoint.Port = 6443
kcp.Spec.Version = version

@@ -642,7 +642,7 @@ func TestKubeadmControlPlaneReconciler_adoption(t *testing.T) {
g := NewWithT(t)

cluster, kcp, tmpl := createClusterWithControlPlane()
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain3.example.com"
cluster.Spec.ControlPlaneEndpoint.Port = 6443
kcp.Spec.Version = "v1.17.0"

4 changes: 2 additions & 2 deletions controlplane/kubeadm/controllers/fakes_test.go
@@ -57,14 +57,14 @@ func (f *fakeManagementCluster) GetMachinesForCluster(c context.Context, n clien
return f.Machines, nil
}

func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ client.ObjectKey) error {
func (f *fakeManagementCluster) TargetClusterControlPlaneIsHealthy(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error {
if !f.ControlPlaneHealthy {
return errors.New("control plane is not healthy")
}
return nil
}

func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ client.ObjectKey) error {
func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) error {
if !f.EtcdHealthy {
return errors.New("etcd is not healthy")
}
12 changes: 2 additions & 10 deletions controlplane/kubeadm/controllers/scale.go
@@ -63,11 +63,6 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte
func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
logger := controlPlane.Logger()

// reconcileHealth returns err if there is a machine being delete which is a required condition to check before scaling up
if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
return result, err
}

// Create the bootstrap configuration
bootstrapSpec := controlPlane.JoinControlPlaneConfig()
fd := controlPlane.NextFailureDomainForScaleUp()
@@ -90,10 +85,6 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
) (ctrl.Result, error) {
logger := controlPlane.Logger()

if result, err := r.reconcileHealth(ctx, cluster, kcp, controlPlane); err != nil || !result.IsZero() {
return result, err
}

workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
if err != nil {
logger.Error(err, "Failed to create client to workload cluster")
@@ -123,7 +114,8 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
}
}

if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
// TODO: check if this is needed after moving the health check to the main reconcile
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, controlPlane, util.ObjectKey(cluster)); err != nil {
logger.V(2).Info("Waiting for control plane to pass control plane health check before removing a control plane machine", "cause", err)
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass control plane health check before removing a control plane machine: %v", err)
17 changes: 6 additions & 11 deletions controlplane/kubeadm/controllers/scale_test.go
@@ -116,14 +116,16 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {

result, err := r.scaleUpControlPlane(context.Background(), cluster, kcp, controlPlane)
g.Expect(result).To(Equal(ctrl.Result{Requeue: true}))
g.Expect(err).ToNot(HaveOccurred())
g.Expect(err).NotTo(HaveOccurred())

controlPlaneMachines := clusterv1.MachineList{}
g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed())
g.Expect(controlPlaneMachines.Items).To(HaveLen(3))
})
t.Run("does not create a control plane Machine if health checks fail", func(t *testing.T) {
cluster, kcp, genericMachineTemplate := createClusterWithControlPlane()
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
cluster.Spec.ControlPlaneEndpoint.Port = 6443
initObjs := []runtime.Object{cluster.DeepCopy(), kcp.DeepCopy(), genericMachineTemplate.DeepCopy()}

beforeMachines := internal.NewFilterableMachineCollection()
@@ -170,18 +172,11 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
Log: log.Log,
recorder: record.NewFakeRecorder(32),
}
controlPlane := &internal.ControlPlane{
KCP: kcp,
Cluster: cluster,
Machines: beforeMachines,
}

result, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane)
if tc.expectErr {
g.Expect(err).To(HaveOccurred())
}
g.Expect(result).To(Equal(tc.expectResult))
_, err := r.reconcile(context.Background(), cluster, kcp)
g.Expect(err).To(HaveOccurred())

// scaleUpControlPlane is never called due to the health check failure, and a new machine is not created to scale up.
controlPlaneMachines := &clusterv1.MachineList{}
g.Expect(fakeClient.List(context.Background(), controlPlaneMachines)).To(Succeed())
g.Expect(controlPlaneMachines.Items).To(HaveLen(len(beforeMachines)))
3 changes: 1 addition & 2 deletions controlplane/kubeadm/controllers/status.go
@@ -18,7 +18,6 @@ package controllers

import (
"context"

"github.com/pkg/errors"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
@@ -29,7 +28,7 @@ import (
)

// updateStatus is called after every reconcilitation loop in a defer statement to always make sure we have the
// resource status subresourcs up-to-date.
// resource status subresources up-to-date.
func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, kcp *controlplanev1.KubeadmControlPlane, cluster *clusterv1.Cluster) error {
selector := machinefilters.ControlPlaneSelectorForCluster(cluster.Name)
// Copy label selector to its status counterpart in string format.
8 changes: 5 additions & 3 deletions controlplane/kubeadm/controllers/upgrade_test.go
@@ -36,6 +36,8 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) {
g := NewWithT(t)

cluster, kcp, genericMachineTemplate := createClusterWithControlPlane()
cluster.Spec.ControlPlaneEndpoint.Host = "nodomain.example.com"
cluster.Spec.ControlPlaneEndpoint.Port = 6443
kcp.Spec.Version = "v1.17.3"
kcp.Spec.KubeadmConfigSpec.ClusterConfiguration = nil
kcp.Spec.Replicas = pointer.Int32Ptr(1)
@@ -89,9 +91,9 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) {

// run upgrade a second time, simulate that the node has not appeared yet but the machine exists
r.managementCluster.(*fakeManagementCluster).ControlPlaneHealthy = false
result, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane, needingUpgrade)
g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}))
g.Expect(err).To(BeNil())
// An unhealthy control plane is detected during the reconcile loop, so upgrade is never called.
_, err = r.reconcile(context.Background(), cluster, kcp)
g.Expect(err).To(HaveOccurred())
g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed())
g.Expect(bothMachines.Items).To(HaveLen(2))

