Skip to content

Commit

Permalink
Add health checks to reconcileDelete
Browse files Browse the repository at this point in the history
  • Loading branch information
Sedef committed Oct 9, 2020
1 parent e2d6f83 commit f984dba
Show file tree
Hide file tree
Showing 4 changed files with 104 additions and 75 deletions.
52 changes: 42 additions & 10 deletions controlplane/kubeadm/controllers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,21 +290,19 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
return ctrl.Result{}, err
}

ownedMachines := controlPlaneMachines.Filter(machinefilters.OwnedMachines(kcp))
if len(ownedMachines) != len(controlPlaneMachines) {
logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode")
return ctrl.Result{}, nil
}

controlPlane, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, ownedMachines)
if err != nil {
controlPlane, err := r.createControlPlane(ctx, cluster, kcp)
if controlPlane == nil || err != nil {
logger.Error(err, "failed to initialize control plane")
return ctrl.Result{}, err
}
if len(controlPlane.Machines) != len(controlPlaneMachines) {
logger.Info("Not all control plane machines are owned by this KubeadmControlPlane, refusing to operate in mixed management mode")
return ctrl.Result{}, nil
}

// Aggregate the operational state of all the machines; while aggregating we are adding the
// source ref (reason@machine/name) so the problem can be easily tracked down to its source machine.
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, ownedMachines.ConditionGetters(), conditions.AddSourceRef())
conditions.SetAggregate(controlPlane.KCP, controlplanev1.MachinesReadyCondition, controlPlane.Machines.ConditionGetters(), conditions.AddSourceRef())

// reconcileControlPlaneHealth returns err if there is a machine being deleted or the control plane is unhealthy.
// If control plane is not initialized, then control-plane machines will be empty and hence health check will not fail.
Expand All @@ -331,7 +329,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
}

// If we've made it this far, we can assume that all ownedMachines are up to date
numMachines := len(ownedMachines)
numMachines := len(controlPlane.Machines)
desiredReplicas := int(*kcp.Spec.Replicas)

switch {
Expand Down Expand Up @@ -379,13 +377,47 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
return ctrl.Result{}, nil
}

// createControlPlane builds an internal.ControlPlane for the given Cluster,
// scoped to the control-plane Machines that are owned by this KubeadmControlPlane.
func (r *KubeadmControlPlaneReconciler) createControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (*internal.ControlPlane, error) {
	logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name)

	machines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster), machinefilters.ControlPlaneMachines(cluster.Name))
	if err != nil {
		logger.Error(err, "failed to retrieve control plane machines for cluster")
		return nil, err
	}

	// Only Machines actually owned by this KCP take part in the control plane;
	// callers compare this set against all control-plane Machines to detect
	// mixed-management situations.
	owned := machines.Filter(machinefilters.OwnedMachines(kcp))

	cp, err := internal.NewControlPlane(ctx, r.Client, cluster, kcp, owned)
	if err != nil {
		logger.Error(err, "failed to initialize control plane")
		return nil, err
	}
	return cp, nil
}

// reconcileDelete handles KubeadmControlPlane deletion.
// The implementation does not take non-control plane workloads into consideration. This may or may not change in the future.
// Please see https://github.com/kubernetes-sigs/cluster-api/issues/2064.
func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (ctrl.Result, error) {
logger := r.Log.WithValues("namespace", kcp.Namespace, "kubeadmControlPlane", kcp.Name, "cluster", cluster.Name)
logger.Info("Reconcile KubeadmControlPlane deletion")

controlPlane, err := r.createControlPlane(ctx, cluster, kcp)
if controlPlane == nil || err != nil {
logger.Error(err, "failed to initialize control plane")
return ctrl.Result{}, err
}

// Ignore the health check results here, they are used to set health related conditions on Machines.
_, err = r.managementCluster.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, util.ObjectKey(cluster))
if err != nil {
return ctrl.Result{}, err
}
_, err = r.managementCluster.TargetClusterEtcdHealthCheck(ctx, controlPlane, util.ObjectKey(cluster))
if err != nil {
return ctrl.Result{}, err
}

allMachines, err := r.managementCluster.GetMachinesForCluster(ctx, util.ObjectKey(cluster))
if err != nil {
return ctrl.Result{}, err
Expand Down
13 changes: 13 additions & 0 deletions controlplane/kubeadm/controllers/fakes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,19 @@ func (f *fakeManagementCluster) TargetClusterEtcdIsHealthy(_ context.Context, _
}
return nil
}
// TargetClusterEtcdHealthCheck satisfies the ManagementCluster interface for
// tests: it returns an error iff the fake is configured as etcd-unhealthy.
func (f *fakeManagementCluster) TargetClusterEtcdHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) {
	if f.EtcdHealthy {
		return nil, nil
	}
	return nil, errors.New("etcd is not healthy")
}

// TargetClusterControlPlaneHealthCheck satisfies the ManagementCluster interface
// for tests: it returns an error iff the fake is configured as unhealthy.
func (f *fakeManagementCluster) TargetClusterControlPlaneHealthCheck(_ context.Context, _ *internal.ControlPlane, _ client.ObjectKey) (internal.HealthCheckResult, error) {
	if f.ControlPlaneHealthy {
		return nil, nil
	}
	return nil, errors.New("control plane is not healthy")
}

type fakeWorkloadCluster struct {
*internal.Workload
Expand Down
36 changes: 24 additions & 12 deletions controlplane/kubeadm/internal/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ type ManagementCluster interface {
ctrlclient.Reader

GetMachinesForCluster(ctx context.Context, cluster client.ObjectKey, filters ...machinefilters.Func) (FilterableMachineCollection, error)
TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error)
TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error)
TargetClusterEtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error
TargetClusterControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error
GetWorkloadCluster(ctx context.Context, clusterKey client.ObjectKey) (WorkloadCluster, error)
Expand Down Expand Up @@ -179,21 +181,14 @@ func (m *Management) getApiServerEtcdClientCert(ctx context.Context, clusterKey
return tls.X509KeyPair(crtData, keyData)
}

type healthCheck func(context.Context, *ControlPlane) (HealthCheckResult, error)

// HealthCheck will run a generic health check function and report any errors discovered.
// In addition to the health check, it also ensures there is a 1:1 match between nodes and machines.
// To have access to the owned control-plane machines during health checks, need to pass owningMachines here.
func (m *Management) healthCheck(ctx context.Context, controlPlane *ControlPlane, check healthCheck, clusterKey client.ObjectKey) error {
func (m *Management) healthCheck(controlPlane *ControlPlane, nodeChecks HealthCheckResult, clusterKey client.ObjectKey) error {
var errorList []error
kcpMachines := controlPlane.Machines.UnsortedList()
// Make sure Cluster API is aware of all the nodes.

nodeChecks, err := check(ctx, controlPlane)
if err != nil {
errorList = append(errorList, err)
}

// TODO: If any node has a failure, healthCheck fails. This may be too strict for the health check of HA clusters (Missing a single etcd pod does not indicate a problem)...
for nodeName, err := range nodeChecks {
if err != nil {
Expand Down Expand Up @@ -222,22 +217,39 @@ func (m *Management) healthCheck(ctx context.Context, controlPlane *ControlPlane
return nil
}

// TargetClusterControlPlaneHealthCheck connects to the workload cluster
// identified by clusterKey and runs the per-node control plane health check,
// returning the raw per-node results without interpreting them.
func (m *Management) TargetClusterControlPlaneHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) {
	workload, err := m.GetWorkloadCluster(ctx, clusterKey)
	if err != nil {
		return nil, err
	}
	return workload.ControlPlaneIsHealthy(ctx, controlPlane)
}

// TargetClusterEtcdHealthCheck connects to the workload cluster identified by
// clusterKey and runs the per-node etcd health check, returning the raw
// per-node results without interpreting them.
func (m *Management) TargetClusterEtcdHealthCheck(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) (HealthCheckResult, error) {
	workload, err := m.GetWorkloadCluster(ctx, clusterKey)
	if err != nil {
		return nil, err
	}
	return workload.EtcdIsHealthy(ctx, controlPlane)
}

// TargetClusterControlPlaneIsHealthy checks every node for control plane health.
func (m *Management) TargetClusterControlPlaneIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error {
// TODO: add checks for expected taints/labels
cluster, err := m.GetWorkloadCluster(ctx, clusterKey)

checkResult, err := m.TargetClusterControlPlaneHealthCheck(ctx, controlPlane, clusterKey)
if err != nil {
return err
}
return m.healthCheck(ctx, controlPlane, cluster.ControlPlaneIsHealthy, clusterKey)
return m.healthCheck(controlPlane, checkResult, clusterKey)
}

// TargetClusterEtcdIsHealthy runs a series of checks over a target cluster's etcd cluster.
// In addition, it verifies that there are the same number of etcd members as control plane Machines.
func (m *Management) TargetClusterEtcdIsHealthy(ctx context.Context, controlPlane *ControlPlane, clusterKey client.ObjectKey) error {
cluster, err := m.GetWorkloadCluster(ctx, clusterKey)
checkResult, err := m.TargetClusterEtcdHealthCheck(ctx, controlPlane, clusterKey)
if err != nil {
return err
}
return m.healthCheck(ctx, controlPlane, cluster.EtcdIsHealthy, clusterKey)
return m.healthCheck(controlPlane, checkResult, clusterKey)
}
78 changes: 25 additions & 53 deletions controlplane/kubeadm/internal/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@ import (
"errors"
"fmt"
"math/big"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
"sigs.k8s.io/cluster-api/util/kubeconfig"
"sigs.k8s.io/cluster-api/util/secret"
"testing"
"time"

Expand All @@ -41,8 +38,11 @@ import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
clusterv1 "sigs.k8s.io/cluster-api/api/v1alpha3"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1alpha3"
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/machinefilters"
"sigs.k8s.io/cluster-api/util/certs"
"sigs.k8s.io/cluster-api/util/kubeconfig"
"sigs.k8s.io/cluster-api/util/secret"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand Down Expand Up @@ -476,19 +476,17 @@ func TestManagementCluster_healthCheck_NoError(t *testing.T) {
controlPlane := createControlPlane(threeMachines)
tests := []struct {
name string
check healthCheck
checkResult HealthCheckResult
clusterKey client.ObjectKey
controlPlaneName string
controlPlane *ControlPlane
}{
{
name: "simple",
check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) {
return HealthCheckResult{
"one": nil,
"two": nil,
"three": nil,
}, nil
checkResult: HealthCheckResult{
"one": nil,
"two": nil,
"three": nil,
},
clusterKey: client.ObjectKey{Namespace: "default", Name: "cluster-name"},
controlPlane: controlPlane,
Expand All @@ -499,11 +497,10 @@ func TestManagementCluster_healthCheck_NoError(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)

ctx := context.Background()
m := &Management{
Client: &fakeClient{},
}
g.Expect(m.healthCheck(ctx, controlPlane, tt.check, tt.clusterKey)).To(Succeed())
g.Expect(m.healthCheck(controlPlane, tt.checkResult, tt.clusterKey)).To(Succeed())
})
}
}
Expand All @@ -512,7 +509,7 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) {

tests := []struct {
name string
check healthCheck
checkResult HealthCheckResult
clusterKey client.ObjectKey
controlPlaneName string
controlPlane *ControlPlane
Expand All @@ -527,53 +524,31 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) {
controlPlaneMachine("two"),
controlPlaneMachine("three"),
}),
check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) {
return HealthCheckResult{
"one": nil,
}, nil
},
},
{
name: "health check returns an error not related to the nodes health",
controlPlane: createControlPlane([]*clusterv1.Machine{
controlPlaneMachine("one"),
controlPlaneMachine("two"),
controlPlaneMachine("three"),
}),
check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) {
return HealthCheckResult{
"one": nil,
"two": errors.New("two"),
"three": errors.New("three"),
}, errors.New("meta")
checkResult: HealthCheckResult{
"one": nil,
},
expectedErrors: []string{"two", "three", "meta"},
},
{
name: "two nodes error on the check but no overall error occurred",
controlPlane: createControlPlane([]*clusterv1.Machine{
controlPlaneMachine("one"),
controlPlaneMachine("two"),
controlPlaneMachine("three")}),
check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) {
return HealthCheckResult{
"one": nil,
"two": errors.New("two"),
"three": errors.New("three"),
}, nil
checkResult: HealthCheckResult{
"one": nil,
"two": errors.New("two"),
"three": errors.New("three"),
},
expectedErrors: []string{"two", "three"},
},
{
name: "more nodes than machines were checked (out of band control plane nodes)",
controlPlane: createControlPlane([]*clusterv1.Machine{
controlPlaneMachine("one")}),
check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) {
return HealthCheckResult{
"one": nil,
"two": nil,
"three": nil,
}, nil
checkResult: HealthCheckResult{
"one": nil,
"two": nil,
"three": nil,
},
},
{
Expand All @@ -582,24 +557,21 @@ func TestManagementCluster_healthCheck_Errors(t *testing.T) {
controlPlaneMachine("one"),
controlPlaneMachine("two"),
nilNodeRef(controlPlaneMachine("three"))}),
check: func(ctx context.Context, controlPlane *ControlPlane) (HealthCheckResult, error) {
return HealthCheckResult{
"one": nil,
"two": nil,
"three": nil,
}, nil
checkResult: HealthCheckResult{
"one": nil,
"two": nil,
"three": nil,
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)

ctx := context.Background()
clusterKey := client.ObjectKey{Namespace: "default", Name: "cluster-name"}

m := &Management{Client: &fakeClient{}}
err := m.healthCheck(ctx, tt.controlPlane, tt.check, clusterKey)
err := m.healthCheck(tt.controlPlane, tt.checkResult, clusterKey)
g.Expect(err).To(HaveOccurred())

for _, expectedError := range tt.expectedErrors {
Expand Down

0 comments on commit f984dba

Please sign in to comment.