Upgrade canary test to fully update the cluster after the canary upgrade #654

Merged · 5 commits · May 30, 2024
2 changes: 1 addition & 1 deletion .github/actions/run-integ-test/action.yml
@@ -23,7 +23,7 @@ runs:
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
- name: Set up Go
uses: actions/setup-go@v4
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
cache: true
4 changes: 2 additions & 2 deletions .github/workflows/operatorBuildAndDeploy.yml
@@ -20,7 +20,7 @@ jobs:
- uses: actions/checkout@v4
if: github.event_name != 'pull_request'
- name: Set up Go
uses: actions/setup-go@v4
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
cache: true
@@ -53,7 +53,7 @@ jobs:
- name: Check out code into the Go module directory
uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v4
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
cache: true
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
@@ -15,7 +15,7 @@ jobs:
- name: Check out code into the Go module directory
uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v4
uses: actions/setup-go@v5
with:
go-version-file: 'go.mod'
cache: true
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,9 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` secti
## unreleased

* [BUGFIX] [#652](https://github.com/k8ssandra/cass-operator/issues/652) Update nodeStatuses info when a pod is recreated
* [BUGFIX] [#656](https://github.com/k8ssandra/cass-operator/issues/656) After a canary upgrade, it was not possible to continue upgrading the rest of the nodes if the only change was removing the canary upgrade
* [BUGFIX] [#657](https://github.com/k8ssandra/cass-operator/issues/657) If a change did not result in StatefulSet changes, a Ready -> Ready transition would lose the ObservedGeneration update in the status
* [BUGFIX] [#382](https://github.com/k8ssandra/cass-operator/issues/382) If the StatefulSet has not caught up yet, do not allow any changes to it; instead, requeue and wait (does not change the behavior of forceRacksUpgrade)

## v1.20.0

17 changes: 17 additions & 0 deletions pkg/reconciliation/construct_statefulset.go
@@ -152,6 +152,23 @@ func newStatefulSetForCassandraDatacenter(
result.Spec.ServiceName = sts.Spec.ServiceName
}

if dc.Spec.CanaryUpgrade {
var partition int32
if dc.Spec.CanaryUpgradeCount == 0 || dc.Spec.CanaryUpgradeCount > replicaCountInt32 {
partition = replicaCountInt32
} else {
partition = replicaCountInt32 - dc.Spec.CanaryUpgradeCount
}

strategy := appsv1.StatefulSetUpdateStrategy{
Type: appsv1.RollingUpdateStatefulSetStrategyType,
RollingUpdate: &appsv1.RollingUpdateStatefulSetStrategy{
Partition: &partition,
},
}
result.Spec.UpdateStrategy = strategy
}

// add a hash here to facilitate checking if updates are needed
utils.AddHashAnnotation(result)

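The partition arithmetic now lives in `newStatefulSetForCassandraDatacenter`, so the desired StatefulSet carries the canary `RollingUpdate` strategy before the hash annotation is computed. A minimal standalone sketch of that arithmetic (the `canaryPartition` helper and the example values are illustrative, not part of the PR):

```go
package main

import "fmt"

// canaryPartition mirrors the partition calculation added above: with a
// StatefulSet RollingUpdate partition of P, only pods with ordinal >= P are
// moved to the new revision. A canary count of 0, or one larger than the
// replica count, sets the partition to the replica count, so the controller
// rolls no pods automatically.
func canaryPartition(replicas, canaryCount int32) int32 {
	if canaryCount == 0 || canaryCount > replicas {
		return replicas
	}
	return replicas - canaryCount
}

func main() {
	fmt.Println(canaryPartition(3, 1)) // 2: only the highest-ordinal pod is upgraded
	fmt.Println(canaryPartition(3, 0)) // 3: no pods are upgraded
}
```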
9 changes: 4 additions & 5 deletions pkg/reconciliation/constructor.go
@@ -6,11 +6,12 @@ package reconciliation
// This file defines constructors for k8s objects

import (
"fmt"

api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
"github.com/k8ssandra/cass-operator/pkg/oplabels"
"github.com/k8ssandra/cass-operator/pkg/utils"

corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
@@ -48,7 +49,7 @@ func newPodDisruptionBudgetForDatacenter(dc *api.CassandraDatacenter) *policyv1.
}

func setOperatorProgressStatus(rc *ReconciliationContext, newState api.ProgressState) error {
rc.ReqLogger.Info("reconcile_racks::setOperatorProgressStatus")
rc.ReqLogger.Info(fmt.Sprintf("reconcile_racks::setOperatorProgressStatus::%v", newState))
currentState := rc.Datacenter.Status.CassandraOperatorProgress
if currentState == newState {
// early return, no need to ping k8s
@@ -57,13 +58,11 @@ func setOperatorProgressStatus(rc *ReconciliationContext, newState api.ProgressS

patch := client.MergeFrom(rc.Datacenter.DeepCopy())
rc.Datacenter.Status.CassandraOperatorProgress = newState
// TODO there may be a better place to push status.observedGeneration in the reconcile loop

if newState == api.ProgressReady {
rc.Datacenter.Status.ObservedGeneration = rc.Datacenter.Generation
if rc.Datacenter.Status.DatacenterName == nil {
rc.Datacenter.Status.DatacenterName = &rc.Datacenter.Spec.DatacenterName
}
rc.setCondition(api.NewDatacenterCondition(api.DatacenterRequiresUpdate, corev1.ConditionFalse))
}
if err := rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, patch); err != nil {
rc.ReqLogger.Error(err, "error updating the Cassandra Operator Progress state")
25 changes: 23 additions & 2 deletions pkg/reconciliation/handler.go
@@ -7,9 +7,11 @@ import (
"fmt"
"sync"

"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"

api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
@@ -65,9 +67,28 @@ func (rc *ReconciliationContext) CalculateReconciliationActions() (reconcile.Res
return result.Error(err).Output()
}

result, err := rc.ReconcileAllRacks()
res, err := rc.ReconcileAllRacks()
if err != nil {
return result.Error(err).Output()
}

if err := rc.updateStatus(); err != nil {
return result.Error(err).Output()
}

return result, err
return res, nil
}

func (rc *ReconciliationContext) updateStatus() error {
patch := client.MergeFrom(rc.Datacenter.DeepCopy())
rc.Datacenter.Status.ObservedGeneration = rc.Datacenter.Generation
rc.setCondition(api.NewDatacenterCondition(api.DatacenterRequiresUpdate, corev1.ConditionFalse))
if err := rc.Client.Status().Patch(rc.Ctx, rc.Datacenter, patch); err != nil {
rc.ReqLogger.Error(err, "error updating the Cassandra Operator Progress state")
return err
}

return nil
}

// This file contains various definitions and plumbing setup used for reconciliation.
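With `updateStatus` running after every successful reconcile, `status.observedGeneration` is bumped and the `DatacenterRequiresUpdate` condition cleared even when a spec change produces no StatefulSet changes (the Ready -> Ready case from #657). A hedged sketch of how a caller or test could rely on that through a controller-runtime client (the `waitForObservedGeneration` helper and its polling interval are illustrative, not part of the PR):

```go
package example

import (
	"context"
	"fmt"
	"time"

	api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// waitForObservedGeneration polls until status.observedGeneration has caught up
// with metadata.generation, i.e. until the operator has observed the latest spec
// change. Before this PR, a Ready -> Ready reconcile could leave the status behind.
func waitForObservedGeneration(ctx context.Context, c client.Client, key types.NamespacedName) error {
	for {
		dc := &api.CassandraDatacenter{}
		if err := c.Get(ctx, key, dc); err != nil {
			return err
		}
		if dc.Status.ObservedGeneration >= dc.Generation {
			return nil
		}
		select {
		case <-ctx.Done():
			return fmt.Errorf("waiting for %s: %w", key, ctx.Err())
		case <-time.After(2 * time.Second):
		}
	}
}
```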
63 changes: 24 additions & 39 deletions pkg/reconciliation/reconcile_racks.go
@@ -186,6 +186,30 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
}
statefulSet := rc.statefulSets[idx]

status := statefulSet.Status

updatedReplicas := status.UpdatedReplicas
if status.CurrentRevision != status.UpdateRevision {
// Previous update was a canary upgrade, so we have pods in different versions
updatedReplicas = status.CurrentReplicas + status.UpdatedReplicas
}

if statefulSet.Generation != status.ObservedGeneration ||
status.Replicas != status.ReadyReplicas ||
status.Replicas != updatedReplicas {

logger.Info(
"waiting for upgrade to finish on statefulset",
"statefulset", statefulSet.Name,
"replicas", status.Replicas,
"readyReplicas", status.ReadyReplicas,
"currentReplicas", status.CurrentReplicas,
"updatedReplicas", status.UpdatedReplicas,
)

return result.RequeueSoon(10)
}

desiredSts, err := newStatefulSetForCassandraDatacenter(statefulSet, rackName, dc, int(*statefulSet.Spec.Replicas))

if err != nil {
@@ -232,22 +256,6 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
// selector must match podTemplate.Labels, those can't be updated either
desiredSts.Spec.Selector = statefulSet.Spec.Selector

if dc.Spec.CanaryUpgrade {
var partition int32
if dc.Spec.CanaryUpgradeCount == 0 || dc.Spec.CanaryUpgradeCount > int32(rc.desiredRackInformation[idx].NodeCount) {
partition = int32(rc.desiredRackInformation[idx].NodeCount)
} else {
partition = int32(rc.desiredRackInformation[idx].NodeCount) - dc.Spec.CanaryUpgradeCount
}

strategy := appsv1.StatefulSetUpdateStrategy{
Type: appsv1.RollingUpdateStatefulSetStrategyType,
RollingUpdate: &appsv1.RollingUpdateStatefulSetStrategy{
Partition: &partition,
},
}
desiredSts.Spec.UpdateStrategy = strategy
}
stateMeta, err := meta.Accessor(statefulSet)
resVersion := stateMeta.GetResourceVersion()
if err != nil {
@@ -300,29 +308,6 @@ func (rc *ReconciliationContext) CheckRackPodTemplate() result.ReconcileResult {
// we just updated k8s and pods will be knocked out of ready state, so let k8s
// call us back when these changes are done and the new pods are back to ready
return result.Done()
} else {

// the pod template is right, but if any pods don't match it,
// or are missing, we should not move onto the next rack,
// because there's an upgrade in progress

status := statefulSet.Status
if statefulSet.Generation != status.ObservedGeneration ||
status.Replicas != status.ReadyReplicas ||
status.Replicas != status.CurrentReplicas ||
status.Replicas != status.UpdatedReplicas {

logger.Info(
"waiting for upgrade to finish on statefulset",
"statefulset", statefulSet.Name,
"replicas", status.Replicas,
"readyReplicas", status.ReadyReplicas,
"currentReplicas", status.CurrentReplicas,
"updatedReplicas", status.UpdatedReplicas,
)

return result.RequeueSoon(10)
}
}
}

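The net effect in `CheckRackPodTemplate` is that the "wait for the previous rollout" check now runs before a new desired StatefulSet is computed, and a completed canary upgrade (current and update revisions differing) no longer counts as an unfinished rollout. A standalone sketch of the settled-rollout predicate the hunk above adds (the `rolloutSettled` function is illustrative, not part of the PR):

```go
package example

import appsv1 "k8s.io/api/apps/v1"

// rolloutSettled reports whether a StatefulSet has caught up with its spec,
// using the same checks CheckRackPodTemplate now performs up front. When the
// current and update revisions differ (e.g. after a canary upgrade), pods on
// either revision count as up to date, so the next change is not blocked.
func rolloutSettled(sts *appsv1.StatefulSet) bool {
	status := sts.Status

	updated := status.UpdatedReplicas
	if status.CurrentRevision != status.UpdateRevision {
		updated = status.CurrentReplicas + status.UpdatedReplicas
	}

	return sts.Generation == status.ObservedGeneration &&
		status.Replicas == status.ReadyReplicas &&
		status.Replicas == updated
}
```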
19 changes: 12 additions & 7 deletions pkg/reconciliation/reconcile_racks_test.go
@@ -293,7 +293,6 @@ func TestCheckRackPodTemplate_CanaryUpgrade(t *testing.T) {
rc, _, cleanpMockSrc := setupTest()
defer cleanpMockSrc()

rc.Datacenter.Spec.ServerVersion = "6.8.2"
rc.Datacenter.Spec.Racks = []api.Rack{
{Name: "rack1", DeprecatedZone: "zone-1"},
}
@@ -314,11 +313,10 @@

assert.True(t, result.Completed())
assert.Nil(t, err)
previousLabels := rc.statefulSets[0].Spec.Template.Labels

rc.Datacenter.Spec.CanaryUpgrade = true
rc.Datacenter.Spec.CanaryUpgradeCount = 1
rc.Datacenter.Spec.ServerVersion = "6.8.3"
rc.Datacenter.Spec.ServerVersion = "6.8.44"
partition := rc.Datacenter.Spec.CanaryUpgradeCount

result = rc.CheckRackPodTemplate()
@@ -342,12 +340,14 @@
rc.statefulSets[0].Status.ReadyReplicas = 2
rc.statefulSets[0].Status.CurrentReplicas = 1
rc.statefulSets[0].Status.UpdatedReplicas = 1
rc.statefulSets[0].Status.CurrentRevision = "1"
rc.statefulSets[0].Status.UpdateRevision = "2"

result = rc.CheckRackPodTemplate()

assert.EqualValues(t, previousLabels, rc.statefulSets[0].Spec.Template.Labels)
rc.Datacenter.Spec.CanaryUpgrade = false

result = rc.CheckRackPodTemplate()
assert.True(t, result.Completed())
assert.NotEqual(t, expectedStrategy, rc.statefulSets[0].Spec.UpdateStrategy)
}

func TestCheckRackPodTemplate_GenerationCheck(t *testing.T) {
@@ -405,7 +405,11 @@ func TestCheckRackPodTemplate_TemplateLabels(t *testing.T) {
2)
require.NoErrorf(err, "error occurred creating statefulset")

desiredStatefulSet.Generation = 1
desiredStatefulSet.Spec.Replicas = ptr.To(int32(1))
desiredStatefulSet.Status.Replicas = int32(1)
desiredStatefulSet.Status.UpdatedReplicas = int32(1)
desiredStatefulSet.Status.ObservedGeneration = 1
desiredStatefulSet.Status.ReadyReplicas = int32(1)

trackObjects := []runtime.Object{
@@ -424,9 +428,10 @@ func TestCheckRackPodTemplate_TemplateLabels(t *testing.T) {
rc.statefulSets = make([]*appsv1.StatefulSet, len(rackInfo))
rc.statefulSets[0] = desiredStatefulSet

rc.Client = fake.NewClientBuilder().WithStatusSubresource(rc.Datacenter).WithRuntimeObjects(trackObjects...).Build()
rc.Client = fake.NewClientBuilder().WithStatusSubresource(rc.Datacenter, rc.statefulSets[0]).WithRuntimeObjects(trackObjects...).Build()
res := rc.CheckRackPodTemplate()
require.Equal(result.Done(), res)
rc.statefulSets[0].Status.ObservedGeneration = rc.statefulSets[0].Generation

sts := &appsv1.StatefulSet{}
require.NoError(rc.Client.Get(rc.Ctx, types.NamespacedName{Name: desiredStatefulSet.Name, Namespace: desiredStatefulSet.Namespace}, sts))
16 changes: 16 additions & 0 deletions tests/canary_upgrade/canary_upgrade_test.go
@@ -100,6 +100,22 @@ var _ = Describe(testName, func() {
ns.WaitForCassandraImages(dcName, images, 300)
ns.WaitForDatacenterReadyPodCount(dcName, 3)

// TODO Verify that after the canary upgrade we can issue upgrades to the rest of the nodes
step = "remove canary upgrade"
json = "{\"spec\": {\"canaryUpgrade\": false}}"
k = kubectl.PatchMerge(dcResource, json)
ns.ExecAndLog(step, k)

ns.WaitForDatacenterOperatorProgress(dcName, "Updating", 30)
ns.WaitForDatacenterReadyPodCount(dcName, 3)

images = []string{
updated,
updated,
updated,
}
ns.WaitForCassandraImages(dcName, images, 300)

step = "deleting the dc"
k = kubectl.DeleteFromFiles(dcYaml)
ns.ExecAndLog(step, k)
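The new integration-test step drops the canary flag and then expects all three pods to roll to the updated image. A hedged sketch of the same merge patch issued through a controller-runtime client instead of kubectl (the `disableCanaryUpgrade` helper is illustrative, not part of the PR):

```go
package example

import (
	"context"

	api "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// disableCanaryUpgrade sends the same {"spec": {"canaryUpgrade": false}} merge
// patch the test applies with kubectl, so the operator finishes upgrading the
// remaining pods on the next reconcile.
func disableCanaryUpgrade(ctx context.Context, c client.Client, key types.NamespacedName) error {
	dc := &api.CassandraDatacenter{}
	if err := c.Get(ctx, key, dc); err != nil {
		return err
	}
	patch := []byte(`{"spec": {"canaryUpgrade": false}}`)
	return c.Patch(ctx, dc, client.RawPatch(types.MergePatchType, patch))
}
```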