Run /start on the background for pods and up the timeout #503

Merged: 6 commits, Mar 9, 2023
CHANGELOG.md (2 additions, 1 deletion)

@@ -12,7 +12,8 @@ Changelog for Cass Operator, new PRs should update the `main / unreleased` section
## unreleased

* [CHANGE] [#501](https://github.com/k8ssandra/cass-operator/issues/501) Replaced server-system-logger with a Vector based implementation. Also, examples are added how the Cassandra system.log can be parsed to a more structured format.
* [CHANGE] [#496](https://github.com/k8ssandra/cass-operator/issues/496) ScalingUp is no longer tied to the lifecycle of the cleanup job. The cleanup job is created after the ScaleUp has finished, but to track its progress one should check the status of the CassandraTask and not the CassandraDatacenter's status. Also, added a new annotation to the Datacenter "cassandra.datastax.com/no-cleanup", which if set prevents from the creation of the CassandraTask.
* [ENHANCEMENT] [#500](https://github.com/k8ssandra/cass-operator/issues/500) Allow the /start command to run for a longer period of time (up to 10 minutes), before killing the pod if no response is received. This is an intermediate solution until we can correctly detect from the pod that the start is not proceeding correctly.
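The new `cassandra.datastax.com/no-cleanup` annotation mentioned above is set on the CassandraDatacenter's metadata. A minimal sketch of such a manifest follows; the `apiVersion`, `serverVersion`, and the `"true"` value are illustrative — per the changelog, it is the annotation's presence that prevents the cleanup CassandraTask from being created:

```yaml
apiVersion: cassandra.datastax.com/v1beta1
kind: CassandraDatacenter
metadata:
  name: dc1
  annotations:
    # Skip the automatic cleanup CassandraTask after scale-up.
    cassandra.datastax.com/no-cleanup: "true"
spec:
  clusterName: cluster1
  serverType: cassandra
  serverVersion: "4.0.7"
  size: 3
```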

## v1.14.0

Makefile (6 additions, 2 deletions)

@@ -243,12 +243,12 @@ OPM ?= $(LOCALBIN)/opm

## Tool Versions
CERT_MANAGER_VERSION ?= v1.10.1
KUSTOMIZE_VERSION ?= v4.5.7
KUSTOMIZE_VERSION ?= v5.0.0
CONTROLLER_TOOLS_VERSION ?= v0.10.0
OPERATOR_SDK_VERSION ?= 1.25.1
HELM_VERSION ?= 3.10.2
OPM_VERSION ?= 1.26.2
GOLINT_VERSION ?= 1.50.1
GOLINT_VERSION ?= 1.51.2

.PHONY: cert-manager
cert-manager: ## Install cert-manager to the cluster
@@ -372,4 +372,8 @@ catalog-push: ## Push a catalog image.

.PHONY: golangci-lint
golangci-lint:
@if test -x $(LOCALBIN)/golangci-lint && ! $(LOCALBIN)/golangci-lint version | grep -q $(GOLINT_VERSION); then \
echo "$(LOCALBIN)/golangci-lint version is not expected $(GOLINT_VERSION). Removing it before installing."; \
rm -rf $(LOCALBIN)/golangci-lint; \
fi
test -s $(LOCALBIN)/golangci-lint || curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s v$(GOLINT_VERSION)
pkg/httphelper/client.go (1 addition, 1 deletion)

@@ -899,7 +899,7 @@ func (client *NodeMgmtClient) CallLifecycleStartEndpointWithReplaceIp(pod *corev
endpoint: endpoint,
host: podIP,
method: http.MethodPost,
timeout: 10 * time.Second,
timeout: 10 * time.Minute,
}

_, err := callNodeMgmtEndpoint(client, request, "")
pkg/reconciliation/reconcile_racks.go (26 additions, 26 deletions)

@@ -1758,7 +1758,6 @@ func (rc *ReconciliationContext) findStartedNotReadyNodes() (bool, error) {

func (rc *ReconciliationContext) startCassandra(endpointData httphelper.CassMetadataEndpoints, pod *corev1.Pod) error {
dc := rc.Datacenter
mgmtClient := rc.NodeMgmtClient

// Are we replacing this node?
shouldReplacePod := utils.IndexOfString(dc.Status.NodeReplacements, pod.Name) > -1
@@ -1783,33 +1782,34 @@ func (rc *ReconciliationContext) startCassandra(endpointData httphelper.CassMeta
}
}

var err error

if shouldReplacePod && replaceAddress != "" {
// If we have a replace address that means the cassandra node did
// join the ring previously and is marked for replacement, so we
// start it accordingly
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.StartingCassandraAndReplacingNode,
"Starting Cassandra for pod %s to replace Cassandra node with address %s", pod.Name, replaceAddress)
err = mgmtClient.CallLifecycleStartEndpointWithReplaceIp(pod, replaceAddress)
} else {
// Either we are not replacing this pod or the relevant cassandra node
// never joined the ring in the first place and can be started normally
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.StartingCassandra,
"Starting Cassandra for pod %s", pod.Name)
err = mgmtClient.CallLifecycleStartEndpoint(pod)
}
go func(pod *corev1.Pod) {

Review comment: I know, late comment, but it would have been great to see a comment here why this needs to run concurrently.

Reply from the author: It's a background task to allow cass-operator to process the next command. The Reconcile pattern should never block itself.

var err error
if shouldReplacePod && replaceAddress != "" {
// If we have a replace address that means the cassandra node did
// join the ring previously and is marked for replacement, so we
// start it accordingly
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.StartingCassandraAndReplacingNode,
"Starting Cassandra for pod %s to replace Cassandra node with address %s", pod.Name, replaceAddress)
err = rc.NodeMgmtClient.CallLifecycleStartEndpointWithReplaceIp(pod, replaceAddress)
} else {
// Either we are not replacing this pod or the relevant cassandra node
// never joined the ring in the first place and can be started normally
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeNormal, events.StartingCassandra,
"Starting Cassandra for pod %s", pod.Name)
err = rc.NodeMgmtClient.CallLifecycleStartEndpoint(pod)
}

if err != nil {
// Pod was unable to start. Most likely this is not a recoverable error, so lets kill the pod and
// try again.
if deleteErr := rc.Client.Delete(rc.Ctx, pod); deleteErr != nil {
return deleteErr
if err != nil {
// Pod was unable to start. Most likely this is not a recoverable error, so lets kill the pod and
// try again.
if deleteErr := rc.Client.Delete(rc.Ctx, pod); deleteErr != nil {
rc.ReqLogger.Error(err, "Unable to delete the pod, pod has failed to start", "Pod", pod.Name)
}
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeWarning, events.StartingCassandra,
"Failed to start pod %s, deleting it", pod.Name)
}
rc.Recorder.Eventf(rc.Datacenter, corev1.EventTypeWarning, events.StartingCassandra,
"Failed to start pod %s, deleting it", pod.Name)
return err
}
}(pod)

return rc.labelServerPodStarting(pod)
}

pkg/reconciliation/reconcile_racks_test.go (15 additions, 2 deletions)

@@ -1616,7 +1616,13 @@ func TestFailedStart(t *testing.T) {
mockClient := &mocks.Client{}
rc.Client = mockClient

k8sMockClientDelete(mockClient, nil).Times(1)
done := make(chan struct{})
k8sMockClientDelete(mockClient, nil).Once().Run(func(mock.Arguments) { close(done) })

// Patch labelStarting, lastNodeStarted..
k8sMockClientPatch(mockClient, nil).Once()
k8sMockClientStatus(mockClient, mockClient).Once()
k8sMockClientPatch(mockClient, nil).Once()

res := &http.Response{
StatusCode: http.StatusInternalServerError,
@@ -1650,7 +1656,14 @@
rc.Recorder = fakeRecorder

err := rc.startCassandra(epData, pod)
assert.Error(t, err)
	// The start is an async method, so the error is not returned here
assert.Nil(t, err)

select {
case <-done:
case <-time.After(2 * time.Second):
assert.Fail(t, "No pod delete occurred")
}

close(fakeRecorder.Events)
// Should have 2 events, one to indicate Cassandra is starting, one to indicate it failed to start