From 897583fad586326576869a964e611aa379c3bcba Mon Sep 17 00:00:00 2001 From: Anubhav Agarwal Date: Thu, 26 Mar 2026 22:35:51 +0530 Subject: [PATCH 1/3] Add TestInstanceOperationEvacuationCancel with independent tests Extract 3 evacuation-cancel tests from TestInstanceOperation into TestInstanceOperationEvacuationCancel extending TestInstanceOperationBase: - testEvacuateAndCancelBeforeBootstrapFinish (cancel during slow bootstrap) - testEvacuateAndCancelBeforeDropFinish (cancel during slow drop) - testMarkEvacuationAfterEMM (evacuate during/after maintenance mode) All tests run independently with no dependsOnMethods chains. Tests call enabledTopologyAwareRebalance() explicitly. Both bootstrap and drop tests create extra MasterSlave resources to match the original test environment. testMarkEvacuationAfterEMM uses a 60s verifier for isReadyForPreparing JoiningCluster since delayed drop transitions need time to complete. TestInstanceOperation is not modified. Made-with: Cursor --- ...TestInstanceOperationEvacuationCancel.java | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java new file mode 100644 index 0000000000..1e7e53b9ac --- /dev/null +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java @@ -0,0 +1,167 @@ +package org.apache.helix.integration.rebalancer; + +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.helix.constants.InstanceConstants; +import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy; +import org.apache.helix.model.ExternalView; +import org.testng.Assert; +import org.testng.annotations.Test; + + +public class TestInstanceOperationEvacuationCancel extends TestInstanceOperationBase { + + @Test + public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { + System.out.println( + "START TestInstanceOperationEvacuationCancel.testEvacuateAndCancelBeforeBootstrapFinish() at " + + new Date(System.currentTimeMillis())); + + enabledTopologyAwareRebalance(); + + createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB3_DELAYED_CRUSHED", "MasterSlave", + PARTITIONS, REPLICA, REPLICA - 1, 200000, CrushEdRebalanceStrategy.class.getName()); + _allDBs.add("TEST_DB3_DELAYED_CRUSHED"); + createResourceWithWagedRebalance(CLUSTER_NAME, "TEST_DB4_DELAYED_WAGED", "MasterSlave", + PARTITIONS, REPLICA, REPLICA - 1); + _allDBs.add("TEST_DB4_DELAYED_WAGED"); + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + _stateModelDelay = -10000L; + String instanceToEvacuate = _participants.get(0).getInstanceName(); + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, + InstanceConstants.InstanceOperation.EVACUATE); + + for (String participant : _participantNames) { + if (participant.equals(instanceToEvacuate)) { + continue; + } + verifier(() -> ((_dataAccessor.getChildNames( + _dataAccessor.keyBuilder().messages(participant))).isEmpty()), 30000); + } + Assert.assertFalse(_admin.isEvacuateFinished(CLUSTER_NAME, instanceToEvacuate)); + Assert.assertFalse(_admin.isReadyForPreparingJoiningCluster(CLUSTER_NAME, instanceToEvacuate)); + + Thread.sleep(Math.abs(_stateModelDelay / 100)); + Map assignment = getEVs(); + for (String resource : _allDBs) { + validateAssignmentInEv(assignment.get(resource)); + } + + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, + InstanceConstants.InstanceOperation.ENABLE); + + assignment = getEVs(); + for (String resource : _allDBs) { + validateAssignmentInEv(assignment.get(resource)); + } + + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + assignment = getEVs(); + for (String resource : _allDBs) { + Assert.assertTrue( + getParticipantsInEv(assignment.get(resource)).containsAll(_participantNames)); + validateAssignmentInEv(assignment.get(resource)); + } + } + + @Test + public void testEvacuateAndCancelBeforeDropFinish() throws Exception { + System.out.println( + "START TestInstanceOperationEvacuationCancel.testEvacuateAndCancelBeforeDropFinish() at " + + new Date(System.currentTimeMillis())); + + enabledTopologyAwareRebalance(); + + createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB3_DELAYED_CRUSHED", "MasterSlave", + PARTITIONS, REPLICA, REPLICA - 1, 200000, CrushEdRebalanceStrategy.class.getName()); + _allDBs.add("TEST_DB3_DELAYED_CRUSHED"); + createResourceWithWagedRebalance(CLUSTER_NAME, "TEST_DB4_DELAYED_WAGED", "MasterSlave", + PARTITIONS, REPLICA, REPLICA - 1); + _allDBs.add("TEST_DB4_DELAYED_WAGED"); + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + _stateModelDelay = 10000L; + + String instanceToEvacuate = _participants.get(0).getInstanceName(); + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, + InstanceConstants.InstanceOperation.EVACUATE); + + verifier(() -> ((_dataAccessor.getChildNames( + _dataAccessor.keyBuilder().messages(instanceToEvacuate))).isEmpty()), 30000); + Assert.assertFalse(_admin.isEvacuateFinished(CLUSTER_NAME, instanceToEvacuate)); + + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, + InstanceConstants.InstanceOperation.ENABLE); + Map assignment = getEVs(); + for (String resource : _allDBs) { + validateAssignmentInEv(assignment.get(resource)); + } + + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + assignment = getEVs(); + for (String resource : _allDBs) { + Assert.assertTrue( + getParticipantsInEv(assignment.get(resource)).containsAll(_participantNames)); + validateAssignmentInEv(assignment.get(resource)); + } + } + + @Test + public void testMarkEvacuationAfterEMM() throws Exception { + System.out.println( + "START TestInstanceOperationEvacuationCancel.testMarkEvacuationAfterEMM() at " + + new Date(System.currentTimeMillis())); + + enabledTopologyAwareRebalance(); + + _stateModelDelay = 1000L; + Assert.assertFalse( + _gSetupTool.getClusterManagementTool().isInMaintenanceMode(CLUSTER_NAME)); + _gSetupTool.getClusterManagementTool() + .manuallyEnableMaintenanceMode(CLUSTER_NAME, true, null, null); + String newParticipantName = PARTICIPANT_PREFIX + "_" + _nextStartPort.get(); + addParticipant(newParticipantName); + + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + Map assignment = getEVs(); + for (String resource : _allDBs) { + Assert.assertFalse( + getParticipantsInEv(assignment.get(resource)).contains(newParticipantName)); + } + + String instanceToEvacuate = _participants.get(0).getInstanceName(); + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, + InstanceConstants.InstanceOperation.EVACUATE); + + for (String resource : _allDBs) { + Assert.assertTrue( + getParticipantsInEv(assignment.get(resource)).contains(instanceToEvacuate)); + } + + _gSetupTool.getClusterManagementTool() + .manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); + + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + assignment = getEVs(); + List currentActiveInstances = + _participantNames.stream().filter(n -> !n.equals(instanceToEvacuate)) + .collect(Collectors.toList()); + for (String resource : _allDBs) { + validateAssignmentInEv(assignment.get(resource)); + Set newPAssignedParticipants = getParticipantsInEv(assignment.get(resource)); + Assert.assertFalse(newPAssignedParticipants.contains(instanceToEvacuate)); + Assert.assertTrue(newPAssignedParticipants.containsAll(currentActiveInstances)); + } + verifier( + () -> _admin.isReadyForPreparingJoiningCluster(CLUSTER_NAME, instanceToEvacuate), + 60000); + } +} From deb4797e29f5b2504b2e7f292f786db63067eb32 Mon Sep 17 00:00:00 2001 From: Anubhav Agarwal Date: Fri, 27 Mar 2026 09:00:13 +0530 Subject: [PATCH 2/3] Improve readability of TestInstanceOperationEvacuationCancel Extract duplicated resource creation into createAdditionalMasterSlaveResources() helper method. Add comments explaining _stateModelDelay sign convention (negative = slow upward transitions, positive = slow downward transitions) and why cluster convergence succeeds only because state transition cancellation is triggered by the ENABLE operation. Made-with: Cursor --- ...TestInstanceOperationEvacuationCancel.java | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java index 1e7e53b9ac..bc7ab8a9c9 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java @@ -15,14 +15,11 @@ public class TestInstanceOperationEvacuationCancel extends TestInstanceOperationBase { - @Test - public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { - System.out.println( - "START TestInstanceOperationEvacuationCancel.testEvacuateAndCancelBeforeBootstrapFinish() at " - + new Date(System.currentTimeMillis())); - - enabledTopologyAwareRebalance(); - + /** + * Adds two extra MasterSlave resources to increase the partition count so that + * evacuation has enough work to avoid completing prematurely during cancel tests. + */ + private void createAdditionalMasterSlaveResources() throws InterruptedException { createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB3_DELAYED_CRUSHED", "MasterSlave", PARTITIONS, REPLICA, REPLICA - 1, 200000, CrushEdRebalanceStrategy.class.getName()); _allDBs.add("TEST_DB3_DELAYED_CRUSHED"); @@ -30,7 +27,18 @@ public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { PARTITIONS, REPLICA, REPLICA - 1); _allDBs.add("TEST_DB4_DELAYED_WAGED"); Assert.assertTrue(_clusterVerifier.verifyByPolling()); + } + @Test + public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { + System.out.println( + "START TestInstanceOperationEvacuationCancel.testEvacuateAndCancelBeforeBootstrapFinish() at " + + new Date(System.currentTimeMillis())); + + enabledTopologyAwareRebalance(); + createAdditionalMasterSlaveResources(); + + // Negative delay = slow upward transitions (OFFLINE→SLAVE, SLAVE→MASTER) _stateModelDelay = -10000L; String instanceToEvacuate = _participants.get(0).getInstanceName(); _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, @@ -60,6 +68,8 @@ public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { validateAssignmentInEv(assignment.get(resource)); } + // ST delay exceeds verifier timeout, so convergence only succeeds because + // cancelling the evacuation (ENABLE) triggers state transition cancellation. Assert.assertTrue(_clusterVerifier.verifyByPolling()); assignment = getEVs(); @@ -77,15 +87,9 @@ public void testEvacuateAndCancelBeforeDropFinish() throws Exception { + new Date(System.currentTimeMillis())); enabledTopologyAwareRebalance(); + createAdditionalMasterSlaveResources(); - createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB3_DELAYED_CRUSHED", "MasterSlave", - PARTITIONS, REPLICA, REPLICA - 1, 200000, CrushEdRebalanceStrategy.class.getName()); - _allDBs.add("TEST_DB3_DELAYED_CRUSHED"); - createResourceWithWagedRebalance(CLUSTER_NAME, "TEST_DB4_DELAYED_WAGED", "MasterSlave", - PARTITIONS, REPLICA, REPLICA - 1); - _allDBs.add("TEST_DB4_DELAYED_WAGED"); - Assert.assertTrue(_clusterVerifier.verifyByPolling()); - + // Positive delay = slow downward transitions (MASTER→SLAVE, SLAVE→OFFLINE, OFFLINE→DROPPED) _stateModelDelay = 10000L; String instanceToEvacuate = _participants.get(0).getInstanceName(); From e2cb4daa109d23da075457a172e5ecf361112957 Mon Sep 17 00:00:00 2001 From: Anubhav Agarwal Date: Thu, 16 Apr 2026 12:55:41 +0530 Subject: [PATCH 3/3] addresss comments --- .../rebalancer/TestInstanceOperationEvacuationCancel.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java index bc7ab8a9c9..11ff5b9478 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperationEvacuationCancel.java @@ -167,5 +167,12 @@ public void testMarkEvacuationAfterEMM() throws Exception { verifier( () -> _admin.isReadyForPreparingJoiningCluster(CLUSTER_NAME, instanceToEvacuate), 60000); + + // Stop the evacuated participant so beforeMethod can clean it up. + // Without this, the instance stays connected in EVACUATE state and + // _participants grows unboundedly across test runs. + _participants.stream() + .filter(p -> p.getInstanceName().equals(instanceToEvacuate)) + .findFirst().ifPresent(p -> p.syncStop()); } }