[BACKPORT] Fixed a data loss issue on lite member promotion (#17644) (#17758)

Notify cluster members on lite member promotion to update
memberGroupSize. Otherwise, a data member might not be aware of other
promoted data members, which may cause backup operations not to be
issued to the other data members.

Closes #17621
petrpleshachkov committed Dec 1, 2020
1 parent 242f5b9 commit 8509d80
Showing 3 changed files with 121 additions and 0 deletions.
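The effect of a stale memberGroupSize is easiest to see through the formula in getMaxAllowedBackupCount() in the diff below. The following is a minimal sketch of that arithmetic, not part of the commit: it assumes InternalPartition.MAX_BACKUP_COUNT is 6, and the class name BackupCountSketch is illustrative.

public class BackupCountSketch {
    // Assumed value of InternalPartition.MAX_BACKUP_COUNT.
    static final int MAX_BACKUP_COUNT = 6;

    // Mirrors the formula in getMaxAllowedBackupCount() shown in the diff below.
    static int maxAllowedBackupCount(int memberGroupsSize) {
        return Math.max(Math.min(memberGroupsSize - 1, MAX_BACKUP_COUNT), 0);
    }

    public static void main(String[] args) {
        // Stale view: a promoted member still counts only one data-member group,
        // so no backups are allowed and losing a single member loses data.
        System.out.println(maxAllowedBackupCount(1)); // prints 0
        // Refreshed view after updateMemberGroupSize(): three data members allow
        // up to two backups, so a map with backupCount 1 is fully protected.
        System.out.println(maxAllowedBackupCount(3)); // prints 2
    }
}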
@@ -285,6 +285,7 @@ String memberListString() {
}

// handles both new and left members
@SuppressWarnings("checkstyle:npathcomplexity")
void updateMembers(MembersView membersView) {
MemberMap currentMemberMap = memberMapRef.get();

@@ -294,11 +295,17 @@ void updateMembers(MembersView membersView) {

MemberImpl[] members = new MemberImpl[membersView.size()];
int memberIndex = 0;
// Indicates whether we received a notification on lite member membership change
// (e.g. its promotion to a data member)
boolean updatedLiteMember = false;
for (MemberInfo memberInfo : membersView.getMembers()) {
Address address = memberInfo.getAddress();
MemberImpl member = currentMemberMap.getMember(address);

if (member != null && member.getUuid().equals(memberInfo.getUuid())) {
if (member.isLiteMember()) {
updatedLiteMember = true;
}
member = createNewMemberImplIfChanged(memberInfo, member);
members[memberIndex++] = member;
continue;
@@ -331,6 +338,10 @@ void updateMembers(MembersView membersView) {

setMembers(MemberMap.createNew(membersView.getVersion(), members));

if (updatedLiteMember) {
node.partitionService.updateMemberGroupSize();
}

for (MemberImpl member : removedMembers) {
closeConnection(member.getAddress(), "Member left event received from master");
handleMemberRemove(memberMapRef.get(), member);
@@ -358,6 +358,10 @@ public int getMaxAllowedBackupCount() {
return max(min(getMemberGroupsSize() - 1, InternalPartition.MAX_BACKUP_COUNT), 0);
}

public void updateMemberGroupSize() {
partitionStateManager.updateMemberGroupsSize();
}

@Override
public void memberAdded(Member member) {
logger.fine("Adding " + member);
@@ -20,6 +20,7 @@
import com.hazelcast.config.Config;
import com.hazelcast.core.Cluster;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.core.IMap;
import com.hazelcast.core.Member;
import com.hazelcast.core.MemberLeftException;
import com.hazelcast.internal.cluster.impl.operations.PromoteLiteMemberOp;
@@ -64,6 +65,7 @@
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

@RunWith(HazelcastParallelClassRunner.class)
@@ -374,6 +376,110 @@ private void memberAttributes_arePreserved_afterPromotion(boolean isMaster) thro
}
}

@Test
public void test_lite_member_promotion_causes_no_data_loss_on_three_members() throws InterruptedException {
final int entryCount = 1000;

TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory();
Config config = new Config().setLiteMember(true);

// start first hazelcast instance as a lite member
final HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config);

// start second and third hazelcast instances as lite members
final HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config);
final HazelcastInstance thirdHazelcastInstance = factory.newHazelcastInstance(config);

// promote all instances to data members
firstHazelcastInstance.getCluster().promoteLocalLiteMember();
secondHazelcastInstance.getCluster().promoteLocalLiteMember();
thirdHazelcastInstance.getCluster().promoteLocalLiteMember();

// check if the cluster is in good shape
assertTrueEventually(new AssertTask() {
@Override
public void run() throws Exception {
assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe());
}
});

// insert some dummy data into the testing map
final String mapName = randomMapName();
IMap<String, String> testMap = firstHazelcastInstance.getMap(mapName);
for (int i = 0; i < entryCount; ++i) {
testMap.put("key" + i, "value" + i);
}

// check all data is correctly inserted
assertEquals(entryCount, testMap.size());

// kill second instance
secondHazelcastInstance.getLifecycleService().terminate();

// backup count for the map is set to 1
// even with 1 node down, no data loss is expected
assertTrueEventually(new AssertTask() {
@Override
public void run() throws Exception {
assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size());
}
});
assertTrueEventually(new AssertTask() {
@Override
public void run() throws Exception {
assertEquals(entryCount, thirdHazelcastInstance.getMap(mapName).size());
}
});
}

@Test
public void test_lite_member_promotion_causes_no_data_loss_on_two_members() throws InterruptedException {
final int entryCount = 1000;

TestHazelcastInstanceFactory factory = createHazelcastInstanceFactory();
Config config = new Config().setLiteMember(true);

// start first hazelcast instance as a lite member
final HazelcastInstance firstHazelcastInstance = factory.newHazelcastInstance(config);
// start second hazelcast instance as a lite member
final HazelcastInstance secondHazelcastInstance = factory.newHazelcastInstance(config);

// promote all instances to data members
firstHazelcastInstance.getCluster().promoteLocalLiteMember();

secondHazelcastInstance.getCluster().promoteLocalLiteMember();

// check if the cluster is in good shape
assertTrueEventually(new AssertTask() {
@Override
public void run() throws Exception {
assertTrue(firstHazelcastInstance.getPartitionService().isClusterSafe());
}
});

// insert some dummy data into the testing map
final String mapName = randomMapName();
IMap<String, String> testMap = firstHazelcastInstance.getMap(mapName);
for (int i = 0; i < entryCount; ++i) {
testMap.put("key" + i, "value" + i);
}

// check all data is correctly inserted
assertEquals(entryCount, testMap.size());

// kill second instance
secondHazelcastInstance.getLifecycleService().terminate();

// backup count for the map is set to 1
// even with 1 node down, no data loss is expected
assertTrueEventually(new AssertTask() {
@Override
public void run() throws Exception {
assertEquals(entryCount, firstHazelcastInstance.getMap(mapName).size());
}
});
}

private void assertPromotionInvocationStarted(HazelcastInstance instance) {
final OperationServiceImpl operationService =
(OperationServiceImpl) getNode(instance).getNodeEngine().getOperationService();
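For reference, the promote-then-verify pattern that the new tests exercise looks roughly like this in application code. This is a hedged sketch, not part of the commit: the instance and map names are illustrative, Hazelcast.newHazelcastInstance stands in for the test factory, and the polling loop stands in for the assertTrueEventually helper.

import com.hazelcast.config.Config;
import com.hazelcast.core.Hazelcast;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.core.IMap;

public class LiteMemberPromotionSketch {
    public static void main(String[] args) throws InterruptedException {
        Config config = new Config().setLiteMember(true);
        HazelcastInstance first = Hazelcast.newHazelcastInstance(config);
        HazelcastInstance second = Hazelcast.newHazelcastInstance(config);

        // Promote both lite members to data members.
        first.getCluster().promoteLocalLiteMember();
        second.getCluster().promoteLocalLiteMember();

        // Wait until partitions and their backups are in a safe state before
        // relying on redundancy; the tests use assertTrueEventually for this.
        while (!first.getPartitionService().isClusterSafe()) {
            Thread.sleep(100);
        }

        IMap<String, String> map = first.getMap("example");
        map.put("key", "value");
        // With the fix, the backup for this entry is actually created on the
        // other promoted data member, so terminating one member loses no data.

        Hazelcast.shutdownAll();
    }
}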
