
ISPN-2897 NPE in DefaultConsistentHash

If any partition is rebalancing during a merge, the merge coordinator
should end that rebalance and start a new one with all the members.
commit 5731ed9a4566851eed71b8d9ed1b07cd3f1f46f0 (1 parent: 2937d82)
danberindei authored; Mircea Markus committed
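
In outline: after a merge, the coordinator unions only the partitions' stable (current) consistent hashes, ignores any pending CH from an in-flight rebalance, ends that rebalance, installs the merged topology, and triggers a fresh rebalance with the full membership. Below is a minimal, simplified sketch of that flow; it is illustrative only, and the authoritative version is the ClusterTopologyManagerImpl diff further down.

   // Illustrative sketch, condensed from the new updateCacheStatusAfterMerge()
   synchronized (cacheStatus) {
      int unionTopologyId = 0;
      ConsistentHash currentCHUnion = null;
      for (CacheTopology topology : partitionTopologies) {
         // Track the highest topology id seen in any partition
         unionTopologyId = Math.max(unionTopologyId, topology.getTopologyId());
         // Union only the current CHs; pending CHs from ongoing rebalances are ignored
         currentCHUnion = currentCHUnion == null
               ? topology.getCurrentCH()
               : chFactory.union(currentCHUnion, topology.getCurrentCH());
      }
      cacheStatus.updateClusterMembers(clusterMembers);   // drop nodes that left the cluster
      if (currentCHUnion != null) {
         currentCHUnion = chFactory.updateMembers(currentCHUnion, cacheStatus.getMembers());
      }
      if (cacheStatus.isRebalanceInProgress()) {
         cacheStatus.endRebalance();                      // cancel the partition's in-flight rebalance
      }
      // +2 keeps the id above anything any partition has seen
      cacheStatus.updateCacheTopology(new CacheTopology(unionTopologyId + 2, currentCHUnion, null));
   }
   broadcastConsistentHashUpdate(cacheName, cacheStatus); // also ends rebalances in other partitions
   triggerRebalance(cacheName);                           // start a fresh rebalance with all the members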
core/src/main/java/org/infinispan/topology/ClusterCacheStatus.java (2 changed lines)
@@ -142,7 +142,7 @@ public boolean updateClusterMembers(List<Address> newClusterMembers) {
synchronized (this) {
if (newClusterMembers.containsAll(members)) {
if (trace) log.tracef("Cluster members updated for cache %s, no leavers detected: " +
- "cache members = %s", members, newClusterMembers);
+ "cache members = %s", cacheName, newClusterMembers);
return false;
}
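
(The one-line change above is an incidental log fix: the format string's first %s is meant to hold the cache name, but the old code passed the members list there, so trace logs printed the member list twice and never named the cache. The corrected call, abridged from the hunk above:)

   log.tracef("Cluster members updated for cache %s, no leavers detected: cache members = %s",
         cacheName, newClusterMembers);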
core/src/main/java/org/infinispan/topology/ClusterTopologyManagerImpl.java (81 changed lines)
@@ -233,7 +233,7 @@ protected void handleNewView(List<Address> newMembers, boolean mergeView, int ne
for (Map.Entry<String, List<CacheTopology>> e : clusterCacheMap.entrySet()) {
String cacheName = e.getKey();
List<CacheTopology> topologyList = e.getValue();
- updateCacheStatusAfterMerge(cacheName, topologyList);
+ updateCacheStatusAfterMerge(cacheName, newMembers, topologyList);
}
} catch (InterruptedException e) {
log.tracef("Cluster state recovery interrupted because the coordinator is shutting down");
@@ -270,7 +270,8 @@ private ClusterCacheStatus initCacheStatusIfAbsent(String cacheName, CacheJoinIn
return cacheStatus;
}
- public void updateCacheStatusAfterMerge(String cacheName, List<CacheTopology> partitionTopologies)
+ public void updateCacheStatusAfterMerge(String cacheName, List<Address> clusterMembers,
+ List<CacheTopology> partitionTopologies)
throws Exception {
log.tracef("Initializing rebalance policy for cache %s, pre-existing partitions are %s",
cacheName, partitionTopologies);
@@ -278,61 +279,47 @@ public void updateCacheStatusAfterMerge(String cacheName, List<CacheTopology> pa
if (partitionTopologies.isEmpty())
return;
- int unionTopologyId = 0;
- ConsistentHash currentCHUnion = null;
- ConsistentHash pendingCHUnion = null;
- ConsistentHashFactory chFactory = cacheStatus.getJoinInfo().getConsistentHashFactory();
- for (CacheTopology topology : partitionTopologies) {
- if (topology.getTopologyId() > unionTopologyId) {
- unionTopologyId = topology.getTopologyId();
- }
- if (currentCHUnion == null) {
- currentCHUnion = topology.getCurrentCH();
- } else {
- currentCHUnion = chFactory.union(currentCHUnion, topology.getCurrentCH());
- }
+ synchronized (cacheStatus) {
+ int unionTopologyId = 0;
+ // We only use the currentCH, we ignore any ongoing rebalance in the partitions
+ ConsistentHash currentCHUnion = null;
+ ConsistentHashFactory chFactory = cacheStatus.getJoinInfo().getConsistentHashFactory();
+ for (CacheTopology topology : partitionTopologies) {
+ if (topology.getTopologyId() > unionTopologyId) {
+ unionTopologyId = topology.getTopologyId();
+ }
- if (pendingCHUnion == null) {
- pendingCHUnion = topology.getPendingCH();
- } else {
- if (topology.getPendingCH() != null)
- pendingCHUnion = chFactory.union(pendingCHUnion, topology.getPendingCH());
+ if (currentCHUnion == null) {
+ currentCHUnion = topology.getCurrentCH();
+ } else {
+ currentCHUnion = chFactory.union(currentCHUnion, topology.getCurrentCH());
+ }
}
- }
- // We have added each node to the cache status when we received its status response
- List<Address> members = cacheStatus.getMembers();
- if (currentCHUnion != null) {
- currentCHUnion = chFactory.updateMembers(currentCHUnion, members);
- }
- if (pendingCHUnion != null) {
- pendingCHUnion = chFactory.updateMembers(pendingCHUnion, members);
- }
+ // We have added each node to the cache status when we received its status response
+ List<Address> members = cacheStatus.getMembers();
+ // Filter out any nodes that aren't members of the cluster any more
+ cacheStatus.updateClusterMembers(clusterMembers);
+ if (currentCHUnion != null) {
+ currentCHUnion = chFactory.updateMembers(currentCHUnion, members);
+ }
- // Make sure the topology id is higher than any topology id we had before in the cluster
- unionTopologyId += 2;
- CacheTopology cacheTopology = new CacheTopology(unionTopologyId, currentCHUnion, pendingCHUnion);
- boolean wasRebalanceInProgress = pendingCHUnion != null;
+ // Make sure the topology id is higher than any topology id we had before in the cluster
+ unionTopologyId += 2;
+ CacheTopology cacheTopology = new CacheTopology(unionTopologyId, currentCHUnion, null);
- synchronized (cacheStatus) {
- // TODO Deal with members had joined in a partition, but which did not start receiving data yet
- // (i.e. they weren't in the current or in the pending CH)
- cacheStatus.setMembers(cacheTopology.getMembers());
- if (wasRebalanceInProgress) {
- cacheStatus.startRebalance(cacheTopology);
- } else {
- cacheStatus.updateCacheTopology(cacheTopology);
+ // End any running rebalance
+ if (cacheStatus.isRebalanceInProgress()) {
+ cacheStatus.endRebalance();
}
+ cacheStatus.updateCacheTopology(cacheTopology);
}
+ // End any rebalance that was running in the other partitions
broadcastConsistentHashUpdate(cacheName, cacheStatus);
- if (wasRebalanceInProgress) {
- broadcastRebalanceStart(cacheName, cacheStatus);
- } else {
- // Trigger another rebalance in case the CH is not balanced (even though there was no rebalance in progress)
- triggerRebalance(cacheName);
- }
+ // Trigger another rebalance in case the CH is not balanced
+ triggerRebalance(cacheName);
}
private void broadcastConsistentHashUpdate(String cacheName, ClusterCacheStatus cacheStatus) throws Exception {
core/src/test/java/org/infinispan/statetransfer/ClusterTopologyManagerTest.java (204 changed lines)
@@ -22,33 +22,59 @@
*/
package org.infinispan.statetransfer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.locks.Condition;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
import org.infinispan.Cache;
import org.infinispan.configuration.cache.CacheMode;
import org.infinispan.configuration.cache.ConfigurationBuilder;
-import org.infinispan.configuration.global.GlobalConfigurationBuilder;
+import org.infinispan.manager.EmbeddedCacheManager;
+import org.infinispan.remoting.transport.Address;
import org.infinispan.test.MultipleCacheManagersTest;
import org.infinispan.test.TestingUtil;
import org.infinispan.test.fwk.CleanupAfterMethod;
import org.infinispan.test.fwk.TransportFlags;
+import org.infinispan.topology.CacheTopology;
+import org.infinispan.topology.LocalTopologyManager;
import org.infinispan.util.Util;
import org.jgroups.protocols.DISCARD;
+import org.mockito.invocation.InvocationOnMock;
+import org.mockito.stubbing.Answer;
import org.testng.annotations.Test;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyInt;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.doAnswer;
+import static org.mockito.Mockito.spy;
+
@Test(groups = "functional", testName = "statetransfer.ClusterTopologyManagerTest")
@CleanupAfterMethod
public class ClusterTopologyManagerTest extends MultipleCacheManagersTest {
+ public static final String CACHE_NAME = "cache";
+ private ConfigurationBuilder defaultConfig;
Cache c1, c2, c3;
DISCARD d1, d2, d3;
@Override
protected void createCacheManagers() throws Throwable {
- ConfigurationBuilder defaultConfig = getDefaultClusteredCacheConfig(CacheMode.DIST_SYNC, false);
+ defaultConfig = getDefaultClusteredCacheConfig(CacheMode.DIST_SYNC, false);
createClusteredCaches(3, defaultConfig, new TransportFlags().withFD(true).withMerge(true));
- c1 = cache(0, "cache");
- c2 = cache(1, "cache");
- c3 = cache(2, "cache");
+ c1 = cache(0, CACHE_NAME);
+ c2 = cache(1, CACHE_NAME);
+ c3 = cache(2, CACHE_NAME);
d1 = TestingUtil.getDiscardForCache(c1);
d1.setExcludeItself(true);
d2 = TestingUtil.getDiscardForCache(c2);
@@ -91,7 +117,7 @@ public void testNodeAbruptLeave() throws Exception {
// Check that a new node can join
ConfigurationBuilder defaultConfig = getDefaultClusteredCacheConfig(CacheMode.DIST_SYNC, false);
addClusterEnabledCacheManager(defaultConfig, new TransportFlags().withFD(true).withMerge(true));
- Cache<Object, Object> c4 = cache(3, "cache");
+ Cache<Object, Object> c4 = cache(3, CACHE_NAME);
TestingUtil.blockUntilViewsReceived(30000, true, c1, c2, c4);
TestingUtil.waitForRehashToComplete(c1, c2, c4);
@@ -124,7 +150,7 @@ public void testClusterRecoveryAfterCoordLeave() throws Exception {
// Check that a new node can join
ConfigurationBuilder defaultConfig = getDefaultClusteredCacheConfig(CacheMode.DIST_SYNC, false);
addClusterEnabledCacheManager(defaultConfig, new TransportFlags().withFD(true).withMerge(true));
- Cache<Object, Object> c4 = cache(3, "cache");
+ Cache<Object, Object> c4 = cache(3, CACHE_NAME);
TestingUtil.blockUntilViewsReceived(30000, true, c2, c3, c4);
TestingUtil.waitForRehashToComplete(c2, c3, c4);
}
@@ -162,7 +188,7 @@ public void testClusterRecoveryAfterThreeWaySplit() throws Exception {
// Check that a new node can join
ConfigurationBuilder defaultConfig = getDefaultClusteredCacheConfig(CacheMode.DIST_SYNC, false);
addClusterEnabledCacheManager(defaultConfig, new TransportFlags().withFD(true).withMerge(true));
- Cache<Object, Object> c4 = cache(3, "cache");
+ Cache<Object, Object> c4 = cache(3, CACHE_NAME);
TestingUtil.blockUntilViewsReceived(30000, true, c1, c2, c3, c4);
TestingUtil.waitForRehashToComplete(c1, c2, c3, c4);
}
@@ -202,8 +228,168 @@ public void testClusterRecoveryAfterSplitAndCoordLeave() throws Exception {
// Check that a new node can join
ConfigurationBuilder defaultConfig = getDefaultClusteredCacheConfig(CacheMode.DIST_SYNC, false);
addClusterEnabledCacheManager(defaultConfig, new TransportFlags().withFD(true).withMerge(true));
- Cache<Object, Object> c4 = cache(3, "cache");
+ Cache<Object, Object> c4 = cache(3, CACHE_NAME);
TestingUtil.blockUntilViewsReceived(30000, true, c2, c3, c4);
TestingUtil.waitForRehashToComplete(c2, c3, c4);
}
+
+ public void testClusterRecoveryWithRebalance() throws Exception {
+ // Compute the merge coordinator by sorting the JGroups addresses, the same way MERGE2/3 do
+ List<Address> members = new ArrayList<Address>(manager(0).getMembers());
+ Collections.sort(members);
+ Address mergeCoordAddress = members.get(0);
+ log.debugf("The merge coordinator will be %s", mergeCoordAddress);
+ EmbeddedCacheManager mergeCoordManager = manager(mergeCoordAddress);
+ int mergeCoordIndex = cacheManagers.indexOf(mergeCoordManager);
+
+ // create the partitions
+ log.debugf("Splitting the cluster in three");
+ d1.setDiscardAll(true);
+ d2.setDiscardAll(true);
+ d3.setDiscardAll(true);
+
+ // wait for the coordinator to be separated (don't care about the others)
+ TestingUtil.blockUntilViewsReceived(30000, false, c1);
+ TestingUtil.blockUntilViewsReceived(30000, false, c2);
+ TestingUtil.blockUntilViewsReceived(30000, false, c3);
+ TestingUtil.waitForRehashToComplete(c1);
+ TestingUtil.waitForRehashToComplete(c2);
+ TestingUtil.waitForRehashToComplete(c3);
+
+ // Disable DISCARD *only* on the merge coordinator
+ if (mergeCoordIndex == 0) d1.setDiscardAll(false);
+ if (mergeCoordIndex == 1) d2.setDiscardAll(false);
+ if (mergeCoordIndex == 2) d3.setDiscardAll(false);
+
+ int viewIdAfterSplit = mergeCoordManager.getTransport().getViewId();
+ final LocalTopologyManager localTopologyManager = TestingUtil.extractGlobalComponent(mergeCoordManager,
+ LocalTopologyManager.class);
+ final CheckPoint checkpoint = new CheckPoint();
+ LocalTopologyManager spyLocalTopologyManager = spy(localTopologyManager);
+ doAnswer(new Answer<Object>() {
+ @Override
+ public Object answer(InvocationOnMock invocation) throws Throwable {
+ int viewId = (Integer) invocation.getArguments()[2];
+ checkpoint.trigger("rebalance" + viewId);
+ log.debugf("Blocking the REBALANCE_START command on the merge coordinator");
+ checkpoint.awaitStrict("merge", 10, TimeUnit.SECONDS);
+ return invocation.callRealMethod();
+ }
+ }).when(spyLocalTopologyManager).handleRebalance(eq(CACHE_NAME), any(CacheTopology.class), anyInt());
+ TestingUtil.replaceComponent(mergeCoordManager, LocalTopologyManager.class, spyLocalTopologyManager, true);
+
+ final EmbeddedCacheManager cm4 = addClusterEnabledCacheManager(defaultConfig, new TransportFlags().withFD(true).withMerge(true));
+ Future<Cache<Object,Object>> cacheFuture = fork(new Callable<Cache<Object, Object>>() {
+ @Override
+ public Cache<Object, Object> call() throws Exception {
+ return cm4.getCache(CACHE_NAME);
+ }
+ });
+
+ log.debugf("Waiting for the REBALANCE_START command to reach the merge coordinator");
+ checkpoint.awaitStrict("rebalance" + (viewIdAfterSplit + 1), 10, TimeUnit.SECONDS);
+
+ // merge the partitions
+ log.debugf("Merging the cluster partitions");
+ d1.setDiscardAll(false);
+ d2.setDiscardAll(false);
+ d3.setDiscardAll(false);
+
+ // wait for the JGroups merge
+ long startTime = System.currentTimeMillis();
+ TestingUtil.blockUntilViewsReceived(30000, cacheManagers);
+
+ // unblock the REBALANCE_START command
+ log.debugf("Unblocking the REBALANCE_START command on the coordinator");
+ checkpoint.triggerForever("merge");
+
+ // wait for the 4th cache to finish joining
+ Cache<Object, Object> c4 = cacheFuture.get(30, TimeUnit.SECONDS);
+ TestingUtil.waitForRehashToComplete(c1, c2, c3, c4);
+
+ long endTime = System.currentTimeMillis();
+ log.debugf("Merge took %s", Util.prettyPrintTime(endTime - startTime));
+ assert endTime - startTime < 30000 : "Merge took too long: " + Util.prettyPrintTime(endTime - startTime);
+
+ // Check that another node can join
+ ConfigurationBuilder defaultConfig = getDefaultClusteredCacheConfig(CacheMode.DIST_SYNC, false);
+ EmbeddedCacheManager cm5 = addClusterEnabledCacheManager(defaultConfig, new TransportFlags().withFD(true).withMerge(true));
+ Cache<Object, Object> c5 = cm5.getCache(CACHE_NAME);
+ TestingUtil.blockUntilViewsReceived(30000, true, c1, c2, c3, c4, c5);
+ TestingUtil.waitForRehashToComplete(c1, c2, c3, c4, c5);
+ }
+
+}
+
+class CheckPoint {
+ private final Lock lock = new ReentrantLock();
+ private final Condition unblockCondition = lock.newCondition();
+ private final Map<String, Integer> events = new HashMap<String, Integer>();
+
+ public void awaitStrict(String event, long timeout, TimeUnit unit)
+ throws InterruptedException, TimeoutException {
+ awaitStrict(event, 1, timeout, unit);
+ }
+
+ public boolean await(String event, long timeout, TimeUnit unit) throws InterruptedException {
+ return await(event, 1, timeout, unit);
+ }
+
+ public void awaitStrict(String event, int count, long timeout, TimeUnit unit)
+ throws InterruptedException, TimeoutException {
+ if (!await(event, count, timeout, unit)) {
+ throw new TimeoutException("Timed out waiting for event " + event);
+ }
+ }
+
+ public boolean await(String event, int count, long timeout, TimeUnit unit) throws InterruptedException {
+ lock.lock();
+ try {
+ long waitNanos = unit.toNanos(timeout);
+ while (waitNanos > 0) {
+ Integer currentCount = events.get(event);
+ if (currentCount != null && currentCount >= count) {
+ events.put(event, currentCount - count);
+ break;
+ }
+ waitNanos = unblockCondition.awaitNanos(waitNanos);
+ }
+
+ if (waitNanos <= 0) {
+ // let the triggering thread know that we timed out
+ events.put(event, -1);
+ return false;
+ }
+
+ return true;
+ } finally {
+ lock.unlock();
+ }
+ }
+
+ public void trigger(String event) {
+ trigger(event, 1);
+ }
+
+ public void triggerForever(String event) {
+ trigger(event, Integer.MAX_VALUE);
+ }
+
+ public void trigger(String event, int count) {
+ lock.lock();
+ try {
+ Integer currentCount = events.get(event);
+ if (currentCount == null) {
+ currentCount = 0;
+ } else if (currentCount < 0) {
+ throw new IllegalStateException("Thread already timed out waiting for event " + event);
+ }
+
+ // If triggerForever is called more than once, it will cause an overflow and the waiters will fail.
+ events.put(event, currentCount + count);
+ unblockCondition.signalAll();
+ } finally {
+ lock.unlock();
+ }
+ }
}
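
To make the CheckPoint handshake concrete, here is a hypothetical two-thread usage sketch (the "worker" thread, the "ready"/"go" event names, and the surrounding structure are illustrative, not part of the commit). It mirrors how the test above parks the REBALANCE_START command until the merge completes: one side announces an event and blocks, the other waits for that event, does its work, then releases the blocked side. Assumes java.util.concurrent.TimeUnit is imported and checked exceptions propagate to the caller.

   final CheckPoint checkpoint = new CheckPoint();

   // Worker thread: announce progress, then wait for permission to continue
   Thread worker = new Thread(new Runnable() {
      @Override
      public void run() {
         try {
            checkpoint.trigger("ready");                        // announce arrival
            checkpoint.awaitStrict("go", 10, TimeUnit.SECONDS); // block until released
            // ... continue with the gated work ...
         } catch (Exception e) {
            throw new RuntimeException(e);
         }
      }
   });
   worker.start();

   // Controller thread: wait for the worker, act while it is parked, then release it
   checkpoint.awaitStrict("ready", 10, TimeUnit.SECONDS);
   // ... perform the action that must happen while the worker is blocked ...
   checkpoint.triggerForever("go");  // release the current waiter and any later ones
   worker.join();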
core/src/test/java/org/infinispan/test/MultipleCacheManagersTest.java (9 changed lines)
@@ -425,6 +425,15 @@ protected EmbeddedCacheManager manager(int i) {
return cacheManagers.get(i);
}
+ public EmbeddedCacheManager manager(Address a) {
+ for (EmbeddedCacheManager cm : cacheManagers) {
+ if (cm.getAddress().equals(a)) {
+ return cm;
+ }
+ }
+ throw new IllegalArgumentException(a + " is not a valid cache manager address!");
+ }
+
protected <K, V> Cache<K, V> cache(int managerIndex, String cacheName) {
return manager(managerIndex).getCache(cacheName);
}
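
The new manager(Address) overload above is what lets the rebalance test resolve the merge coordinator's EmbeddedCacheManager from a JGroups address. A typical call, mirroring the test (where mergeCoordAddress comes from sorting the view members, the same way MERGE2/MERGE3 pick a coordinator):

   Address mergeCoordAddress = members.get(0);
   EmbeddedCacheManager mergeCoordManager = manager(mergeCoordAddress);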