Replica allocation consider no-op

This is a first step away from sync-ids. We now check if replica and primary are identical using sequence numbers when determining where to allocate a replica shard. If an index is no longer indexed into, issuing a regular flush will now be enough to ensure a no-op recovery is done. This has the nice side-effect of ensuring that closed indices and frozen indices choose existing shard copies with identical data over file-overlap comparison, increasing the chance that we end up doing a no-op recovery (only no-op and file-based recoveries are supported by closed indices). Relates to elastic#41400 and elastic#33888. Supersedes elastic#41784.
henningandersen · May 24, 2019 · 06f4d3c · 06f4d3c
1 parent cfc12b4
commit 06f4d3c
Show file tree

Hide file tree

Showing 19 changed files with 913 additions and 111 deletions.
diff --git a/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/RecoveryIT.java b/qa/rolling-upgrade/src/test/java/org/elasticsearch/upgrades/RecoveryIT.java
@@ -48,6 +48,7 @@
 import static org.elasticsearch.cluster.routing.UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider.INDEX_ROUTING_ALLOCATION_ENABLE_SETTING;
 import static org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY;
+import static org.hamcrest.Matchers.empty;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.is;
@@ -359,6 +360,7 @@ public void testRecoveryClosedIndex() throws Exception {
                 .put(INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), "100ms")
                 .put(SETTING_ALLOCATION_MAX_RETRY.getKey(), "0") // fail faster
                 .build());
+            indexDocs(indexName, 0, randomInt(10));
             ensureGreen(indexName);
             closeIndex(indexName);
         }
@@ -369,6 +371,9 @@ public void testRecoveryClosedIndex() throws Exception {
             // so we expect the index to be closed and replicated
             ensureGreen(indexName);
             assertClosedIndex(indexName, true);
+            if (minimumNodeVersion().onOrAfter(Version.V_8_0_0)) { // todo: change to 7_X once backported.
+                assertNoFileBasedRecovery(indexName);
+            }
         } else {
             assertClosedIndex(indexName, false);
         }
@@ -480,4 +485,34 @@ private void assertClosedIndex(final String index, final boolean checkRoutingTab
             assertThat(XContentMapValues.extractValue("index.verified_before_close", settings), nullValue());
         }
     }
+
+    /**
+     * Asserts that every replica shard of {@code indexName} reported by the {@code _recovery} API recovered without
+     * copying any files and without replaying any translog operations, and that at least one replica was reported.
+     *
+     * @param indexName name of the index whose recoveries are inspected
+     * @throws IOException if the {@code _recovery} request fails
+     */
+    private void assertNoFileBasedRecovery(String indexName) throws IOException {
+        Map<String, Object> recoveries = entityAsMap(client()
+            .performRequest(new Request("GET", indexName + "/_recovery?detailed=true")));
+
+        @SuppressWarnings("unchecked")
+        List<Map<String, ?>> shards = (List<Map<String,?>>) XContentMapValues.extractValue(indexName + ".shards", recoveries);
+        assertNotNull(shards);
+        boolean foundReplica = false;
+        for (Map<String, ?> shard : shards) {
+            // compare by value; an identity check against Boolean.FALSE relies on the parser returning the canonical instance
+            if (Boolean.FALSE.equals(shard.get("primary"))) {
+                List<?> details = (List<?>) XContentMapValues.extractValue("index.files.details", shard);
+                // once detailed recoveries works, remove this if.
+                if (details == null) {
+                    // no per-file details available: fall back to checking that all files were reused
+                    long totalFiles = ((Number) XContentMapValues.extractValue("index.files.total", shard)).longValue();
+                    long reusedFiles = ((Number) XContentMapValues.extractValue("index.files.reused", shard)).longValue();
+                    assertEquals(totalFiles, reusedFiles);
+                } else {
+                    assertThat(details, empty());
+                }
+
+                long translogRecovered = ((Number) XContentMapValues.extractValue("translog.recovered", shard)).longValue();
+                assertEquals("must be noop", 0, translogRecovered);
+                foundReplica = true;
+            }
+        }
+
+        assertTrue("must find replica", foundReplica);
+    }
 }
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/NodeAllocationResult.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/NodeAllocationResult.java
@@ -228,6 +228,8 @@ public String getAllocationId() {
          * matching sync ids are irrelevant.
          */
         public boolean hasMatchingSyncId() {
+            // TODO: this method needs a rename; it is left as is for now to avoid renaming it repeatedly until we have full
+            // seqno based recovery.
             return matchingBytes == Long.MAX_VALUE;
         }
 
@@ -274,6 +276,10 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
                     builder.field("allocation_id", allocationId);
                 }
                 if (matchingBytes >= 0) {
+                    // TODO: we should eventually either distinguish between sync-id and non sync-id equivalent closed shard allocation or
+                    // rename this to synced_match
+                    // left this for now, since it changes the API and should preferably be handled together with seqno based
+                    // replica shard allocation, consisting of whether this will be ops based and how many ops to recover.
                     if (hasMatchingSyncId()) {
                         builder.field("matching_sync_id", true);
                     } else {

diff --git a/server/src/main/java/org/elasticsearch/gateway/AsyncShardFetch.java b/server/src/main/java/org/elasticsearch/gateway/AsyncShardFetch.java
@@ -231,6 +231,13 @@ protected synchronized void processAsyncFetch(List<T> responses, List<FailedNode
      */
     protected abstract void reroute(ShardId shardId, String reason);
 
+    /**
+     * Clear cache for node, ensuring next fetch will fetch a fresh copy.
+     *
+     * @param nodeId id of the node whose entry is removed from the cache
+     */
+    public synchronized void clearCacheForNode(String nodeId) {
+        cache.remove(nodeId);
+    }
+
     /**
      * Fills the shard fetched data with new (data) nodes and a fresh NodeEntry, and removes from
      * it nodes that are no longer part of the state.

diff --git a/server/src/main/java/org/elasticsearch/gateway/GatewayAllocator.java b/server/src/main/java/org/elasticsearch/gateway/GatewayAllocator.java
@@ -23,6 +23,8 @@
 import org.apache.logging.log4j.Logger;
 import org.elasticsearch.action.support.nodes.BaseNodeResponse;
 import org.elasticsearch.action.support.nodes.BaseNodesResponse;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.RoutingNodes;
 import org.elasticsearch.cluster.routing.RoutingService;
 import org.elasticsearch.cluster.routing.ShardRouting;
@@ -35,7 +37,11 @@
 import org.elasticsearch.index.shard.ShardId;
 import org.elasticsearch.indices.store.TransportNodesListShardStoreMetaData;
 
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.ConcurrentMap;
 
 public class GatewayAllocator {
@@ -52,6 +58,9 @@ public class GatewayAllocator {
     private final ConcurrentMap<ShardId, AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>>
         asyncFetchStore = ConcurrentCollections.newConcurrentMap();
 
+    // contains ephemeralIds
+    private volatile Set<String> lastDataNodes = Collections.emptySet();
+
     @Inject
     public GatewayAllocator(RoutingService routingService,
                             TransportNodesListGatewayStartedShards startedAction,
@@ -101,6 +110,7 @@ public void applyFailedShards(final RoutingAllocation allocation, final List<Fai
     }
 
     public void allocateUnassigned(final RoutingAllocation allocation) {
+        // drop cached primary store data first if a new data node has appeared since the last check
+        ensureAsyncFetchStorePrimaryRecency(allocation);
         innerAllocatedUnassigned(allocation, primaryShardAllocator, replicaShardAllocator);
     }
 
@@ -128,6 +138,43 @@ public AllocateUnassignedDecision decideUnassignedShardAllocation(ShardRouting u
         }
     }
 
+    /**
+     * Invalidates the cached primary store data of every store fetch as soon as a data node shows up that was not present
+     * at the previous check. The primary information used afterwards is then at least as recent as the start of that node,
+     * which reduces the risk of making a decision on stale information from the primary.
+     */
+    private void ensureAsyncFetchStorePrimaryRecency(RoutingAllocation allocation) {
+        final DiscoveryNodes discoveryNodes = allocation.nodes();
+        if (hasNewNodes(discoveryNodes, lastDataNodes) == false) {
+            return;
+        }
+        for (AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch : asyncFetchStore.values()) {
+            clearCacheForPrimary(fetch, allocation);
+        }
+        // rebuild the set from scratch, which also (lazily) forgets nodes that are gone.
+        final Set<String> currentEphemeralIds = new HashSet<>(discoveryNodes.getDataNodes().size());
+        final Iterator<DiscoveryNode> dataNodes = discoveryNodes.getDataNodes().valuesIt();
+        while (dataNodes.hasNext()) {
+            currentEphemeralIds.add(dataNodes.next().getEphemeralId());
+        }
+        this.lastDataNodes = currentEphemeralIds;
+    }
+
+    /**
+     * Clears the data that {@code fetch} has cached for the node holding the active primary of the fetch's shard.
+     * Does nothing when the shard has no active primary.
+     */
+    private void clearCacheForPrimary(AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch,
+                                      RoutingAllocation allocation) {
+        final ShardRouting activePrimary = allocation.routingNodes().activePrimary(fetch.shardId);
+        if (activePrimary == null) {
+            return;
+        }
+        fetch.clearCacheForNode(activePrimary.currentNodeId());
+    }
+
+    /**
+     * Returns {@code true} if at least one data node in {@code nodes} has an ephemeral id that is not contained in
+     * {@code lastDataNodes}. The first such node is logged at trace level.
+     */
+    private boolean hasNewNodes(DiscoveryNodes nodes, Set<String> lastDataNodes) {
+        final Iterator<DiscoveryNode> dataNodes = nodes.getDataNodes().valuesIt();
+        while (dataNodes.hasNext()) {
+            final DiscoveryNode candidate = dataNodes.next();
+            if (lastDataNodes.contains(candidate.getEphemeralId())) {
+                continue;
+            }
+            logger.trace("new node {} found, clearing primary async-fetch-store cache", candidate);
+            return true;
+        }
+        return false;
+    }
+
     class InternalAsyncFetch<T extends BaseNodeResponse> extends AsyncShardFetch<T> {
 
         InternalAsyncFetch(Logger logger, String type, ShardId shardId, Lister<? extends BaseNodesResponse<T>, T> action) {

diff --git a/server/src/main/java/org/elasticsearch/gateway/ReplicaShardAllocator.java b/server/src/main/java/org/elasticsearch/gateway/ReplicaShardAllocator.java
@@ -49,7 +49,6 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Objects;
 
 import static org.elasticsearch.cluster.routing.UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING;
 
@@ -101,17 +100,16 @@ public void processExistingRecoveries(RoutingAllocation allocation) {
                     DiscoveryNode currentNode = allocation.nodes().get(shard.currentNodeId());
                     DiscoveryNode nodeWithHighestMatch = matchingNodes.getNodeWithHighestMatch();
                     // current node will not be in matchingNodes as it is filtered away by SameShardAllocationDecider
-                    final String currentSyncId;
+                    final TransportNodesListShardStoreMetaData.StoreFilesMetaData currentStore;
                     if (shardStores.getData().containsKey(currentNode)) {
-                        currentSyncId = shardStores.getData().get(currentNode).storeFilesMetaData().syncId();
+                        currentStore = shardStores.getData().get(currentNode).storeFilesMetaData();
                     } else {
-                        currentSyncId = null;
+                        currentStore = null;
                     }
                     if (currentNode.equals(nodeWithHighestMatch) == false
-                            && Objects.equals(currentSyncId, primaryStore.syncId()) == false
-                            && matchingNodes.isNodeMatchBySyncID(nodeWithHighestMatch)) {
-                        // we found a better match that has a full sync id match, the existing allocation is not fully synced
-                        // so we found a better one, cancel this one
+                            && isNoopRecovery(primaryStore, currentStore) == false
+                            && matchingNodes.isNoopRecovery(nodeWithHighestMatch)) {
+                        // we found a better match that can do a fast recovery, cancel current recovery
                         logger.debug("cancelling allocation of replica on [{}], sync id match found on node [{}]",
                                 currentNode, nodeWithHighestMatch);
                         UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.REALLOCATED_REPLICA,
@@ -363,10 +361,7 @@ private MatchingNodes findMatchingNodes(ShardRouting shard, RoutingAllocation al
 
     private static long computeMatchingBytes(TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore,
                                              TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData) {
-        String primarySyncId = primaryStore.syncId();
-        String replicaSyncId = storeFilesMetaData.syncId();
-        // see if we have a sync id we can make use of
-        if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
+        if (isNoopRecovery(primaryStore, storeFilesMetaData)) {
             return Long.MAX_VALUE;
         } else {
             long sizeMatched = 0;
@@ -380,6 +375,34 @@ private static long computeMatchingBytes(TransportNodesListShardStoreMetaData.St
         }
     }
 
+    /**
+     * Is a "noop recovery", which means expecting no operations to recover (though with sync-id, we could in principle still
+     * have a few).
+     *
+     * @param primaryStore   store metadata of the primary shard copy
+     * @param candidateStore store metadata of the copy under consideration; may be {@code null} when no store data is
+     *                       available for the node, in which case the result is {@code false}
+     * @return {@code true} if the two stores match by sync-id or by sequence numbers
+     */
+    private static boolean isNoopRecovery(TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore,
+                                          TransportNodesListShardStoreMetaData.StoreFilesMetaData candidateStore) {
+        if (candidateStore == null) {
+            // processExistingRecoveries passes null when the fetched shard stores hold no entry for the current node;
+            // without this guard syncIdMatch would throw a NullPointerException instead of reporting "no match".
+            return false;
+        }
+        // keeping syncIdMatch for 7.x to remain backwards compatible with pre-7.2 versions, but will remove for 8.0.
+        return syncIdMatch(primaryStore, candidateStore)
+            || noopMatch(primaryStore, candidateStore);
+    }
+
+    /**
+     * Returns whether the candidate store has a sync id and that sync id equals the one of the primary store.
+     */
+    private static boolean syncIdMatch(TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore,
+                                       TransportNodesListShardStoreMetaData.StoreFilesMetaData candidateStore) {
+        final String idOnPrimary = primaryStore.syncId();
+        final String idOnCandidate = candidateStore.syncId();
+        if (idOnCandidate == null) {
+            return false;
+        }
+        return idOnCandidate.equals(idOnPrimary);
+    }
+
+    /**
+     * Sequence number based match: the primary must have sequence number info, both stores must report the same max
+     * sequence number, the primary must be able to provide operations from the sequence number the candidate requires,
+     * and that required sequence number must be exactly one past the primary's max sequence number.
+     */
+    private static boolean noopMatch(TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryStore,
+                                     TransportNodesListShardStoreMetaData.StoreFilesMetaData candidateStore) {
+        // We need the maxSeqNo conditions until we support non-noop recovery for closed indices (and preferably also have
+        // retention leases in place to ensure ops based recovery will actually be performed).
+        if (primaryStore.hasSeqNoInfo() == false) {
+            return false;
+        }
+        if (primaryStore.maxSeqNo() != candidateStore.maxSeqNo()) {
+            return false;
+        }
+        if (primaryStore.provideRecoverySeqNo() > candidateStore.requireRecoverySeqNo()) {
+            return false;
+        }
+        return candidateStore.requireRecoverySeqNo() == primaryStore.maxSeqNo() + 1;
+    }
+
     protected abstract AsyncShardFetch.FetchResult<NodeStoreFilesMetaData> fetchData(ShardRouting shard, RoutingAllocation allocation);
 
     /**
@@ -418,7 +441,10 @@ public DiscoveryNode getNodeWithHighestMatch() {
             return this.nodeWithHighestMatch;
         }
 
-        public boolean isNodeMatchBySyncID(DiscoveryNode node) {
+        /**
+         * Returns whether recovering onto the supplied node counts as a no-operations recovery (either a sync-id match or
+         * a sequence number match), i.e. whether the size recorded for the node equals {@code Long.MAX_VALUE}.
+         */
+        public boolean isNoopRecovery(DiscoveryNode node) {
             return nodesToSize.get(node) == Long.MAX_VALUE;
         }