Improvements for running Hazelcast persistence on kubernetes [5.2.0] (#…

…22501) - Adds automated cluster state management for persistence on kubernetes - Supports cluster-wide shutdown, rolling restart and partial member recovery from failure on kubernetes [HZ-1190] [HZ-1191] [HZ-1193] - Fixes behaviour of readiness probe with persistence enabled [HZ-1349] - Allows tuning either for speedy crash recovery with FROZEN state or availability of in-memory data structures with NO_MIGRATION state for missing members [HZ-1311] - Fixes backup sync after single member crash recovery [HZ-1349] Design document in EE side: https://github.com/vbekiaris/hazelcast-enterprise/blob/enhancements/5.2/k8s-persistence/docs/design/persistence/04-persistence-kubernetes-improvements.md (cherry picked from commit 1ddc16e) 1:1 clean backport of #21844 to 5.2.0 release branch Also includes backport of #22512 Co-authored-by: Łukasz Dziedziul <lukasz.dziedziul@hazelcast.com>
hazelcast · Oct 17, 2022 · 2b4d98f · 2b4d98f
1 parent 814709e
commit 2b4d98f
Show file tree

Hide file tree

Showing 62 changed files with 1,740 additions and 96 deletions.
diff --git a/hazelcast/src/main/java/com/hazelcast/cache/impl/AbstractCacheService.java b/hazelcast/src/main/java/com/hazelcast/cache/impl/AbstractCacheService.java
@@ -61,7 +61,6 @@
 import com.hazelcast.spi.impl.eventservice.EventFilter;
 import com.hazelcast.spi.impl.eventservice.EventRegistration;
 import com.hazelcast.spi.impl.eventservice.EventService;
-import com.hazelcast.spi.impl.operationservice.Operation;
 import com.hazelcast.spi.merge.SplitBrainMergePolicy;
 import com.hazelcast.spi.merge.SplitBrainMergePolicyProvider;
 import com.hazelcast.spi.properties.ClusterProperty;
@@ -98,7 +97,7 @@
 
 @SuppressWarnings("checkstyle:classdataabstractioncoupling")
 public abstract class AbstractCacheService implements ICacheService,
-        PreJoinAwareService, PartitionAwareService,
+        PreJoinAwareService<OnJoinCacheOperation>, PartitionAwareService,
         SplitBrainProtectionAwareService, SplitBrainHandlerService,
         ClusterStateListener, TenantContextAwareService {
     /**
@@ -734,7 +733,7 @@ protected void deleteCacheResources(String name) {
     }
 
     @Override
-    public Operation getPreJoinOperation() {
+    public OnJoinCacheOperation getPreJoinOperation() {
         OnJoinCacheOperation preJoinCacheOperation;
         preJoinCacheOperation = new OnJoinCacheOperation();
         for (Map.Entry<String, CompletableFuture<CacheConfig>> cacheConfigEntry : configs.entrySet()) {

diff --git a/hazelcast/src/main/java/com/hazelcast/cache/impl/operation/OnJoinCacheOperation.java b/hazelcast/src/main/java/com/hazelcast/cache/impl/operation/OnJoinCacheOperation.java
@@ -23,6 +23,7 @@
 import com.hazelcast.nio.ObjectDataInput;
 import com.hazelcast.nio.ObjectDataOutput;
 import com.hazelcast.nio.serialization.IdentifiedDataSerializable;
+import com.hazelcast.spi.impl.AllowedDuringPassiveState;
 import com.hazelcast.spi.impl.operationservice.Operation;
 import com.hazelcast.spi.exception.ServiceNotFoundException;
 
@@ -41,7 +42,7 @@
  * resolve a race between the {@link CacheConfig} becoming available in the joining member and creation of a
  * {@link com.hazelcast.cache.ICache} proxy.
  */
-public class OnJoinCacheOperation extends Operation implements IdentifiedDataSerializable {
+public class OnJoinCacheOperation extends Operation implements IdentifiedDataSerializable, AllowedDuringPassiveState {
 
     private List<CacheConfig> configs = new ArrayList<CacheConfig>();
 

diff --git a/hazelcast/src/main/java/com/hazelcast/cp/internal/RaftService.java b/hazelcast/src/main/java/com/hazelcast/cp/internal/RaftService.java
@@ -158,8 +158,8 @@
  */
 @SuppressWarnings({"checkstyle:methodcount", "checkstyle:classfanoutcomplexity", "checkstyle:classdataabstractioncoupling"})
 public class RaftService implements ManagedService, SnapshotAwareService<MetadataRaftGroupSnapshot>, GracefulShutdownAwareService,
-                                    MembershipAwareService, PreJoinAwareService, RaftNodeLifecycleAwareService,
-                                    MigrationAwareService, DynamicMetricsProvider,
+                                    MembershipAwareService, PreJoinAwareService<RaftServicePreJoinOp>,
+                                    RaftNodeLifecycleAwareService, MigrationAwareService, DynamicMetricsProvider,
                                     EventPublishingService<Object, EventListener> {
 
     public static final String SERVICE_NAME = "hz:core:raft";
@@ -569,7 +569,7 @@ private boolean ensureCPMemberRemoved(CPMemberInfo member, long remainingTimeNan
     }
 
     @Override
-    public Operation getPreJoinOperation() {
+    public RaftServicePreJoinOp getPreJoinOperation() {
         if (!cpSubsystemEnabled) {
             return null;
         }

diff --git a/hazelcast/src/main/java/com/hazelcast/cp/internal/raftop/metadata/RaftServicePreJoinOp.java b/hazelcast/src/main/java/com/hazelcast/cp/internal/raftop/metadata/RaftServicePreJoinOp.java
@@ -24,6 +24,7 @@
 import com.hazelcast.nio.ObjectDataInput;
 import com.hazelcast.nio.ObjectDataOutput;
 import com.hazelcast.nio.serialization.IdentifiedDataSerializable;
+import com.hazelcast.spi.impl.AllowedDuringPassiveState;
 import com.hazelcast.spi.impl.operationservice.Operation;
 
 import java.io.IOException;
@@ -35,7 +36,7 @@
  * Please note that this operation is not a {@link RaftOp},
  * so it is not handled via the Raft layer.
  */
-public class RaftServicePreJoinOp extends Operation implements IdentifiedDataSerializable {
+public class RaftServicePreJoinOp extends Operation implements IdentifiedDataSerializable, AllowedDuringPassiveState {
 
     private boolean discoveryCompleted;
 

diff --git a/hazelcast/src/main/java/com/hazelcast/instance/impl/ClusterTopologyIntent.java b/hazelcast/src/main/java/com/hazelcast/instance/impl/ClusterTopologyIntent.java
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2008-2022, Hazelcast, Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hazelcast.instance.impl;
+
+/**
+ * This enum represents the estimated intent of topology changes
+ * performed in a managed runtime context (Kubernetes) that may affect the cluster.
+ * {@link #NOT_IN_MANAGED_CONTEXT} indicates Hazelcast is not executed within a
+ * managed runtime context; all other values represent different detected intents
+ * within a managed runtime context.
+ *
+ * @see ClusterTopologyIntentTracker
+ */
+public enum ClusterTopologyIntent {
+    /**
+     * Hazelcast is not deployed in a managed context
+     */
+    NOT_IN_MANAGED_CONTEXT(0),
+    /**
+     * Unknown state, but in managed context (Kubernetes)
+     */
+    IN_MANAGED_CONTEXT_UNKNOWN(1),
+    /**
+     * No change to the number of Hazelcast members in the cluster is intended
+     */
+    CLUSTER_STABLE(2),
+    /**
+     * No change to the number of Hazelcast members in the cluster is intended,
+     * but some members are missing from the cluster.
+     * For example, this might happen when Kubernetes reschedules a pod, so it is first shut down, then restarted.
+     * Even though the requested number of members in the cluster stays the same
+     * all the time, a member will be missing for some time and the detected intent during
+     * that time will be {@code CLUSTER_STABLE_WITH_MISSING_MEMBERS}.
+     */
+    CLUSTER_STABLE_WITH_MISSING_MEMBERS(3),
+    /**
+     * Full cluster shutdown is intended
+     */
+    CLUSTER_SHUTDOWN(4),
+    /**
+     * Cluster is starting up
+     */
+    CLUSTER_START(5),
+    /**
+     * Cluster is shutting down, but had some missing members before
+     * cluster-wide shutdown.
+     */
+    CLUSTER_SHUTDOWN_WITH_MISSING_MEMBERS(6),
+    /**
+     * Hazelcast cluster is being scaled up or down
+     */
+    SCALING(7);
+
+    private final int id;
+
+    ClusterTopologyIntent(int id) {
+        this.id = id;
+    }
+
+    public int getId() {
+        return id;
+    }
+
+    public static ClusterTopologyIntent of(int id) {
+        for (ClusterTopologyIntent intent : ClusterTopologyIntent.values()) {
+            if (intent.id == id) {
+                return intent;
+            }
+        }
+        throw new IllegalArgumentException("No ClusterTopologyIntent exists with id " + id);
+    }
+}
diff --git a/hazelcast/src/main/java/com/hazelcast/instance/impl/ClusterTopologyIntentTracker.java b/hazelcast/src/main/java/com/hazelcast/instance/impl/ClusterTopologyIntentTracker.java
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2008-2022, Hazelcast, Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.hazelcast.instance.impl;
+
+/**
+ * Receives updates about the context in which Hazelcast is executed
+ * in order to detect what is the intent of topology changes from the
+ * managed runtime context that may affect how the Hazelcast cluster should react.
+ * <p>
+ * Terminology:
+ * <ul>
+ *     <li><b>Managed runtime context</b>: a runtime orchestration environment within which Hazelcast is
+ *     being executed (e.g. Kubernetes).</li>
+ *     <li><b>Replica</b>: a Hazelcast member that is executed in a managed runtime context. For example,
+ *     when running a 3-member Hazelcast cluster, there are 3 replicas.</li>
+ *     <li><b>Specification / specified replicas</b>: the desired runtime state of the Hazelcast member, as declared by the user
+ *     to the managed runtime context. For example, when user executes {@code kubectl scale sts hz --replicas 5},
+ *     the <b>specified replicas</b> are 5.</li>
+ *     <li><b>Current replicas</b>: number of replicas that are run by the managed runtime. Notice that current replicas
+ *     does not necessarily reflect the current number of members in the Hazelcast cluster. Examples:
+ *     <br>- When a Kubernetes pod is terminated, "current replicas" is immediately
+ *     decreased by 1, even though the Hazelcast member is still live and just started performing its graceful shutdown. So
+ *     there will be some time during which current replicas may be lower than actual number of members in the Hazelcast cluster
+ *     (until graceful shutdown completes).
+ *     <br>- When a Kubernetes pod is scheduled, "current replicas" immediately increases by 1 even though Hazelcast is just
+ *     started and not joined to any cluster.</li>
+ *     <li><b>Ready replicas</b>: replicas which are observed as "ready" by the managed runtime. "Ready" implies that the
+ *     managed runtime has queried the configured readiness probe and found the Hazelcast member to be ready (i.e. up and running
+ *     and ready to accept and serve requests). Notice that since "ready" state is based on a periodic check by the managed
+ *     runtime, its updates can lag for some time, depending on configuration (e.g. what is the period of readiness probe checks
+ *     etc). Example:
+ *     <br>Kubernetes deletes a pod of a running cluster with specified replicas 3. Immediately
+ *     "current replicas" drops to 2, however "ready replicas" is still 3 until the readiness probe monitoring period passes
+ *     and the readiness check figures out the Hazelcast member is no longer ready.
+ *     <br>
+ *     Also related to "readiness" definition: {@code NodeExtension#isReady}.</li>
+ * </ul>
+ */
+public interface ClusterTopologyIntentTracker {
+
+    int UNKNOWN = -1;
+
+    /**
+     * Process an update of the cluster topology. Each update carries information about
+     * the previous and current specified replicas count, the current number of replicas and ready replicas in the cluster.
+     * <br>
+     * <b>Examples</b> (numbers in parentheses indicate (previous specified replica count, current specified replica count,
+     * current replicas, current ready replicas))
+     * <p>
+     *     A cluster with specified replica count 3 is starting up. This is expected to result in a series
+     *     of updates similar to the following:
+     *     <pre>{@code
+     *     (-1, 3, 0, 0)
+     *     (3, 3, 1, 0)
+     *     (3, 3, 1, 1)
+     *     (3, 3, 2, 1)
+     *     (3, 3, 2, 2)
+     *     (3, 3, 3, 2)
+     *     (3, 3, 3, 3)
+     *     }</pre>
+     * </p>
+     * <p>
+     *     Assuming user requests scaling up a running cluster of 3 members to 5, the following
+     *     updates are expected:
+     *     <pre>{@code
+     *     (3, 3, 3, 3)
+     *     (3, 5, 4, 3)
+     *     (5, 5, 4, 4)
+     *     (5, 5, 5, 4)
+     *     (5, 5, 5, 5)
+     *     }</pre>
+     * </p>
+     * Notice that actual updates may differ (e.g. duplicate notifications of intermediate states may be received).
+     *
+     * @param previousSpecifiedReplicas   previous specified replicas count
+     * @param updatedSpecifiedReplicas    updated specified replicas count
+     * @param previousReadyReplicas       number of previously ready replicas
+     * @param updatedReadyReplicas        number of updated ready replicas
+     * @param previousCurrentReplicas     number of previous current replicas
+     * @param updatedCurrentReplicas      number of updated current replicas
+     *
+     * @see NodeExtension#isReady()
+     */
+    void update(int previousSpecifiedReplicas, int updatedSpecifiedReplicas,
+                int previousReadyReplicas, int updatedReadyReplicas,
+                int previousCurrentReplicas, int updatedCurrentReplicas);
+
+    ClusterTopologyIntent getClusterTopologyIntent();
+
+    /**
+     * Initialize this tracker, if the tracker supports it. This method must be called first, before the tracker
+     * can receive any updates.
+     */
+    void initialize();
+
+    /**
+     * Prepare this tracker for shutdown.
+     */
+    void destroy();
+
+    /**
+     * Explicitly initialize the cluster topology intent.
+     * @param clusterTopologyIntent the cluster topology intent to initialize this tracker with
+     */
+    void initializeClusterTopologyIntent(ClusterTopologyIntent clusterTopologyIntent);
+
+    /**
+     * Handle Hazelcast node shutdown with the given cluster topology intent.
+     * @param clusterTopologyIntent the cluster topology intent with which the node is shutting down
+     */
+    void shutdownWithIntent(ClusterTopologyIntent clusterTopologyIntent);
+
+    /**
+     * @return {@code true} if this instance of {@code ClusterTopologyIntentTracker} is active and tracking
+     *         cluster topology changes in a managed context, otherwise {@code false}.
+     */
+    boolean isEnabled();
+
+    /**
+     * @return  the number of requested Hazelcast members in the cluster, as determined by the specification
+     *          that is managed by the runtime context. When running Hazelcast in a Kubernetes StatefulSet,
+     *          this corresponds to the value in {@code StatefulSetSpec.replicas}.
+     */
+    int getCurrentSpecifiedReplicaCount();
+
+    /**
+     * Notifies the {@link ClusterTopologyIntentTracker} that Hazelcast members list has changed.
+     */
+    void onMembershipChange();
+}
diff --git a/hazelcast/src/main/java/com/hazelcast/instance/impl/DefaultNodeExtension.java b/hazelcast/src/main/java/com/hazelcast/instance/impl/DefaultNodeExtension.java
@@ -321,6 +321,11 @@ public boolean isStartCompleted() {
         return node.getClusterService().isJoined();
     }
 
+    @Override
+    public boolean isReady() {
+        return node.getClusterService().isJoined(); // ready as soon as this member has joined the cluster
+    }
+
     @Override
     public SecurityContext getSecurityContext() {
         logger.warning("Security features are only available on Hazelcast Enterprise!");
@@ -533,6 +538,7 @@ public void onMemberListChange() {
         if (service != null) {
             service.onMemberListChange();
         }
+        node.clusterTopologyIntentTracker.onMembershipChange();
     }
 
     @Override