Skip to content

Commit

Permalink
Improvements for running Hazelcast persistence on kubernetes [5.2.0] (#…
Browse files Browse the repository at this point in the history
…22501)

- Adds automated cluster state management for persistence on kubernetes
- Supports cluster-wide shutdown, rolling restart and partial member
recovery from failure on kubernetes [HZ-1190] [HZ-1191] [HZ-1193]
- Fixes behaviour of readiness probe with persistence enabled [HZ-1349]
- Allows tuning for either speedy crash recovery (FROZEN state) or
availability of in-memory data structures (NO_MIGRATION state) when
members are missing [HZ-1311]
- Fixes backup sync after single member crash recovery [HZ-1349]

Design document in EE side:

https://github.com/vbekiaris/hazelcast-enterprise/blob/enhancements/5.2/k8s-persistence/docs/design/persistence/04-persistence-kubernetes-improvements.md

(cherry picked from commit 1ddc16e)
1:1 clean backport of #21844 to 5.2.0 release branch

Also includes backport of #22512 

Co-authored-by: Łukasz Dziedziul <lukasz.dziedziul@hazelcast.com>
  • Loading branch information
vbekiaris and ldziedziul committed Oct 17, 2022
1 parent 814709e commit 2b4d98f
Show file tree
Hide file tree
Showing 62 changed files with 1,740 additions and 96 deletions.
Expand Up @@ -61,7 +61,6 @@
import com.hazelcast.spi.impl.eventservice.EventFilter;
import com.hazelcast.spi.impl.eventservice.EventRegistration;
import com.hazelcast.spi.impl.eventservice.EventService;
import com.hazelcast.spi.impl.operationservice.Operation;
import com.hazelcast.spi.merge.SplitBrainMergePolicy;
import com.hazelcast.spi.merge.SplitBrainMergePolicyProvider;
import com.hazelcast.spi.properties.ClusterProperty;
Expand Down Expand Up @@ -98,7 +97,7 @@

@SuppressWarnings("checkstyle:classdataabstractioncoupling")
public abstract class AbstractCacheService implements ICacheService,
PreJoinAwareService, PartitionAwareService,
PreJoinAwareService<OnJoinCacheOperation>, PartitionAwareService,
SplitBrainProtectionAwareService, SplitBrainHandlerService,
ClusterStateListener, TenantContextAwareService {
/**
Expand Down Expand Up @@ -734,7 +733,7 @@ protected void deleteCacheResources(String name) {
}

@Override
public Operation getPreJoinOperation() {
public OnJoinCacheOperation getPreJoinOperation() {
OnJoinCacheOperation preJoinCacheOperation;
preJoinCacheOperation = new OnJoinCacheOperation();
for (Map.Entry<String, CompletableFuture<CacheConfig>> cacheConfigEntry : configs.entrySet()) {
Expand Down
Expand Up @@ -23,6 +23,7 @@
import com.hazelcast.nio.ObjectDataInput;
import com.hazelcast.nio.ObjectDataOutput;
import com.hazelcast.nio.serialization.IdentifiedDataSerializable;
import com.hazelcast.spi.impl.AllowedDuringPassiveState;
import com.hazelcast.spi.impl.operationservice.Operation;
import com.hazelcast.spi.exception.ServiceNotFoundException;

Expand All @@ -41,7 +42,7 @@
* resolve a race between the {@link CacheConfig} becoming available in the joining member and creation of a
* {@link com.hazelcast.cache.ICache} proxy.
*/
public class OnJoinCacheOperation extends Operation implements IdentifiedDataSerializable {
public class OnJoinCacheOperation extends Operation implements IdentifiedDataSerializable, AllowedDuringPassiveState {

private List<CacheConfig> configs = new ArrayList<CacheConfig>();

Expand Down
Expand Up @@ -158,8 +158,8 @@
*/
@SuppressWarnings({"checkstyle:methodcount", "checkstyle:classfanoutcomplexity", "checkstyle:classdataabstractioncoupling"})
public class RaftService implements ManagedService, SnapshotAwareService<MetadataRaftGroupSnapshot>, GracefulShutdownAwareService,
MembershipAwareService, PreJoinAwareService, RaftNodeLifecycleAwareService,
MigrationAwareService, DynamicMetricsProvider,
MembershipAwareService, PreJoinAwareService<RaftServicePreJoinOp>,
RaftNodeLifecycleAwareService, MigrationAwareService, DynamicMetricsProvider,
EventPublishingService<Object, EventListener> {

public static final String SERVICE_NAME = "hz:core:raft";
Expand Down Expand Up @@ -569,7 +569,7 @@ private boolean ensureCPMemberRemoved(CPMemberInfo member, long remainingTimeNan
}

@Override
public Operation getPreJoinOperation() {
public RaftServicePreJoinOp getPreJoinOperation() {
if (!cpSubsystemEnabled) {
return null;
}
Expand Down
Expand Up @@ -24,6 +24,7 @@
import com.hazelcast.nio.ObjectDataInput;
import com.hazelcast.nio.ObjectDataOutput;
import com.hazelcast.nio.serialization.IdentifiedDataSerializable;
import com.hazelcast.spi.impl.AllowedDuringPassiveState;
import com.hazelcast.spi.impl.operationservice.Operation;

import java.io.IOException;
Expand All @@ -35,7 +36,7 @@
* Please note that this operation is not a {@link RaftOp},
* so it is not handled via the Raft layer.
*/
public class RaftServicePreJoinOp extends Operation implements IdentifiedDataSerializable {
public class RaftServicePreJoinOp extends Operation implements IdentifiedDataSerializable, AllowedDuringPassiveState {

private boolean discoveryCompleted;

Expand Down
@@ -0,0 +1,86 @@
/*
* Copyright (c) 2008-2022, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.hazelcast.instance.impl;

/**
 * Enumerates the estimated intent behind topology changes performed by a
 * managed runtime context (kubernetes) that may affect the cluster.
 * {@link #NOT_IN_MANAGED_CONTEXT} means Hazelcast is not executed within a
 * managed runtime context; every other value is an intent detected while
 * running within a managed runtime context.
 * <p>
 * Each constant carries a fixed numeric id, exposed via {@link #getId()} and
 * resolvable back to the constant via {@link #of(int)}.
 *
 * @see ClusterTopologyIntentTracker
 */
public enum ClusterTopologyIntent {
    /**
     * Hazelcast is not deployed in a managed context.
     */
    NOT_IN_MANAGED_CONTEXT(0),
    /**
     * Running in a managed context (Kubernetes), but the intent is unknown.
     */
    IN_MANAGED_CONTEXT_UNKNOWN(1),
    /**
     * No change to the number of Hazelcast members in the cluster is intended.
     */
    CLUSTER_STABLE(2),
    /**
     * No change to the number of Hazelcast members in the cluster is intended,
     * but some members are missing from the cluster.
     * For example, this might happen when kubernetes reschedules a pod: the pod is
     * first shut down, then restarted. Even though the requested number of members
     * stays the same all the time, a member will be missing for a while, and the
     * detected intent during that time will be
     * {@code CLUSTER_STABLE_WITH_MISSING_MEMBERS}.
     */
    CLUSTER_STABLE_WITH_MISSING_MEMBERS(3),
    /**
     * Full cluster shutdown is intended.
     */
    CLUSTER_SHUTDOWN(4),
    /**
     * Cluster is starting up.
     */
    CLUSTER_START(5),
    /**
     * Cluster is shutting down, but some members were already missing before
     * the cluster-wide shutdown.
     */
    CLUSTER_SHUTDOWN_WITH_MISSING_MEMBERS(6),
    /**
     * Hazelcast cluster is being scaled up or down.
     */
    SCALING(7);

    private final int id;

    ClusterTopologyIntent(int intentId) {
        id = intentId;
    }

    /**
     * @return the numeric id assigned to this intent
     */
    public int getId() {
        return id;
    }

    /**
     * Resolves the intent that was declared with the given numeric id.
     *
     * @param id numeric id of the intent to look up
     * @return the constant whose {@link #getId()} equals {@code id}
     * @throws IllegalArgumentException if no constant is declared with {@code id}
     */
    public static ClusterTopologyIntent of(int id) {
        ClusterTopologyIntent[] candidates = values();
        for (int i = 0; i < candidates.length; i++) {
            ClusterTopologyIntent candidate = candidates[i];
            if (candidate.getId() == id) {
                return candidate;
            }
        }
        throw new IllegalArgumentException("No ClusterTopologyIntent exists with id " + id);
    }
}
@@ -0,0 +1,144 @@
/*
* Copyright (c) 2008-2022, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.hazelcast.instance.impl;

/**
* Receives updates about the context in which Hazelcast is executed
* in order to detect what is the intent of topology changes from the
* managed runtime context that may affect how the Hazelcast cluster should react.
* <p>
* Terminology:
* <ul>
* <li><b>Managed runtime context</b>: a runtime orchestration environment within which Hazelcast is
* being executed (e.g. Kubernetes).</li>
* <li><b>Replica</b>: a Hazelcast member that is executed in a managed runtime context. For example,
* when running a 3-member Hazelcast cluster, there are 3 replicas.</li>
* <li><b>Specification / specified replicas</b>: the desired runtime state of the Hazelcast member, as declared by the user
* to the managed runtime context. For example, when user executes {@code kubectl scale sts hz --replicas 5},
* the <b>specified replicas</b> are 5.</li>
* <li><b>Current replicas</b>: number of replicas that are running by the managed runtime. Notice that current replicas
* does not necessarily reflect the current number of members in the Hazelcast cluster. Examples:
* <br>- When a kubernetes pod is terminated, "current replicas" is immediately
* decreased by 1, even though the Hazelcast member is still live and just started performing its graceful shutdown. So
* there will be some time during which current replicas may be lower than actual number of members in the Hazelcast cluster
* (until graceful shutdown completes).
* <br>- When a kubernetes pod is scheduled, "current replicas" immediately increases by 1 even though Hazelcast is just
* started and not joined to any cluster.</li>
* <li><b>Ready replicas</b>: replicas which are observed as "ready" by the managed runtime. "Ready" implies that the
* managed runtime has queried the configured readiness probe and found the Hazelcast member to be ready (i.e. up and running
* and ready to accept and serve requests). Notice that since "ready" state is based on a periodic check by the managed
* runtime, its updates can lag for some time, depending on configuration (e.g. what is the period of readiness probe checks
* etc). Example:
* <br>Kubernetes deletes a pod of a running cluster with specified replicas 3. Immediately
* "current replicas" drops to 2, however "ready replicas" is still 3 until the readiness probe monitoring period passes
* and the readiness check figures out the Hazelcast member is no longer ready.
* <br>
* Also related to "readiness" definition: {@code NodeExtension#isReady}.</li>
* </ul>
*/
public interface ClusterTopologyIntentTracker {

/**
* Value standing for a replica count that is not known. It equals the {@code -1} shown as the
* previous specified replica count in the first update of the cluster-startup example
* documented on {@link #update(int, int, int, int, int, int)}.
*/
int UNKNOWN = -1;

/**
* Process an update of the cluster topology. Each update carries the previous and updated values of
* the specified replicas count, the ready replicas count and the current replicas count.
* <br>
* <b>Examples</b> (for brevity each tuple lists only four of the six values, in this order:
* (previous specified replica count, updated specified replica count,
* updated current replicas, updated ready replicas); the previous ready and previous current
* replica counts are omitted)
* <p>
* A cluster with specified replica count 3 is starting up. This is expected to result in a series
* of updates similar to the following:
* <pre>{@code
* (-1, 3, 0, 0)
* (3, 3, 1, 0)
* (3, 3, 1, 1)
* (3, 3, 2, 1)
* (3, 3, 2, 2)
* (3, 3, 3, 2)
* (3, 3, 3, 3)
* }</pre>
* <p>
* Assuming user requests scaling up a running cluster of 3 members to 5, the following
* updates are expected:
* <pre>{@code
* (3, 3, 3, 3)
* (3, 5, 4, 3)
* (5, 5, 4, 4)
* (5, 5, 5, 4)
* (5, 5, 5, 5)
* }</pre>
* Notice that actual updates may differ (e.g. duplicate notifications of intermediate states may be received).
*
* @param previousSpecifiedReplicas previous specified replicas count
* @param updatedSpecifiedReplicas updated specified replicas count
* @param previousReadyReplicas number of previously ready replicas
* @param updatedReadyReplicas number of updated ready replicas
* @param previousCurrentReplicas number of previous current replicas
* @param updatedCurrentReplicas number of updated current replicas
*
* @see NodeExtension#isReady()
*/
void update(int previousSpecifiedReplicas, int updatedSpecifiedReplicas,
int previousReadyReplicas, int updatedReadyReplicas,
int previousCurrentReplicas, int updatedCurrentReplicas);

/**
* @return the {@link ClusterTopologyIntent} currently held by this tracker
*/
ClusterTopologyIntent getClusterTopologyIntent();

/**
* Initialize this tracker, if the tracker supports it. This method must be called first, before the tracker
* can receive any updates.
*/
void initialize();

/**
* Prepare this tracker for shutdown.
*/
void destroy();

/**
* Initialize explicitly the cluster topology intent.
* @param clusterTopologyIntent the cluster topology intent to initialize this tracker with
*/
void initializeClusterTopologyIntent(ClusterTopologyIntent clusterTopologyIntent);

/**
* Handle Hazelcast node shutdown with the given cluster topology intent.
* @param clusterTopologyIntent the cluster topology intent under which the node is shutting down
*/
void shutdownWithIntent(ClusterTopologyIntent clusterTopologyIntent);

/**
* @return {@code true} if this instance of {@code ClusterTopologyIntentTracker} is active and tracking
* cluster topology changes in a managed context, otherwise {@code false}.
*/
boolean isEnabled();

/**
* @return the number of requested Hazelcast members in the cluster, as determined by the specification
* that is managed by the runtime context. When running Hazelcast in a Kubernetes StatefulSet,
* this corresponds to the value in {@code StatefulSetSpec.size}.
* NOTE(review): the Kubernetes StatefulSet spec field appears to be named {@code replicas}
* rather than {@code size} - confirm the intended field name.
*/
int getCurrentSpecifiedReplicaCount();

/**
* Notifies the {@link ClusterTopologyIntentTracker} that Hazelcast members list has changed.
*/
void onMembershipChange();
}
Expand Up @@ -321,6 +321,11 @@ public boolean isStartCompleted() {
return node.getClusterService().isJoined();
}

/**
 * Reports readiness by delegating to the node's cluster service.
 *
 * @return the value of {@code isJoined()} on the cluster service of {@code node}
 */
@Override
public boolean isReady() {
    final boolean joined = node.getClusterService().isJoined();
    return joined;
}

@Override
public SecurityContext getSecurityContext() {
logger.warning("Security features are only available on Hazelcast Enterprise!");
Expand Down Expand Up @@ -533,6 +538,7 @@ public void onMemberListChange() {
if (service != null) {
service.onMemberListChange();
}
node.clusterTopologyIntentTracker.onMembershipChange();
}

@Override
Expand Down

0 comments on commit 2b4d98f

Please sign in to comment.