diff --git a/.classpath b/.classpath index 009f6148a2..ed5672151e 100644 --- a/.classpath +++ b/.classpath @@ -42,7 +42,6 @@ - @@ -63,6 +62,7 @@ + diff --git a/.gitignore b/.gitignore index cd6301beb2..4ec7a61bb3 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ server.state .temp .idea data/ +META-INF/MANIFEST.MF diff --git a/META-INF/MANIFEST.MF b/META-INF/MANIFEST.MF deleted file mode 100644 index 08a19d78e9..0000000000 --- a/META-INF/MANIFEST.MF +++ /dev/null @@ -1,8 +0,0 @@ -Manifest-Version: 1.0 -Ant-Version: Apache Ant 1.7.1 -Created-By: 20.2-b06 (Sun Microsystems Inc.) -Voldemort-Implementation-Version: 1.3.0 -Implementation-Title: Voldemort -Implementation-Version: 1.3.0 -Implementation-Vendor: LinkedIn - diff --git a/build.properties b/build.properties index d851cb378b..a05bc031f7 100644 --- a/build.properties +++ b/build.properties @@ -42,5 +42,5 @@ tomcat.context=/voldemort javac.version=1.5 ## Release -curr.release=1.3.1 +curr.release=1.3.3 diff --git a/contrib/ec2-testing/test/voldemort/utils/Ec2RebalanceTest.java b/contrib/ec2-testing/test/voldemort/utils/Ec2RebalanceTest.java index 448a733c68..0fd77e2e2f 100644 --- a/contrib/ec2-testing/test/voldemort/utils/Ec2RebalanceTest.java +++ b/contrib/ec2-testing/test/voldemort/utils/Ec2RebalanceTest.java @@ -44,7 +44,7 @@ import voldemort.client.protocol.RequestFormatType; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; -import voldemort.client.rebalance.AbstractRebalanceTest; +import voldemort.client.rebalance.AbstractNonZonedRebalanceTest; import voldemort.cluster.Cluster; import voldemort.cluster.Node; import voldemort.server.RequestRoutingType; @@ -55,7 +55,7 @@ /** */ -public class Ec2RebalanceTest extends AbstractRebalanceTest { +public class Ec2RebalanceTest extends AbstractNonZonedRebalanceTest { private static int NUM_KEYS; @@ -66,7 +66,10 @@ public class Ec2RebalanceTest extends AbstractRebalanceTest { private Map nodeIdsInv = new HashMap(); private List activeHostNames = new ArrayList(); - private boolean useDonorBased = true; + + public Ec2RebalanceTest() { + super(true, true); + } @BeforeClass public static void ec2Setup() throws Exception { @@ -209,11 +212,6 @@ protected void stopServer(List nodesToStop) throws Exception { stopCluster(hostsToStop, ec2RebalanceTestConfig); } - @Override - protected boolean useDonorBased() { - return this.useDonorBased; - } - private static class Ec2RebalanceTestConfig extends Ec2RemoteTestConfig { private String configDirName; diff --git a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortBuildAndPushJob.java b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortBuildAndPushJob.java index 49128e0f2f..fc5cfcfeee 100644 --- a/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortBuildAndPushJob.java +++ b/contrib/hadoop-store-builder/src/java/voldemort/store/readonly/mr/azkaban/VoldemortBuildAndPushJob.java @@ -411,6 +411,11 @@ public void verifySchema(String url) throws Exception { + "\nBut expected: " + remoteStoreDef); } + } else { + throw new RuntimeException("Your store definition does not match the store definition that is already in the cluster. Have: " + + newStoreDef + + "\nBut expected: " + + remoteStoreDef); } }
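Review note: the added else-branch closes a hole in verifySchema. When the new store definition matched none of the accepted cases, the old code fell out of the if-chain without complaint, so a push could proceed with a definition that silently disagreed with the cluster's. A minimal sketch of the intended control flow (the standalone method shape and the isCompatible() helper are illustrative, not the job's actual code):

    // Sketch: every path that is not an explicit match must now end in an
    // exception instead of falling through silently.
    static void verifySchema(StoreDefinition newStoreDef, StoreDefinition remoteStoreDef) {
        if(newStoreDef.equals(remoteStoreDef)) {
            return; // exact match, nothing to do
        }
        if(isCompatible(newStoreDef, remoteStoreDef)) { // hypothetical helper
            return; // same schema modulo cosmetic differences
        }
        // Previously this case fell through without a word; now it fails loudly.
        throw new RuntimeException("Your store definition does not match the store definition that is already in the cluster. Have: "
                                   + newStoreDef + "\nBut expected: " + remoteStoreDef);
    }

The same else-branch is added to verifyAvroSchemaAndVersions in the next hunk.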
@@ -788,6 +793,11 @@ public void verifyAvroSchemaAndVersions(String url, boolean isVersioned) throws + "\nBut expected: " + remoteStoreDef); } + } else { + throw new RuntimeException("Your store definition does not match the store definition that is already in the cluster. Have: " + + newStoreDef + + "\nBut expected: " + + remoteStoreDef); } } diff --git a/contrib/krati/src/java/voldemort/store/krati/KratiStorageEngine.java b/contrib/krati/src/java/voldemort/store/krati/KratiStorageEngine.java index c30eea305f..87ddd5227d 100644 --- a/contrib/krati/src/java/voldemort/store/krati/KratiStorageEngine.java +++ b/contrib/krati/src/java/voldemort/store/krati/KratiStorageEngine.java @@ -231,6 +231,29 @@ else if(occurred == Occurred.AFTER) } } + @Override + public List> multiVersionPut(ByteArray key, + final List> values) + throws VoldemortException { + StoreUtils.assertValidKey(key); + List> valuesInStorage = null; + List> obsoleteVals = null; + + synchronized(this.locks.lockFor(key.get())) { + valuesInStorage = this.get(key, null); + obsoleteVals = resolveAndConstructVersionsToPersist(valuesInStorage, values); + + try { + datastore.put(key.get(), assembleValues(valuesInStorage)); + } catch(Exception e) { + String message = "Failed to put key " + key; + logger.error(message, e); + throw new VoldemortException(message, e); + } + } + return obsoleteVals; + } + /** * Store the versioned values * diff --git a/contrib/restclient/src/java/voldemort/restclient/R2Store.java b/contrib/restclient/src/java/voldemort/restclient/R2Store.java index 4b2abc26f7..bb04c8ea35 100644 --- a/contrib/restclient/src/java/voldemort/restclient/R2Store.java +++ b/contrib/restclient/src/java/voldemort/restclient/R2Store.java @@ -41,6 +41,7 @@ import org.codehaus.jackson.map.ObjectMapper; import voldemort.VoldemortException; +import voldemort.coordinator.CoordinatorUtils; import voldemort.coordinator.VectorClockWrapper; import voldemort.store.AbstractStore; import voldemort.utils.ByteArray; @@ -70,10 +71,14 @@ public class R2Store extends AbstractStore { private static final String POST = "POST"; private static final String DELETE = "DELETE"; private static final String ETAG = "ETag"; + public static final String X_VOLD_VECTOR_CLOCK = "X-VOLD-Vector-Clock"; + public static final String CONTENT_TYPE = "Content-Type"; + public static final String CONTENT_LENGTH = "Content-Length"; public static final String X_VOLD_REQUEST_TIMEOUT_MS = "X-VOLD-Request-Timeout-ms"; public static final String X_VOLD_INCONSISTENCY_RESOLVER = "X-VOLD-Inconsistency-Resolver"; public static final String CUSTOM_RESOLVING_STRATEGY = "custom"; public static final String DEFAULT_RESOLVING_STRATEGY = "timestamp"; + private static final String LAST_MODIFIED = "Last-Modified"; private static final String MULTIPART_CONTENT_TYPE = "multipart/binary"; private final Logger logger = Logger.getLogger(R2Store.class); @@ -82,6 +87,7 @@ public class R2Store extends AbstractStore { private HttpClientFactory _clientFactory; private Client client = null; private String baseURL; + private ObjectMapper mapper; public R2Store(String baseURL, String storeName) { super(storeName); @@ -90,6 +96,7 @@ public R2Store(String baseURL, String storeName) { final TransportClient transportClient = _clientFactory.getClient(new HashMap()); client = new TransportClientAdapter(transportClient); this.baseURL = baseURL; + mapper = new ObjectMapper(); } catch(Exception e) { e.printStackTrace(); } }
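Review note: two things change in the R2Store request plumbing. The hard-coded "application/json" headers are replaced by the new CONTENT_TYPE/CONTENT_LENGTH constants with a "binary" content type, and a put now carries the client-side version to the server in the new X-VOLD-Vector-Clock header, so the server can order the write and reject a stale one (the put path below treats an HTTP 412 as an obsolete version). A minimal sketch of the header handling, with the request-builder wiring simplified (rb and value as in the surrounding code):

    // Serialize the vector clock of the value being written and attach it
    // as a request header; an empty clock means there is no version to assert.
    VectorClock vc = (VectorClock) value.getVersion();
    if(!vc.getEntries().isEmpty()) {
        rb.setHeader(R2Store.X_VOLD_VECTOR_CLOCK,
                     CoordinatorUtils.getSerializedVectorClock(vc));
    }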
@@ -120,8 +127,8 @@ public boolean delete(ByteArray key, Version version) throws VoldemortException // Create a HTTP POST request // TODO: Create a proper request based on client config rb.setMethod(DELETE); - rb.setHeader("Content-Type", "application/json"); - rb.setHeader("Content-Length", "0"); + rb.setHeader(CONTENT_TYPE, "binary"); + rb.setHeader(CONTENT_LENGTH, "0"); rb.setHeader(X_VOLD_REQUEST_TIMEOUT_MS, "1000"); RestRequest request = rb.build(); @@ -137,9 +144,13 @@ public boolean delete(ByteArray key, Version version) throws VoldemortException ve.printStackTrace(); throw ve; } catch(Exception e) { - e.printStackTrace(); + if(e.getMessage().contains("status=404")) { + logger.error("Specified key to delete does not exist.", e); + } else { + logger.error("Error while performing the delete operation.", e); + } + return false; } - return false; + + return true; } @Override @@ -154,9 +165,8 @@ public List> get(ByteArray key, byte[] transforms) throws Vold // TODO: Form a proper request based on client config rb.setMethod(GET); - rb.setHeader("Accept", "application/json"); + rb.setHeader("Accept", "binary"); rb.setHeader(X_VOLD_REQUEST_TIMEOUT_MS, "1000"); - rb.setHeader(X_VOLD_INCONSISTENCY_RESOLVER, "custom"); RestRequest request = rb.build(); Future f = client.restRequest(request); @@ -179,7 +189,7 @@ public List> get(ByteArray key, byte[] transforms) throws Vold throw ve; } catch(Exception e) { if(!e.getMessage().contains("status=404")) { - logger.error("ERROR: " + e); + logger.error("Error while performing the get operation.", e); } } @@ -202,20 +212,30 @@ public void put(ByteArray key, Versioned value, byte[] transform) RestRequestBuilder rb = new RestRequestBuilder(new URI(this.baseURL + "/" + getName() + "/" + base64Key)); + // Serialize the Vector clock + VectorClock vc = (VectorClock) value.getVersion(); + String serializedVC = null; + if(!vc.getEntries().isEmpty()) { + serializedVC = CoordinatorUtils.getSerializedVectorClock(vc); + } + // Create a HTTP POST request // TODO: Create a proper request based on client config rb.setMethod(POST); rb.setEntity(outputBytes.toByteArray()); - rb.setHeader("Content-Type", "application/json"); - rb.setHeader("Content-Length", "" + payload.length); + rb.setHeader(CONTENT_TYPE, "binary"); + rb.setHeader(CONTENT_LENGTH, "" + payload.length); rb.setHeader(X_VOLD_REQUEST_TIMEOUT_MS, "1000"); - rb.setHeader(X_VOLD_INCONSISTENCY_RESOLVER, "custom"); + if(serializedVC != null && serializedVC.length() > 0) { + rb.setHeader(X_VOLD_VECTOR_CLOCK, serializedVC); + } RestRequest request = rb.build(); Future f = client.restRequest(request); // This will block RestResponse response = f.get(); + String eTag = response.getHeader(ETAG); final ByteString entity = response.getEntity(); if(entity == null) { logger.error("Empty response !"); @@ -224,14 +244,15 @@ public void put(ByteArray key, Versioned value, byte[] transform) ve.printStackTrace(); throw ve; } catch(Exception e) { - logger.error("ERROR: " + e); + if(e.getMessage().contains("status=412")) { + logger.error("Specified version of the value is obsolete.", e); + } else { + logger.error("Error while performing the put operation.", e); + } } } private List> readResults(ByteString entity, String eTag, String lastModified) throws IOException { - ObjectMapper mapper = new ObjectMapper(); logger.debug("Received etag : " + eTag); logger.debug("Received last modified date : " + lastModified); VectorClockWrapper vcWrapper = mapper.readValue(eTag, VectorClockWrapper.class); @@ -281,9 +302,7 @@ public Map>> getAll(Iterable keys, // Parse the response final ByteString entity = response.getEntity(); - String contentType = response.getHeader("Content-Type"); - // String eTag = response.getHeader(ETAG); - // String lastModified = response.getHeader(LAST_MODIFIED); + String contentType = response.getHeader(CONTENT_TYPE); if(entity != null) { if(contentType.equalsIgnoreCase(MULTIPART_CONTENT_TYPE)) { resultMap = readResultsGetAll(entity); @@ -311,23 +330,11 @@ private Map>> readResultsGetAll(ByteString ent Map>> results = new HashMap>>(); try { - ObjectMapper mapper = new ObjectMapper(); - // VectorClockWrapper vcWrapper = mapper.readValue(eTag, - // VectorClockWrapper.class); - // Build the multipart object byte[] bytes = new byte[entity.length()]; entity.copyBytes(bytes, 0); ByteArrayDataSource ds = new ByteArrayDataSource(bytes, "multipart/mixed"); - // logger.info("received data = "); - // BufferedReader in = new BufferedReader(new - // InputStreamReader(ds.getInputStream())); - // String inputLine; - // while((inputLine = in.readLine()) != null) - // System.out.println(inputLine); - // in.close(); - MimeMultipart mp = new MimeMultipart(ds); for(int i = 0; i < mp.getCount(); i++) { MimeBodyPart part = (MimeBodyPart) mp.getBodyPart(i);
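Review note: with the vector clock now travelling in the ETag and X-VOLD-Vector-Clock headers, the REST client supports a genuine read-modify-write cycle, which the updated sample below exercises. The pattern in isolation (clientStore stands in for the sample's already-constructed client):

    // Fetch the current value together with its vector clock, mutate the
    // value, and write it back under the same clock so the server can order
    // this write against concurrent updates instead of treating it as a
    // blind overwrite.
    Versioned versionedValue = clientStore.get("a");
    versionedValue.setObject("New Value !!!");
    clientStore.put("a", versionedValue);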
diff --git a/contrib/restclient/src/java/voldemort/restclient/SampleRESTClient.java b/contrib/restclient/src/java/voldemort/restclient/SampleRESTClient.java index 595351117d..4eb5124698 100644 --- a/contrib/restclient/src/java/voldemort/restclient/SampleRESTClient.java +++ b/contrib/restclient/src/java/voldemort/restclient/SampleRESTClient.java @@ -19,6 +19,8 @@ import java.util.ArrayList; import java.util.List; +import voldemort.versioning.Versioned; + public class SampleRESTClient { public static void main(String[] args) { @@ -31,8 +33,18 @@ public static void main(String[] args) { clientStore.put("a", "Howdy!!!!"); clientStore.put("b", "Partner!!!!"); - // Do a sample operation: - System.out.println("Received response : " + clientStore.get("a")); + // Do a sample get operation: + Versioned versionedValue = clientStore.get("a"); + System.out.println("Received response : " + versionedValue); + + // Do a versioned put operation: + versionedValue.setObject("New Value !!!"); + clientStore.put("a", versionedValue); + + // Do a get again on the last versioned put operation: + versionedValue = clientStore.get("a"); + System.out.println("Received response on the versioned put: " + versionedValue); + List keyList = new ArrayList(); keyList.add("a"); keyList.add("b"); diff --git a/lib/google-collect-1.0.jar b/lib/google-collect-1.0.jar deleted file mode 100644 index a7abdbbdb4..0000000000 Binary files a/lib/google-collect-1.0.jar and /dev/null differ diff --git a/lib/guava-14.0.1.jar b/lib/guava-14.0.1.jar new file mode 100644 index 0000000000..3a3d9258e3 Binary files /dev/null and b/lib/guava-14.0.1.jar differ diff --git a/release_notes.txt b/release_notes.txt index 4288726d15..162bef1d1b 100644 --- a/release_notes.txt +++ b/release_notes.txt @@ -1,3 +1,8 @@ +Release 1.3.3 on 04/24/2013 +* VoldemortBuildAndPush + - Fixed bug with schema check +* Streaming Client + - Fixed issue with redundant callback invocation Release 1.3.1 on 03/25/2013 * HDFSFetcher - Fixed the bug in calculating checksums when we entere a retry loop diff --git a/src/java/voldemort/VoldemortAdminTool.java b/src/java/voldemort/VoldemortAdminTool.java index c462150b65..c0fa24382a 100644 --- a/src/java/voldemort/VoldemortAdminTool.java +++ b/src/java/voldemort/VoldemortAdminTool.java @@ -49,6 +49,7 @@ import joptsimple.OptionParser; import joptsimple.OptionSet; +import org.apache.commons.codec.DecoderException; import org.apache.commons.io.FileUtils; import 
org.codehaus.jackson.JsonFactory; import org.codehaus.jackson.JsonGenerator; @@ -100,8 +101,6 @@ public class VoldemortAdminTool { private static final String ALL_METADATA = "all"; - private static final String STORES_VERSION_KEY = "stores.xml"; - private static final String CLUSTER_VERSION_KEY = "cluster.xml"; @SuppressWarnings("unchecked") public static void main(String[] args) throws Exception { @@ -169,7 +168,8 @@ public static void main(String[] args) throws Exception { parser.accepts("check-metadata", "retreive metadata information from all nodes and checks if they are consistent across [ " + MetadataStore.CLUSTER_KEY + " | " + MetadataStore.STORES_KEY - + " | " + MetadataStore.SERVER_STATE_KEY + " ]") + + " | " + MetadataStore.REBALANCING_SOURCE_CLUSTER_XML + " | " + + MetadataStore.SERVER_STATE_KEY + " ]") .withRequiredArg() .describedAs("metadata-key") .ofType(String.class); @@ -185,13 +185,15 @@ public static void main(String[] args) throws Exception { parser.accepts("set-metadata", "Forceful setting of metadata [ " + MetadataStore.CLUSTER_KEY + " | " + MetadataStore.STORES_KEY + " | " + MetadataStore.SERVER_STATE_KEY - + " | " + MetadataStore.REBALANCING_STEAL_INFO + " ]") + + " | " + MetadataStore.REBALANCING_SOURCE_CLUSTER_XML + " | " + + MetadataStore.REBALANCING_STEAL_INFO + " ]") .withRequiredArg() .describedAs("metadata-key") .ofType(String.class); parser.accepts("set-metadata-value", "The value for the set-metadata [ " + MetadataStore.CLUSTER_KEY + " | " + MetadataStore.STORES_KEY + ", " + + MetadataStore.REBALANCING_SOURCE_CLUSTER_XML + ", " + MetadataStore.REBALANCING_STEAL_INFO + " ] - xml file location, [ " + MetadataStore.SERVER_STATE_KEY + " ] - " + MetadataStore.VoldemortState.NORMAL_SERVER + "," @@ -491,14 +493,15 @@ public static void main(String[] args) throws Exception { throw new VoldemortException("Missing set-metadata-value"); } else { String metadataValue = (String) options.valueOf("set-metadata-value"); - if(metadataKey.compareTo(MetadataStore.CLUSTER_KEY) == 0) { + if(metadataKey.compareTo(MetadataStore.CLUSTER_KEY) == 0 + || metadataKey.compareTo(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML) == 0) { if(!Utils.isReadableFile(metadataValue)) throw new VoldemortException("Cluster xml file path incorrect"); ClusterMapper mapper = new ClusterMapper(); Cluster newCluster = mapper.readCluster(new File(metadataValue)); executeSetMetadata(nodeId, adminClient, - MetadataStore.CLUSTER_KEY, + metadataKey, mapper.writeCluster(newCluster)); } else if(metadataKey.compareTo(MetadataStore.SERVER_STATE_KEY) == 0) { VoldemortState newState = VoldemortState.valueOf(metadataValue); @@ -611,7 +614,7 @@ public static void main(String[] args) throws Exception { if(storeNames == null || storeNames.size() == 0) { throw new VoldemortException("Must specify store name using --stores option"); } - executeQueryKeys(nodeId, adminClient, storeNames, keyList); + executeQueryKeys(nodeId, adminClient, storeNames, keyList, options.has("ascii")); } if(ops.contains("h")) { if(nodeId == -1) { @@ -763,21 +766,27 @@ public static void printHelp(PrintStream stream, OptionParser parser) throws IOE stream.println("\t5) Set metadata on all nodes"); stream.println("\t\t./bin/voldemort-admin-tool.sh --set-metadata [" + MetadataStore.CLUSTER_KEY + ", " + MetadataStore.SERVER_STATE_KEY + ", " - + MetadataStore.STORES_KEY + ", " + MetadataStore.REBALANCING_STEAL_INFO + + MetadataStore.STORES_KEY + ", " + + MetadataStore.REBALANCING_SOURCE_CLUSTER_XML + ", " + + MetadataStore.REBALANCING_STEAL_INFO + "] 
--set-metadata-value [metadata-value] --url [url]"); stream.println("\t6) Set metadata for a particular node"); stream.println("\t\t./bin/voldemort-admin-tool.sh --set-metadata [" + MetadataStore.CLUSTER_KEY + ", " + MetadataStore.SERVER_STATE_KEY + ", " - + MetadataStore.STORES_KEY + ", " + MetadataStore.REBALANCING_STEAL_INFO + + MetadataStore.STORES_KEY + ", " + + MetadataStore.REBALANCING_SOURCE_CLUSTER_XML + ", " + + MetadataStore.REBALANCING_STEAL_INFO + "] --set-metadata-value [metadata-value] --url [url] --node [node-id]"); stream.println("\t7) Check if metadata is same on all nodes"); stream.println("\t\t./bin/voldemort-admin-tool.sh --check-metadata [" + MetadataStore.CLUSTER_KEY + ", " + MetadataStore.SERVER_STATE_KEY + ", " + MetadataStore.STORES_KEY + "] --url [url]"); stream.println("\t8) Clear rebalancing metadata [" + MetadataStore.SERVER_STATE_KEY + ", " + + ", " + MetadataStore.REBALANCING_SOURCE_CLUSTER_XML + ", " + MetadataStore.REBALANCING_STEAL_INFO + "] on all node "); stream.println("\t\t./bin/voldemort-admin-tool.sh --clear-rebalancing-metadata --url [url]"); stream.println("\t9) Clear rebalancing metadata [" + MetadataStore.SERVER_STATE_KEY + ", " + + ", " + MetadataStore.REBALANCING_SOURCE_CLUSTER_XML + ", " + MetadataStore.REBALANCING_STEAL_INFO + "] on a particular node "); stream.println("\t\t./bin/voldemort-admin-tool.sh --clear-rebalancing-metadata --url [url] --node [node-id]"); stream.println(); @@ -818,8 +827,10 @@ public static void printHelp(PrintStream stream, OptionParser parser) throws IOE stream.println("\t\t./bin/voldemort-admin-tool.sh --fetch-entries --url [url] --node [node-id]"); stream.println("\t9) Update entries for a set of stores using the output from a binary dump fetch entries"); stream.println("\t\t./bin/voldemort-admin-tool.sh --update-entries [folder path from output of --fetch-entries --outdir] --url [url] --node [node-id] --stores [comma-separated list of store names]"); - stream.println("\t10) Query stores for a set of keys on a specific node."); + stream.println("\t10.a) Query stores for a set of keys on a specific node, in hexstring format"); stream.println("\t\t./bin/voldemort-admin-tool.sh --query-keys [comma-separated list of keys] --url [url] --node [node-id] --stores [comma-separated list of store names]"); + stream.println("\t10.b) Query stores for a set of keys on a specific node, in ascii format"); + stream.println("\t\t./bin/voldemort-admin-tool.sh --query-keys [comma-separated list of keys] --url [url] --node [node-id] --stores [comma-separated list of store names] --ascii"); stream.println("\t11) Mirror data from another voldemort server (possibly in another cluster) for specified stores"); stream.println("\t\t./bin/voldemort-admin-tool.sh --mirror-from-url [bootstrap url to mirror from] --mirror-node [node to mirror from] --url [url] --node [node-id] --stores [comma-separated-list-of-store-names]"); stream.println("\t12) Mirror data from another voldemort server (possibly in another cluster) for all stores in current cluster"); @@ -924,7 +935,9 @@ private static void executeClearRebalancing(int nodeId, AdminClient adminClient) adminClient, MetadataStore.REBALANCING_STEAL_INFO, state.toJsonString()); - + System.out.println("Cleaning up " + MetadataStore.REBALANCING_SOURCE_CLUSTER_XML + + " to empty string"); + executeSetMetadata(nodeId, adminClient, MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, ""); } private static void executeKeyDistribution(AdminClient adminClient) { @@ -951,7 +964,8 @@ private static void 
executeCheckMetadata(AdminClient adminClient, String metadat + " was null"); } else { - if(metadataKey.compareTo(MetadataStore.CLUSTER_KEY) == 0) { + if(metadataKey.compareTo(MetadataStore.CLUSTER_KEY) == 0 + || metadataKey.compareTo(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML) == 0) { metadataValues.add(new ClusterMapper().readCluster(new StringReader(versioned.getValue()))); } else if(metadataKey.compareTo(MetadataStore.STORES_KEY) == 0) { metadataValues.add(new StoreDefinitionsMapper().readStoreList(new StringReader(versioned.getValue()))); @@ -1620,11 +1634,22 @@ private static void executeDeletePartitions(Integer nodeId, private static void executeQueryKeys(final Integer nodeId, AdminClient adminClient, List storeNames, - List keys) throws IOException { - Serializer serializer = new StringSerializer(); + List keys, + boolean useAscii) throws IOException { List listKeys = new ArrayList(); + Serializer serializer = new StringSerializer(); for(String key: keys) { - listKeys.add(new ByteArray(serializer.toBytes(key))); + try { + if(useAscii) { + listKeys.add(new ByteArray(serializer.toBytes(key))); + } else { + listKeys.add(new ByteArray(ByteUtils.fromHexString(key))); + } + } catch(DecoderException de) { + System.err.println("Error decoding key " + key); + de.printStackTrace(); + return; + } } for(final String storeName: storeNames) { final Iterator iterator = adminClient.streamingOps.queryKeys(nodeId.intValue(), diff --git a/src/java/voldemort/client/ClientConfig.java b/src/java/voldemort/client/ClientConfig.java index eb62ccc74b..568a0bcb6b 100644 --- a/src/java/voldemort/client/ClientConfig.java +++ b/src/java/voldemort/client/ClientConfig.java @@ -65,6 +65,14 @@ public class ClientConfig { private volatile boolean enablePipelineRoutedStore = true; private volatile int clientZoneId = Zone.DEFAULT_ZONE_ID; + /* + * Following properties are required for the Fat client wrapper to be + * embedded inside the CoordinatorService + */ + private volatile int fatClientWrapperMaxPoolSize = 20; + private volatile int fatClientWrapperCorePoolSize = 20; + private volatile int fatClientWrapperKeepAliveInSecs = 60; + /* * The following are only used with a non pipe line routed, i.e non NIO * based client @@ -165,6 +173,9 @@ public ClientConfig() {} public static final String ENABLE_COMPRESSION_LAYER = "enable_compression_layer"; public static final String ENABLE_SERIALIZATION_LAYER = "enable_serialization_layer"; public static final String ENABLE_INCONSISTENCY_RESOLVING_LAYER = "enable_inconsistency_resolving_layer"; + public static final String FAT_CLIENT_WRAPPER_MAX_POOL_SIZE_PROPERTY = "fat_client_wrapper_max_pool_size"; + public static final String FAT_CLIENT_WRAPPER_CORE_POOL_SIZE_PROPERTY = "fat_client_wrapper_core_pool_size"; + public static final String FAT_CLIENT_WRAPPER_POOL_KEEPALIVE_IN_SECS = "fat_client_wrapper_pool_keepalive_in_secs"; /** * Instantiate the client config using a properties file @@ -380,6 +391,21 @@ private void setProperties(Properties properties) { this.setEnableInconsistencyResolvingLayer(props.getBoolean(ENABLE_INCONSISTENCY_RESOLVING_LAYER)); } + if(props.containsKey(FAT_CLIENT_WRAPPER_CORE_POOL_SIZE_PROPERTY)) { + this.setFatClientWrapperCorePoolSize(props.getInt(FAT_CLIENT_WRAPPER_CORE_POOL_SIZE_PROPERTY, + this.fatClientWrapperCorePoolSize)); + } + + if(props.containsKey(FAT_CLIENT_WRAPPER_MAX_POOL_SIZE_PROPERTY)) { + this.setFatClientWrapperMaxPoolSize(props.getInt(FAT_CLIENT_WRAPPER_MAX_POOL_SIZE_PROPERTY, + this.fatClientWrapperMaxPoolSize)); + } + + 
if(props.containsKey(FAT_CLIENT_WRAPPER_POOL_KEEPALIVE_IN_SECS)) { + this.setFatClientWrapperKeepAliveInSecs(props.getInt(FAT_CLIENT_WRAPPER_POOL_KEEPALIVE_IN_SECS, + this.fatClientWrapperKeepAliveInSecs)); + } + } /** @@ -1105,6 +1131,45 @@ public ClientConfig setEnableInconsistencyResolvingLayer(boolean enableInconsist return this; } + public int getFatClientWrapperMaxPoolSize() { + return fatClientWrapperMaxPoolSize; + } + + /** + * @param fatClientWrapperMaxPoolSize Defines the Maximum pool size for the + * thread pool used in the Fat client wrapper + */ + public ClientConfig setFatClientWrapperMaxPoolSize(int fatClientWrapperMaxPoolSize) { + this.fatClientWrapperMaxPoolSize = fatClientWrapperMaxPoolSize; + return this; + } + + public int getFatClientWrapperCorePoolSize() { + return fatClientWrapperCorePoolSize; + } + + /** + * @param fatClientWrapperCorePoolSize Defines the Core pool size for the + * thread pool used in the Fat client wrapper + */ + public ClientConfig setFatClientWrapperCorePoolSize(int fatClientWrapperCorePoolSize) { + this.fatClientWrapperCorePoolSize = fatClientWrapperCorePoolSize; + return this; + } + + public int getFatClientWrapperKeepAliveInSecs() { + return fatClientWrapperKeepAliveInSecs; + } + + /** + * @param fatClientWrapperKeepAliveInSecs Defines the Keep alive period in + * seconds for the thread pool used in the Fat client wrapper + */ + public ClientConfig setFatClientWrapperKeepAliveInSecs(int fatClientWrapperKeepAliveInSecs) { + this.fatClientWrapperKeepAliveInSecs = fatClientWrapperKeepAliveInSecs; + return this; + } + public String toString() { StringBuilder clientConfigInfo = new StringBuilder(); clientConfigInfo.append("Max connections per node: " + this.maxConnectionsPerNode + "\n");
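Review note: the three fat-client wrapper knobs added to ClientConfig line up one-to-one with the arguments of a standard java.util.concurrent.ThreadPoolExecutor. How the CoordinatorService actually wires them is not part of this diff, so the following is only a sketch under that assumption:

    import java.util.concurrent.SynchronousQueue;
    import java.util.concurrent.ThreadPoolExecutor;
    import java.util.concurrent.TimeUnit;

    import voldemort.client.ClientConfig;

    public class FatClientPoolSketch {

        // Build a pool sized by the new ClientConfig knobs: core threads kept
        // alive, an upper bound on threads, and how long surplus threads may idle.
        public static ThreadPoolExecutor poolFor(ClientConfig config) {
            return new ThreadPoolExecutor(config.getFatClientWrapperCorePoolSize(),
                                          config.getFatClientWrapperMaxPoolSize(),
                                          config.getFatClientWrapperKeepAliveInSecs(),
                                          TimeUnit.SECONDS,
                                          new SynchronousQueue<Runnable>());
        }
    }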
diff --git a/src/java/voldemort/client/protocol/admin/AdminClient.java b/src/java/voldemort/client/protocol/admin/AdminClient.java index 38691152bf..00b977650c 100644 --- a/src/java/voldemort/client/protocol/admin/AdminClient.java +++ b/src/java/voldemort/client/protocol/admin/AdminClient.java @@ -57,6 +57,7 @@ import voldemort.client.protocol.pb.VAdminProto; import voldemort.client.protocol.pb.VAdminProto.RebalancePartitionInfoMap; import voldemort.client.protocol.pb.VProto; +import voldemort.client.protocol.pb.VProto.KeyedVersions; import voldemort.client.protocol.pb.VProto.RequestType; import voldemort.client.rebalance.RebalancePartitionsInfo; import voldemort.cluster.Cluster; @@ -73,7 +74,6 @@ import voldemort.store.StoreUtils; import voldemort.store.metadata.MetadataStore; import voldemort.store.metadata.MetadataStore.VoldemortState; -import voldemort.store.mysql.MysqlStorageConfiguration; import voldemort.store.readonly.ReadOnlyStorageConfiguration; import voldemort.store.readonly.ReadOnlyStorageFormat; import voldemort.store.readonly.ReadOnlyStorageMetadata; @@ -854,17 +854,47 @@ public void setMetadataversion(Properties newProperties) { * @param key Metadata key to update * @param value Value for the metadata key */ + public void updateRemoteMetadata(int remoteNodeId, String key, Versioned value) { - ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(key, "UTF-8")); - Versioned valueBytes = new Versioned(ByteUtils.getBytes(value.getValue(), - "UTF-8"), - value.getVersion()); + HashMap> keyValueMap = new HashMap>(); + keyValueMap.put(key, value); + + updateRemoteMetadata(remoteNodeId, keyValueMap); + } + + /* + * remoteNodeId: the node id of the remote server. keyValueMap: a map of + * metadata keys to their versioned values. + * + * This method passes multiple metadata keys to the server so that the + * stores and cluster xml can be updated atomically during rebalance. + */ + + public void updateRemoteMetadata(int remoteNodeId, + HashMap> keyValueMap) { + + ArrayList allKeyVersions = new ArrayList(); + for(Entry> entry: keyValueMap.entrySet()) { + String key = entry.getKey(); + Versioned value = entry.getValue(); + ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(key, "UTF-8")); + + Versioned valueBytes = new Versioned(ByteUtils.getBytes(value.getValue(), + "UTF-8"), + value.getVersion()); + + VProto.KeyedVersions.Builder keyedVersion = VProto.KeyedVersions.newBuilder() + .setKey(ProtoUtils.encodeBytes(keyBytes)); + keyedVersion.addVersions(ProtoUtils.encodeVersioned(valueBytes)); + allKeyVersions.add(keyedVersion.build()); + + } VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder() .setType(VAdminProto.AdminRequestType.UPDATE_METADATA) .setUpdateMetadata(VAdminProto.UpdateMetadataRequest.newBuilder() - .setKey(ByteString.copyFrom(keyBytes.get())) - .setVersioned(ProtoUtils.encodeVersioned(valueBytes)) + .addAllMetadataEntry(allKeyVersions) + .build()) .build(); VAdminProto.UpdateMetadataResponse.Builder response = rpcOps.sendAndReceive(remoteNodeId, @@ -1961,6 +1991,8 @@ public SocketStore getSocketStore(int nodeId, String storeName) { SocketStore newSocketStore = null; try { + // IGNORE_CHECKS will only work if the + // request format is protobuf newSocketStore = clientPool.create(storeName, node.getHost(), node.getSocketPort(), @@ -2118,7 +2150,8 @@ public void updateEntries(int nodeId, } /** - * Fetch key/value tuples belonging to a node with given key values + * Fetch key/value tuples from a given server, directly from storage + * engine * *

* Entries are being queried synchronously @@ -2139,7 +2172,6 @@ public Iterator queryKeys(int nodeId, try { store = adminStoreClient.getSocketStore(nodeId, storeName); - } catch(Exception e) { throw new VoldemortException(e); } @@ -2391,6 +2423,8 @@ public Versioned getRemoteRebalancerState(int nodeId) { */ public void rebalanceStateChange(Cluster existingCluster, Cluster transitionCluster, + List existingStoreDefs, + List targetStoreDefs, List rebalancePartitionPlanList, boolean swapRO, boolean changeClusterMetadata, @@ -2410,6 +2444,7 @@ public void rebalanceStateChange(Cluster existingCluster, try { individualStateChange(nodeId, transitionCluster, + targetStoreDefs, stealerNodeToPlan.get(nodeId), swapRO, changeClusterMetadata, @@ -2454,6 +2489,7 @@ public void rebalanceStateChange(Cluster existingCluster, try { individualStateChange(completedNodeId, existingCluster, + existingStoreDefs, stealerNodeToPlan.get(completedNodeId), swapRO, changeClusterMetadata, @@ -2493,6 +2529,7 @@ public void rebalanceStateChange(Cluster existingCluster, */ private void individualStateChange(int nodeId, Cluster cluster, + List storeDefs, List rebalancePartitionPlanList, boolean swapRO, boolean changeClusterMetadata, @@ -2531,6 +2568,7 @@ private void individualStateChange(int nodeId, .setChangeRebalanceState(changeRebalanceState) .setClusterString(clusterMapper.writeCluster(cluster)) .setRollback(rollback) + .setStoresString(new StoreDefinitionsMapper().writeStoreList(storeDefs)) .build(); VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder() .setRebalanceStateChange(getRebalanceStateChangeRequest) diff --git a/src/java/voldemort/client/protocol/admin/StreamingClient.java b/src/java/voldemort/client/protocol/admin/StreamingClient.java index 50430d2c40..a1c0320f37 100644 --- a/src/java/voldemort/client/protocol/admin/StreamingClient.java +++ b/src/java/voldemort/client/protocol/admin/StreamingClient.java @@ -51,21 +51,19 @@ /** * - * @author anagpal * - * The streaming API allows for send events into voldemort stores in the - * async fashion. All the partition and replication logic will be taken - * care of internally. + * The streaming API allows for sending events into voldemort stores in an async + * fashion. All the partition and replication logic will be taken care of + * internally. + - * The users is expected to provide two callbacks, one for performing - * period checkpoints and one for recovering the streaming process from - * the last checkpoint. + * The user is expected to provide two callbacks, one for performing periodic + * checkpoints and one for recovering the streaming process from the last + * checkpoint. + - * NOTE: The API is not thread safe, since if multiple threads use this - * API we cannot make any guarantees about correctness of the - * checkpointing mechanism. + * NOTE: The API is not thread safe, since if multiple threads use this API we + * cannot make any guarantees about correctness of the checkpointing mechanism. 
* - * Right now we expect this to used by a single thread per data source + * Right now we expect this to be used by a single thread per data source * */ public class StreamingClient { @@ -91,9 +89,6 @@ public class StreamingClient { // Every batch size we commit private static int CHECKPOINT_COMMIT_SIZE; - // TODO - // provide knobs to tune this - private static int TIME_COMMIT_SIZE = 30; // we have to throttle to a certain qps private static int THROTTLE_QPS; private int entriesProcessed; @@ -504,14 +499,12 @@ public synchronized void streamingPut(ByteArray key, Versioned value, St } catch(InterruptedException e1) { MARKED_BAD = true; - logger.error("Recovery Callback failed"); - e1.printStackTrace(); + logger.error("Recovery Callback failed", e1); throw new VoldemortException("Recovery Callback failed"); } catch(ExecutionException e1) { MARKED_BAD = true; - logger.error("Recovery Callback failed"); - e1.printStackTrace(); - throw new VoldemortException("Recovery Callback failed"); + logger.error("Recovery Callback failed during execution", e1); + throw new VoldemortException("Recovery Callback failed during execution"); } e.printStackTrace(); @@ -519,12 +512,9 @@ public synchronized void streamingPut(ByteArray key, Versioned value, St } - int secondsTime = calendar.get(Calendar.SECOND); - if(entriesProcessed == CHECKPOINT_COMMIT_SIZE || secondsTime % TIME_COMMIT_SIZE == 0) { + if(entriesProcessed == CHECKPOINT_COMMIT_SIZE) { entriesProcessed = 0; - commitToVoldemort(); - } throttler.maybeThrottle(1); @@ -542,6 +532,45 @@ public synchronized void commitToVoldemort() { commitToVoldemort(storeNames); } + /** + * Reset streaming session by unmarking it as bad + */ + public void unmarkBad() { + MARKED_BAD = false; + } + + /** + * Mark a node as blacklisted + * + * @param nodeId Integer node id of the node to be blacklisted + */ + + @SuppressWarnings({ "rawtypes", "unchecked" }) + public void blacklistNode(int nodeId) { + Collection nodesInCluster = adminClient.getAdminClientCluster().getNodes(); + + if(blackListedNodes == null) { + blackListedNodes = new ArrayList(); + } + blackListedNodes.add(nodeId); + + for(Node node: nodesInCluster) { + + if(node.getId() == nodeId) { + nodesToStream.remove(node); + break; + } + + } + + for(String store: storeNames) { + SocketAndStreams sands = nodeIdStoreToSocketAndStreams.get(new Pair(store, nodeId)); + close(sands.getSocket()); + SocketDestination destination = nodeIdStoreToSocketRequest.get(new Pair(store, nodeId)); + streamingSocketPool.checkin(destination, sands); + } + } +
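Review note: unmarkBad() and blacklistNode() let a caller keep a streaming session alive across a node failure instead of abandoning it once MARKED_BAD is set. A usage sketch (streamer stands for an already-initialized StreamingClient; node id 5 is illustrative):

    // Stop streaming to the failed node and release its sockets, then clear
    // the bad flag so subsequent streamingPut() calls are accepted again.
    streamer.blacklistNode(5);
    streamer.unmarkBad();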
/** * Flush the network buffer and write all entries to the serve. then wait * for an ack from the server. This is a blocking call. It is invoked on @@ -557,6 +586,8 @@ private void commitToVoldemort(List storeNamesToCommit) { if(logger.isDebugEnabled()) { logger.debug("Trying to commit to Voldemort"); } + + boolean hasError = false; for(Node node: nodesToStream) { for(String store: storeNamesToCommit) { @@ -576,64 +607,48 @@ private void commitToVoldemort(List storeNamesToCommit) { VAdminProto.UpdatePartitionEntriesResponse.Builder updateResponse = ProtoUtils.readToBuilder(inputStream, VAdminProto.UpdatePartitionEntriesResponse.newBuilder()); if(updateResponse.hasError()) { - logger.warn("Invoking the Recovery Callback"); - Future future = streamingresults.submit(recoveryCallback); - try { - future.get(); - - } catch(InterruptedException e1) { - MARKED_BAD = true; - logger.error("Recovery Callback failed"); - e1.printStackTrace(); - throw new VoldemortException("Recovery Callback failed"); - } catch(ExecutionException e1) { - MARKED_BAD = true; - logger.error("Recovery Callback failed"); - e1.printStackTrace(); - throw new VoldemortException("Recovery Callback failed"); - } - } else { - if(logger.isDebugEnabled()) { - logger.debug("Commit successful"); - logger.debug("calling checkpoint callback"); - } - Future future = streamingresults.submit(checkpointCallback); - try { - future.get(); - - } catch(InterruptedException e1) { - - logger.warn("Checkpoint callback failed!"); - e1.printStackTrace(); - } catch(ExecutionException e1) { - logger.warn("Checkpoint callback failed!"); - e1.printStackTrace(); - } + hasError = true; } } catch(IOException e) { + logger.error("Exception during commit", e); + hasError = true; + } + } - logger.warn("Invoking the Recovery Callback"); - Future future = streamingresults.submit(recoveryCallback); - try { - future.get(); - - } catch(InterruptedException e1) { - MARKED_BAD = true; - logger.error("Recovery Callback failed"); - e1.printStackTrace(); - throw new VoldemortException("Recovery Callback failed"); - } catch(ExecutionException e1) { - MARKED_BAD = true; - logger.error("Recovery Callback failed"); - e1.printStackTrace(); - throw new VoldemortException("Recovery Callback failed"); - } + } - e.printStackTrace(); - } + // Invoke the callbacks once per commit pass, not once per node/store pair + if(hasError) { + + logger.warn("Invoking the Recovery Callback"); + Future future = streamingresults.submit(recoveryCallback); + try { + future.get(); + + } catch(InterruptedException e1) { + MARKED_BAD = true; + logger.error("Recovery Callback failed", e1); + throw new VoldemortException("Recovery Callback failed"); + } catch(ExecutionException e1) { + MARKED_BAD = true; + logger.error("Recovery Callback failed during execution", e1); + throw new VoldemortException("Recovery Callback failed during execution"); + } + } else { + if(logger.isDebugEnabled()) { + logger.debug("Commit successful"); + logger.debug("calling checkpoint callback"); } + Future future = streamingresults.submit(checkpointCallback); + try { + future.get(); + } catch(InterruptedException e1) { + logger.warn("Checkpoint callback failed!", e1); + } catch(ExecutionException e1) { + logger.warn("Checkpoint callback failed during execution!", e1); + } } } @@ -654,11 +669,9 @@ public synchronized void closeStreamingSessions(Callable resetCheckpointCallback future.get(); } catch(InterruptedException e1) { - // TODO Auto-generated catch block - e1.printStackTrace(); + logger.warn("Reset checkpoint callback interrupted", e1); } catch(ExecutionException e1) { - // TODO Auto-generated catch block - e1.printStackTrace(); + logger.warn("Reset checkpoint callback failed during execution", e1); } }
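Review note: this hunk is the "redundant callback invocation" fix from the release notes. Previously the recovery and checkpoint callbacks ran inside the per-node, per-store loop, so a single commit could invoke them once per stream. The rewritten flow records failures and invokes exactly one callback per commit pass. In outline (flushAndAwaitAck and the *CallbackOnce helpers are hypothetical names for the logic inlined above):

    // One commit pass: flush every (node, store) stream, remember failures,
    // then invoke exactly one callback for the whole batch.
    boolean hasError = false;
    for(Node node: nodesToStream) {
        for(String store: storeNamesToCommit) {
            try {
                flushAndAwaitAck(node, store);
            } catch(IOException e) {
                logger.error("Exception during commit", e);
                hasError = true;
            }
        }
    }
    if(hasError) {
        runRecoveryCallbackOnce();   // may set MARKED_BAD and throw
    } else {
        runCheckpointCallbackOnce();
    }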
diff --git a/src/java/voldemort/client/protocol/pb/ProtoUtils.java b/src/java/voldemort/client/protocol/pb/ProtoUtils.java index 7eef3359b0..ede67945a5 100644 --- a/src/java/voldemort/client/protocol/pb/ProtoUtils.java +++ b/src/java/voldemort/client/protocol/pb/ProtoUtils.java @@ -31,6 +31,7 @@ import voldemort.client.protocol.pb.VAdminProto.PerStorePartitionTuple; import voldemort.client.protocol.pb.VAdminProto.ROStoreVersionDirMap; import voldemort.client.protocol.pb.VAdminProto.RebalancePartitionInfoMap; +import voldemort.client.protocol.pb.VProto.KeyedVersions; import voldemort.client.rebalance.RebalancePartitionsInfo; import voldemort.store.ErrorCodeMapper; import voldemort.utils.ByteArray; @@ -198,6 +199,17 @@ public static Versioned decodeVersioned(VProto.Versioned versioned) { decodeClock(versioned.getVersion())); } + /** + * Given a list of value versions for the metadata keys, we are just + * interested in the value at index 0. This is because even if we have to + * update the cluster.xml, we marshal a single key into a versioned list. + * Hence we just look at the value at index 0. + * + */ + public static Versioned decodeVersionedMetadataKeyValue(KeyedVersions keyValue) { + return decodeVersioned(keyValue.getVersions(0)); + } + public static List> decodeVersions(List versioned) { List> values = new ArrayList>(versioned.size()); for(VProto.Versioned v: versioned) diff --git a/src/java/voldemort/client/protocol/pb/VAdminProto.java b/src/java/voldemort/client/protocol/pb/VAdminProto.java index e34aec3563..cca0257143 100644 --- a/src/java/voldemort/client/protocol/pb/VAdminProto.java +++ b/src/java/voldemort/client/protocol/pb/VAdminProto.java @@ -821,38 +821,32 @@ public UpdateMetadataRequest getDefaultInstanceForType() { return voldemort.client.protocol.pb.VAdminProto.internal_static_voldemort_UpdateMetadataRequest_fieldAccessorTable; } - // required bytes key = 1; - public static final int KEY_FIELD_NUMBER = 1; - private boolean hasKey; - private com.google.protobuf.ByteString key_ = com.google.protobuf.ByteString.EMPTY; - public boolean hasKey() { return hasKey; } - public com.google.protobuf.ByteString getKey() { return key_; } - - // required .voldemort.Versioned versioned = 2; - public static final int VERSIONED_FIELD_NUMBER = 2; - private boolean hasVersioned; - private voldemort.client.protocol.pb.VProto.Versioned versioned_; - public boolean hasVersioned() { return hasVersioned; } - public voldemort.client.protocol.pb.VProto.Versioned getVersioned() { return versioned_; } + // repeated .voldemort.KeyedVersions metadataEntry = 1; + public static final int METADATAENTRY_FIELD_NUMBER = 1; + private java.util.List metadataEntry_ = + java.util.Collections.emptyList(); + public java.util.List getMetadataEntryList() { + return metadataEntry_; + } + public int getMetadataEntryCount() { return metadataEntry_.size(); } + public voldemort.client.protocol.pb.VProto.KeyedVersions getMetadataEntry(int index) { + return metadataEntry_.get(index); + } private void initFields() { - versioned_ = voldemort.client.protocol.pb.VProto.Versioned.getDefaultInstance(); } public final boolean isInitialized() { - if (!hasKey) return false; - if (!hasVersioned) return false; - if (!getVersioned().isInitialized()) return false; + for (voldemort.client.protocol.pb.VProto.KeyedVersions element : getMetadataEntryList()) { + if (!element.isInitialized()) return false; + } return true; } public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException { 
getSerializedSize(); - if (hasKey()) { - output.writeBytes(1, getKey()); - } - if (hasVersioned()) { - output.writeMessage(2, getVersioned()); + for (voldemort.client.protocol.pb.VProto.KeyedVersions element : getMetadataEntryList()) { + output.writeMessage(1, element); } getUnknownFields().writeTo(output); } @@ -863,13 +857,9 @@ public int getSerializedSize() { if (size != -1) return size; size = 0; - if (hasKey()) { + for (voldemort.client.protocol.pb.VProto.KeyedVersions element : getMetadataEntryList()) { size += com.google.protobuf.CodedOutputStream - .computeBytesSize(1, getKey()); - } - if (hasVersioned()) { - size += com.google.protobuf.CodedOutputStream - .computeMessageSize(2, getVersioned()); + .computeMessageSize(1, element); } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; @@ -1013,6 +1003,10 @@ public voldemort.client.protocol.pb.VAdminProto.UpdateMetadataRequest buildParti throw new IllegalStateException( "build() has already been called on this Builder."); } + if (result.metadataEntry_ != java.util.Collections.EMPTY_LIST) { + result.metadataEntry_ = + java.util.Collections.unmodifiableList(result.metadataEntry_); + } voldemort.client.protocol.pb.VAdminProto.UpdateMetadataRequest returnMe = result; result = null; return returnMe; @@ -1029,11 +1023,11 @@ public Builder mergeFrom(com.google.protobuf.Message other) { public Builder mergeFrom(voldemort.client.protocol.pb.VAdminProto.UpdateMetadataRequest other) { if (other == voldemort.client.protocol.pb.VAdminProto.UpdateMetadataRequest.getDefaultInstance()) return this; - if (other.hasKey()) { - setKey(other.getKey()); - } - if (other.hasVersioned()) { - mergeVersioned(other.getVersioned()); + if (!other.metadataEntry_.isEmpty()) { + if (result.metadataEntry_.isEmpty()) { + result.metadataEntry_ = new java.util.ArrayList(); + } + result.metadataEntry_.addAll(other.metadataEntry_); } this.mergeUnknownFields(other.getUnknownFields()); return this; @@ -1061,16 +1055,9 @@ public Builder mergeFrom( break; } case 10: { - setKey(input.readBytes()); - break; - } - case 18: { - voldemort.client.protocol.pb.VProto.Versioned.Builder subBuilder = voldemort.client.protocol.pb.VProto.Versioned.newBuilder(); - if (hasVersioned()) { - subBuilder.mergeFrom(getVersioned()); - } + voldemort.client.protocol.pb.VProto.KeyedVersions.Builder subBuilder = voldemort.client.protocol.pb.VProto.KeyedVersions.newBuilder(); input.readMessage(subBuilder, extensionRegistry); - setVersioned(subBuilder.buildPartial()); + addMetadataEntry(subBuilder.buildPartial()); break; } } @@ -1078,61 +1065,54 @@ public Builder mergeFrom( } - // required bytes key = 1; - public boolean hasKey() { - return result.hasKey(); + // repeated .voldemort.KeyedVersions metadataEntry = 1; + public java.util.List getMetadataEntryList() { + return java.util.Collections.unmodifiableList(result.metadataEntry_); } - public com.google.protobuf.ByteString getKey() { - return result.getKey(); + public int getMetadataEntryCount() { + return result.getMetadataEntryCount(); } - public Builder setKey(com.google.protobuf.ByteString value) { + public voldemort.client.protocol.pb.VProto.KeyedVersions getMetadataEntry(int index) { + return result.getMetadataEntry(index); + } + public Builder setMetadataEntry(int index, voldemort.client.protocol.pb.VProto.KeyedVersions value) { if (value == null) { - throw new NullPointerException(); - } - result.hasKey = true; - result.key_ = value; + throw new NullPointerException(); + } + result.metadataEntry_.set(index, value); 
return this; } - public Builder clearKey() { - result.hasKey = false; - result.key_ = getDefaultInstance().getKey(); + public Builder setMetadataEntry(int index, voldemort.client.protocol.pb.VProto.KeyedVersions.Builder builderForValue) { + result.metadataEntry_.set(index, builderForValue.build()); return this; } - - // required .voldemort.Versioned versioned = 2; - public boolean hasVersioned() { - return result.hasVersioned(); - } - public voldemort.client.protocol.pb.VProto.Versioned getVersioned() { - return result.getVersioned(); - } - public Builder setVersioned(voldemort.client.protocol.pb.VProto.Versioned value) { + public Builder addMetadataEntry(voldemort.client.protocol.pb.VProto.KeyedVersions value) { if (value == null) { throw new NullPointerException(); } - result.hasVersioned = true; - result.versioned_ = value; + if (result.metadataEntry_.isEmpty()) { + result.metadataEntry_ = new java.util.ArrayList(); + } + result.metadataEntry_.add(value); return this; } - public Builder setVersioned(voldemort.client.protocol.pb.VProto.Versioned.Builder builderForValue) { - result.hasVersioned = true; - result.versioned_ = builderForValue.build(); + public Builder addMetadataEntry(voldemort.client.protocol.pb.VProto.KeyedVersions.Builder builderForValue) { + if (result.metadataEntry_.isEmpty()) { + result.metadataEntry_ = new java.util.ArrayList(); + } + result.metadataEntry_.add(builderForValue.build()); return this; } - public Builder mergeVersioned(voldemort.client.protocol.pb.VProto.Versioned value) { - if (result.hasVersioned() && - result.versioned_ != voldemort.client.protocol.pb.VProto.Versioned.getDefaultInstance()) { - result.versioned_ = - voldemort.client.protocol.pb.VProto.Versioned.newBuilder(result.versioned_).mergeFrom(value).buildPartial(); - } else { - result.versioned_ = value; + public Builder addAllMetadataEntry( + java.lang.Iterable values) { + if (result.metadataEntry_.isEmpty()) { + result.metadataEntry_ = new java.util.ArrayList(); } - result.hasVersioned = true; + super.addAll(values, result.metadataEntry_); return this; } - public Builder clearVersioned() { - result.hasVersioned = false; - result.versioned_ = voldemort.client.protocol.pb.VProto.Versioned.getDefaultInstance(); + public Builder clearMetadataEntry() { + result.metadataEntry_ = java.util.Collections.emptyList(); return this; } @@ -18185,29 +18165,36 @@ public voldemort.client.protocol.pb.VAdminProto.RebalancePartitionInfoMap getReb public boolean hasClusterString() { return hasClusterString; } public java.lang.String getClusterString() { return clusterString_; } - // required bool swap_ro = 3; - public static final int SWAP_RO_FIELD_NUMBER = 3; + // required string stores_string = 3; + public static final int STORES_STRING_FIELD_NUMBER = 3; + private boolean hasStoresString; + private java.lang.String storesString_ = ""; + public boolean hasStoresString() { return hasStoresString; } + public java.lang.String getStoresString() { return storesString_; } + + // required bool swap_ro = 4; + public static final int SWAP_RO_FIELD_NUMBER = 4; private boolean hasSwapRo; private boolean swapRo_ = false; public boolean hasSwapRo() { return hasSwapRo; } public boolean getSwapRo() { return swapRo_; } - // required bool change_cluster_metadata = 4; - public static final int CHANGE_CLUSTER_METADATA_FIELD_NUMBER = 4; + // required bool change_cluster_metadata = 5; + public static final int CHANGE_CLUSTER_METADATA_FIELD_NUMBER = 5; private boolean hasChangeClusterMetadata; private boolean changeClusterMetadata_ = 
false; public boolean hasChangeClusterMetadata() { return hasChangeClusterMetadata; } public boolean getChangeClusterMetadata() { return changeClusterMetadata_; } - // required bool change_rebalance_state = 5; - public static final int CHANGE_REBALANCE_STATE_FIELD_NUMBER = 5; + // required bool change_rebalance_state = 6; + public static final int CHANGE_REBALANCE_STATE_FIELD_NUMBER = 6; private boolean hasChangeRebalanceState; private boolean changeRebalanceState_ = false; public boolean hasChangeRebalanceState() { return hasChangeRebalanceState; } public boolean getChangeRebalanceState() { return changeRebalanceState_; } - // required bool rollback = 6; - public static final int ROLLBACK_FIELD_NUMBER = 6; + // required bool rollback = 7; + public static final int ROLLBACK_FIELD_NUMBER = 7; private boolean hasRollback; private boolean rollback_ = false; public boolean hasRollback() { return hasRollback; } @@ -18217,6 +18204,7 @@ private void initFields() { } public final boolean isInitialized() { if (!hasClusterString) return false; + if (!hasStoresString) return false; if (!hasSwapRo) return false; if (!hasChangeClusterMetadata) return false; if (!hasChangeRebalanceState) return false; @@ -18236,17 +18224,20 @@ public void writeTo(com.google.protobuf.CodedOutputStream output) if (hasClusterString()) { output.writeString(2, getClusterString()); } + if (hasStoresString()) { + output.writeString(3, getStoresString()); + } if (hasSwapRo()) { - output.writeBool(3, getSwapRo()); + output.writeBool(4, getSwapRo()); } if (hasChangeClusterMetadata()) { - output.writeBool(4, getChangeClusterMetadata()); + output.writeBool(5, getChangeClusterMetadata()); } if (hasChangeRebalanceState()) { - output.writeBool(5, getChangeRebalanceState()); + output.writeBool(6, getChangeRebalanceState()); } if (hasRollback()) { - output.writeBool(6, getRollback()); + output.writeBool(7, getRollback()); } getUnknownFields().writeTo(output); } @@ -18265,21 +18256,25 @@ public int getSerializedSize() { size += com.google.protobuf.CodedOutputStream .computeStringSize(2, getClusterString()); } + if (hasStoresString()) { + size += com.google.protobuf.CodedOutputStream + .computeStringSize(3, getStoresString()); + } if (hasSwapRo()) { size += com.google.protobuf.CodedOutputStream - .computeBoolSize(3, getSwapRo()); + .computeBoolSize(4, getSwapRo()); } if (hasChangeClusterMetadata()) { size += com.google.protobuf.CodedOutputStream - .computeBoolSize(4, getChangeClusterMetadata()); + .computeBoolSize(5, getChangeClusterMetadata()); } if (hasChangeRebalanceState()) { size += com.google.protobuf.CodedOutputStream - .computeBoolSize(5, getChangeRebalanceState()); + .computeBoolSize(6, getChangeRebalanceState()); } if (hasRollback()) { size += com.google.protobuf.CodedOutputStream - .computeBoolSize(6, getRollback()); + .computeBoolSize(7, getRollback()); } size += getUnknownFields().getSerializedSize(); memoizedSerializedSize = size; @@ -18452,6 +18447,9 @@ public Builder mergeFrom(voldemort.client.protocol.pb.VAdminProto.RebalanceState if (other.hasClusterString()) { setClusterString(other.getClusterString()); } + if (other.hasStoresString()) { + setStoresString(other.getStoresString()); + } if (other.hasSwapRo()) { setSwapRo(other.getSwapRo()); } @@ -18499,19 +18497,23 @@ public Builder mergeFrom( setClusterString(input.readString()); break; } - case 24: { - setSwapRo(input.readBool()); + case 26: { + setStoresString(input.readString()); break; } case 32: { - setChangeClusterMetadata(input.readBool()); + 
setSwapRo(input.readBool()); break; } case 40: { - setChangeRebalanceState(input.readBool()); + setChangeClusterMetadata(input.readBool()); break; } case 48: { + setChangeRebalanceState(input.readBool()); + break; + } + case 56: { setRollback(input.readBool()); break; } @@ -18592,7 +18594,28 @@ public Builder clearClusterString() { return this; } - // required bool swap_ro = 3; + // required string stores_string = 3; + public boolean hasStoresString() { + return result.hasStoresString(); + } + public java.lang.String getStoresString() { + return result.getStoresString(); + } + public Builder setStoresString(java.lang.String value) { + if (value == null) { + throw new NullPointerException(); + } + result.hasStoresString = true; + result.storesString_ = value; + return this; + } + public Builder clearStoresString() { + result.hasStoresString = false; + result.storesString_ = getDefaultInstance().getStoresString(); + return this; + } + + // required bool swap_ro = 4; public boolean hasSwapRo() { return result.hasSwapRo(); } @@ -18610,7 +18633,7 @@ public Builder clearSwapRo() { return this; } - // required bool change_cluster_metadata = 4; + // required bool change_cluster_metadata = 5; public boolean hasChangeClusterMetadata() { return result.hasChangeClusterMetadata(); } @@ -18628,7 +18651,7 @@ public Builder clearChangeClusterMetadata() { return this; } - // required bool change_rebalance_state = 5; + // required bool change_rebalance_state = 6; public boolean hasChangeRebalanceState() { return result.hasChangeRebalanceState(); } @@ -18646,7 +18669,7 @@ public Builder clearChangeRebalanceState() { return this; } - // required bool rollback = 6; + // required bool rollback = 7; public boolean hasRollback() { return result.hasRollback(); } @@ -23141,198 +23164,198 @@ public Builder clearReserveMemory() { "emort-client.proto\"!\n\022GetMetadataRequest" + "\022\013\n\003key\030\001 \002(\014\"]\n\023GetMetadataResponse\022%\n\007" + "version\030\001 \001(\0132\024.voldemort.Versioned\022\037\n\005e" + - "rror\030\002 \001(\0132\020.voldemort.Error\"M\n\025UpdateMe" + - "tadataRequest\022\013\n\003key\030\001 \002(\014\022\'\n\tversioned\030" + - "\002 \002(\0132\024.voldemort.Versioned\"9\n\026UpdateMet" + - "adataResponse\022\037\n\005error\030\001 \001(\0132\020.voldemort" + - ".Error\"7\n\tFileEntry\022\021\n\tfile_name\030\001 \002(\t\022\027" + - "\n\017file_size_bytes\030\002 \002(\003\"F\n\016PartitionEntr", - "y\022\013\n\003key\030\001 \002(\014\022\'\n\tversioned\030\002 \002(\0132\024.vold" + - "emort.Versioned\"\216\001\n\035UpdatePartitionEntri" + - "esRequest\022\r\n\005store\030\001 \002(\t\0222\n\017partition_en" + - "try\030\002 \002(\0132\031.voldemort.PartitionEntry\022*\n\006" + - "filter\030\003 \001(\0132\032.voldemort.VoldemortFilter" + - "\"A\n\036UpdatePartitionEntriesResponse\022\037\n\005er" + - "ror\030\001 \001(\0132\020.voldemort.Error\"-\n\017Voldemort" + - "Filter\022\014\n\004name\030\001 \002(\t\022\014\n\004data\030\002 \002(\014\"\257\001\n\030U" + - "pdateSlopEntriesRequest\022\r\n\005store\030\001 \002(\t\022\013" + - "\n\003key\030\002 \002(\014\022\'\n\007version\030\003 \002(\0132\026.voldemort", - ".VectorClock\022,\n\014request_type\030\004 \002(\0162\026.vol" + - "demort.RequestType\022\r\n\005value\030\005 \001(\014\022\021\n\ttra" + - "nsform\030\006 \001(\014\"<\n\031UpdateSlopEntriesRespons" + - "e\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"d\n\032Fe" + - "tchPartitionFilesRequest\022\r\n\005store\030\001 \002(\t\022" + - 
"7\n\024replica_to_partition\030\002 \003(\0132\031.voldemor" + - "t.PartitionTuple\"\244\002\n\034FetchPartitionEntri" + - "esRequest\0227\n\024replica_to_partition\030\001 \003(\0132" + - "\031.voldemort.PartitionTuple\022\r\n\005store\030\002 \002(" + - "\t\022*\n\006filter\030\003 \001(\0132\032.voldemort.VoldemortF", - "ilter\022\024\n\014fetch_values\030\004 \001(\010\022*\n\"OBSOLETE_" + - "_DO_NOT_USE__skip_records\030\005 \001(\003\022\027\n\017initi" + - "al_cluster\030\006 \001(\t\022\026\n\016fetch_orphaned\030\007 \001(\010" + - "\022\035\n\025records_per_partition\030\010 \001(\003\"\201\001\n\035Fetc" + - "hPartitionEntriesResponse\0222\n\017partition_e" + - "ntry\030\001 \001(\0132\031.voldemort.PartitionEntry\022\013\n" + - "\003key\030\002 \001(\014\022\037\n\005error\030\003 \001(\0132\020.voldemort.Er" + - "ror\"\254\001\n\035DeletePartitionEntriesRequest\022\r\n" + - "\005store\030\001 \002(\t\0227\n\024replica_to_partition\030\002 \003" + - "(\0132\031.voldemort.PartitionTuple\022*\n\006filter\030", - "\003 \001(\0132\032.voldemort.VoldemortFilter\022\027\n\017ini" + - "tial_cluster\030\004 \001(\t\"P\n\036DeletePartitionEnt" + - "riesResponse\022\r\n\005count\030\001 \001(\003\022\037\n\005error\030\002 \001" + - "(\0132\020.voldemort.Error\"\317\001\n\035InitiateFetchAn" + - "dUpdateRequest\022\017\n\007node_id\030\001 \002(\005\022\r\n\005store" + - "\030\002 \002(\t\022*\n\006filter\030\003 \001(\0132\032.voldemort.Volde" + - "mortFilter\0227\n\024replica_to_partition\030\004 \003(\013" + - "2\031.voldemort.PartitionTuple\022\027\n\017initial_c" + - "luster\030\005 \001(\t\022\020\n\010optimize\030\006 \001(\010\"1\n\033AsyncO" + - "perationStatusRequest\022\022\n\nrequest_id\030\001 \002(", - "\005\"/\n\031AsyncOperationStopRequest\022\022\n\nreques" + - "t_id\030\001 \002(\005\"=\n\032AsyncOperationStopResponse" + - "\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"2\n\031Asy" + - "ncOperationListRequest\022\025\n\rshow_complete\030" + - "\002 \002(\010\"R\n\032AsyncOperationListResponse\022\023\n\013r" + - "equest_ids\030\001 \003(\005\022\037\n\005error\030\002 \001(\0132\020.voldem" + - "ort.Error\":\n\016PartitionTuple\022\024\n\014replica_t" + - "ype\030\001 \002(\005\022\022\n\npartitions\030\002 \003(\005\"e\n\026PerStor" + - "ePartitionTuple\022\022\n\nstore_name\030\001 \002(\t\0227\n\024r" + - "eplica_to_partition\030\002 \003(\0132\031.voldemort.Pa", - "rtitionTuple\"\370\001\n\031RebalancePartitionInfoM" + - "ap\022\022\n\nstealer_id\030\001 \002(\005\022\020\n\010donor_id\030\002 \002(\005" + - "\022\017\n\007attempt\030\003 \002(\005\022C\n\030replica_to_add_part" + - "ition\030\004 \003(\0132!.voldemort.PerStorePartitio" + - "nTuple\022F\n\033replica_to_delete_partition\030\005 " + - "\003(\0132!.voldemort.PerStorePartitionTuple\022\027" + - "\n\017initial_cluster\030\006 \002(\t\"f\n\034InitiateRebal" + - "anceNodeRequest\022F\n\030rebalance_partition_i" + - "nfo\030\001 \002(\0132$.voldemort.RebalancePartition" + - "InfoMap\"m\n#InitiateRebalanceNodeOnDonorR", - "equest\022F\n\030rebalance_partition_info\030\001 \003(\013" + - "2$.voldemort.RebalancePartitionInfoMap\"\212" + - "\001\n\034AsyncOperationStatusResponse\022\022\n\nreque" + - "st_id\030\001 \001(\005\022\023\n\013description\030\002 \001(\t\022\016\n\006stat" + - "us\030\003 \001(\t\022\020\n\010complete\030\004 \001(\010\022\037\n\005error\030\005 \001(" + - "\0132\020.voldemort.Error\"\'\n\026TruncateEntriesRe" + - "quest\022\r\n\005store\030\001 
\002(\t\":\n\027TruncateEntriesR" + - "esponse\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error" + - "\"*\n\017AddStoreRequest\022\027\n\017storeDefinition\030\001" + - " \002(\t\"3\n\020AddStoreResponse\022\037\n\005error\030\001 \001(\0132", - "\020.voldemort.Error\"\'\n\022DeleteStoreRequest\022" + - "\021\n\tstoreName\030\001 \002(\t\"6\n\023DeleteStoreRespons" + - "e\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"P\n\021Fe" + - "tchStoreRequest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\ts" + - "tore_dir\030\002 \002(\t\022\024\n\014push_version\030\003 \001(\003\"9\n\020" + - "SwapStoreRequest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\t" + - "store_dir\030\002 \002(\t\"P\n\021SwapStoreResponse\022\037\n\005" + - "error\030\001 \001(\0132\020.voldemort.Error\022\032\n\022previou" + - "s_store_dir\030\002 \001(\t\"@\n\024RollbackStoreReques" + - "t\022\022\n\nstore_name\030\001 \002(\t\022\024\n\014push_version\030\002 ", - "\002(\003\"8\n\025RollbackStoreResponse\022\037\n\005error\030\001 " + - "\001(\0132\020.voldemort.Error\"&\n\020RepairJobReques" + - "t\022\022\n\nstore_name\030\001 \001(\t\"4\n\021RepairJobRespon" + - "se\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"=\n\024R" + - "OStoreVersionDirMap\022\022\n\nstore_name\030\001 \002(\t\022" + - "\021\n\tstore_dir\030\002 \002(\t\"/\n\031GetROMaxVersionDir" + - "Request\022\022\n\nstore_name\030\001 \003(\t\"y\n\032GetROMaxV" + - "ersionDirResponse\022:\n\021ro_store_versions\030\001" + - " \003(\0132\037.voldemort.ROStoreVersionDirMap\022\037\n" + - "\005error\030\002 \001(\0132\020.voldemort.Error\"3\n\035GetROC", - "urrentVersionDirRequest\022\022\n\nstore_name\030\001 " + - "\003(\t\"}\n\036GetROCurrentVersionDirResponse\022:\n" + - "\021ro_store_versions\030\001 \003(\0132\037.voldemort.ROS" + - "toreVersionDirMap\022\037\n\005error\030\002 \001(\0132\020.volde" + - "mort.Error\"/\n\031GetROStorageFormatRequest\022" + - "\022\n\nstore_name\030\001 \003(\t\"y\n\032GetROStorageForma" + - "tResponse\022:\n\021ro_store_versions\030\001 \003(\0132\037.v" + - "oldemort.ROStoreVersionDirMap\022\037\n\005error\030\002" + - " \001(\0132\020.voldemort.Error\"@\n\027FailedFetchSto" + - "reRequest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\tstore_d", - "ir\030\002 \002(\t\";\n\030FailedFetchStoreResponse\022\037\n\005" + - "error\030\001 \001(\0132\020.voldemort.Error\"\346\001\n\033Rebala" + - "nceStateChangeRequest\022K\n\035rebalance_parti" + - "tion_info_list\030\001 \003(\0132$.voldemort.Rebalan" + - "cePartitionInfoMap\022\026\n\016cluster_string\030\002 \002" + - "(\t\022\017\n\007swap_ro\030\003 \002(\010\022\037\n\027change_cluster_me" + - "tadata\030\004 \002(\010\022\036\n\026change_rebalance_state\030\005" + - " \002(\010\022\020\n\010rollback\030\006 \002(\010\"?\n\034RebalanceState" + - "ChangeResponse\022\037\n\005error\030\001 \001(\0132\020.voldemor" + - "t.Error\"G\n DeleteStoreRebalanceStateRequ", - "est\022\022\n\nstore_name\030\001 \002(\t\022\017\n\007node_id\030\002 \002(\005" + - "\"D\n!DeleteStoreRebalanceStateResponse\022\037\n" + - "\005error\030\001 \001(\0132\020.voldemort.Error\"h\n\023Native" + - "BackupRequest\022\022\n\nstore_name\030\001 \002(\t\022\022\n\nbac" + - "kup_dir\030\002 \002(\t\022\024\n\014verify_files\030\003 \002(\010\022\023\n\013i" + - "ncremental\030\004 \002(\010\">\n\024ReserveMemoryRequest" + - "\022\022\n\nstore_name\030\001 \002(\t\022\022\n\nsize_in_mb\030\002 \002(\003" + - 
"\"8\n\025ReserveMemoryResponse\022\037\n\005error\030\001 \001(\013" + - "2\020.voldemort.Error\"\360\016\n\025VoldemortAdminReq" + - "uest\022)\n\004type\030\001 \002(\0162\033.voldemort.AdminRequ", - "estType\0223\n\014get_metadata\030\002 \001(\0132\035.voldemor" + - "t.GetMetadataRequest\0229\n\017update_metadata\030" + - "\003 \001(\0132 .voldemort.UpdateMetadataRequest\022" + - "J\n\030update_partition_entries\030\004 \001(\0132(.vold" + - "emort.UpdatePartitionEntriesRequest\022H\n\027f" + - "etch_partition_entries\030\005 \001(\0132\'.voldemort" + - ".FetchPartitionEntriesRequest\022J\n\030delete_" + - "partition_entries\030\006 \001(\0132(.voldemort.Dele" + - "tePartitionEntriesRequest\022K\n\031initiate_fe" + - "tch_and_update\030\007 \001(\0132(.voldemort.Initiat", - "eFetchAndUpdateRequest\022F\n\026async_operatio" + - "n_status\030\010 \001(\0132&.voldemort.AsyncOperatio" + - "nStatusRequest\022H\n\027initiate_rebalance_nod" + - "e\030\t \001(\0132\'.voldemort.InitiateRebalanceNod" + - "eRequest\022B\n\024async_operation_stop\030\n \001(\0132$" + - ".voldemort.AsyncOperationStopRequest\022B\n\024" + - "async_operation_list\030\013 \001(\0132$.voldemort.A" + - "syncOperationListRequest\022;\n\020truncate_ent" + - "ries\030\014 \001(\0132!.voldemort.TruncateEntriesRe" + - "quest\022-\n\tadd_store\030\r \001(\0132\032.voldemort.Add", - "StoreRequest\0223\n\014delete_store\030\016 \001(\0132\035.vol" + - "demort.DeleteStoreRequest\0221\n\013fetch_store" + - "\030\017 \001(\0132\034.voldemort.FetchStoreRequest\022/\n\n" + - "swap_store\030\020 \001(\0132\033.voldemort.SwapStoreRe" + - "quest\0227\n\016rollback_store\030\021 \001(\0132\037.voldemor" + - "t.RollbackStoreRequest\022D\n\026get_ro_max_ver" + - "sion_dir\030\022 \001(\0132$.voldemort.GetROMaxVersi" + - "onDirRequest\022L\n\032get_ro_current_version_d" + - "ir\030\023 \001(\0132(.voldemort.GetROCurrentVersion" + - "DirRequest\022D\n\025fetch_partition_files\030\024 \001(", - "\0132%.voldemort.FetchPartitionFilesRequest" + - "\022@\n\023update_slop_entries\030\026 \001(\0132#.voldemor" + - "t.UpdateSlopEntriesRequest\022>\n\022failed_fet" + - "ch_store\030\030 \001(\0132\".voldemort.FailedFetchSt" + - "oreRequest\022C\n\025get_ro_storage_format\030\031 \001(" + - "\0132$.voldemort.GetROStorageFormatRequest\022" + - "F\n\026rebalance_state_change\030\032 \001(\0132&.voldem" + - "ort.RebalanceStateChangeRequest\022/\n\nrepai" + - "r_job\030\033 \001(\0132\033.voldemort.RepairJobRequest" + - "\022X\n initiate_rebalance_node_on_donor\030\034 \001", - "(\0132..voldemort.InitiateRebalanceNodeOnDo" + - "norRequest\022Q\n\034delete_store_rebalance_sta" + - "te\030\035 \001(\0132+.voldemort.DeleteStoreRebalanc" + - "eStateRequest\0225\n\rnative_backup\030\036 \001(\0132\036.v" + - "oldemort.NativeBackupRequest\0227\n\016reserve_" + - "memory\030\037 \001(\0132\037.voldemort.ReserveMemoryRe" + - "quest*\310\005\n\020AdminRequestType\022\020\n\014GET_METADA" + - "TA\020\000\022\023\n\017UPDATE_METADATA\020\001\022\034\n\030UPDATE_PART" + - "ITION_ENTRIES\020\002\022\033\n\027FETCH_PARTITION_ENTRI" + - "ES\020\003\022\034\n\030DELETE_PARTITION_ENTRIES\020\004\022\035\n\031IN", - "ITIATE_FETCH_AND_UPDATE\020\005\022\032\n\026ASYNC_OPERA" + - "TION_STATUS\020\006\022\033\n\027INITIATE_REBALANCE_NODE" + - "\020\007\022\030\n\024ASYNC_OPERATION_STOP\020\010\022\030\n\024ASYNC_OP" + - "ERATION_LIST\020\t\022\024\n\020TRUNCATE_ENTRIES\020\n\022\r\n\t" + - "ADD_STORE\020\013\022\020\n\014DELETE_STORE\020\014\022\017\n\013FETCH_S" + - 
"TORE\020\r\022\016\n\nSWAP_STORE\020\016\022\022\n\016ROLLBACK_STORE" + - "\020\017\022\032\n\026GET_RO_MAX_VERSION_DIR\020\020\022\036\n\032GET_RO" + - "_CURRENT_VERSION_DIR\020\021\022\031\n\025FETCH_PARTITIO" + - "N_FILES\020\022\022\027\n\023UPDATE_SLOP_ENTRIES\020\024\022\026\n\022FA" + - "ILED_FETCH_STORE\020\026\022\031\n\025GET_RO_STORAGE_FOR", - "MAT\020\027\022\032\n\026REBALANCE_STATE_CHANGE\020\030\022\016\n\nREP" + - "AIR_JOB\020\031\022$\n INITIATE_REBALANCE_NODE_ON_" + - "DONOR\020\032\022 \n\034DELETE_STORE_REBALANCE_STATE\020" + - "\033\022\021\n\rNATIVE_BACKUP\020\034\022\022\n\016RESERVE_MEMORY\020\035" + - "B-\n\034voldemort.client.protocol.pbB\013VAdmin" + - "ProtoH\001" + "rror\030\002 \001(\0132\020.voldemort.Error\"H\n\025UpdateMe" + + "tadataRequest\022/\n\rmetadataEntry\030\001 \003(\0132\030.v" + + "oldemort.KeyedVersions\"9\n\026UpdateMetadata" + + "Response\022\037\n\005error\030\001 \001(\0132\020.voldemort.Erro" + + "r\"7\n\tFileEntry\022\021\n\tfile_name\030\001 \002(\t\022\027\n\017fil" + + "e_size_bytes\030\002 \002(\003\"F\n\016PartitionEntry\022\013\n\003", + "key\030\001 \002(\014\022\'\n\tversioned\030\002 \002(\0132\024.voldemort" + + ".Versioned\"\216\001\n\035UpdatePartitionEntriesReq" + + "uest\022\r\n\005store\030\001 \002(\t\0222\n\017partition_entry\030\002" + + " \002(\0132\031.voldemort.PartitionEntry\022*\n\006filte" + + "r\030\003 \001(\0132\032.voldemort.VoldemortFilter\"A\n\036U" + + "pdatePartitionEntriesResponse\022\037\n\005error\030\001" + + " \001(\0132\020.voldemort.Error\"-\n\017VoldemortFilte" + + "r\022\014\n\004name\030\001 \002(\t\022\014\n\004data\030\002 \002(\014\"\257\001\n\030Update" + + "SlopEntriesRequest\022\r\n\005store\030\001 \002(\t\022\013\n\003key" + + "\030\002 \002(\014\022\'\n\007version\030\003 \002(\0132\026.voldemort.Vect", + "orClock\022,\n\014request_type\030\004 \002(\0162\026.voldemor" + + "t.RequestType\022\r\n\005value\030\005 \001(\014\022\021\n\ttransfor" + + "m\030\006 \001(\014\"<\n\031UpdateSlopEntriesResponse\022\037\n\005" + + "error\030\001 \001(\0132\020.voldemort.Error\"d\n\032FetchPa" + + "rtitionFilesRequest\022\r\n\005store\030\001 \002(\t\0227\n\024re" + + "plica_to_partition\030\002 \003(\0132\031.voldemort.Par" + + "titionTuple\"\244\002\n\034FetchPartitionEntriesReq" + + "uest\0227\n\024replica_to_partition\030\001 \003(\0132\031.vol" + + "demort.PartitionTuple\022\r\n\005store\030\002 \002(\t\022*\n\006" + + "filter\030\003 \001(\0132\032.voldemort.VoldemortFilter", + "\022\024\n\014fetch_values\030\004 \001(\010\022*\n\"OBSOLETE__DO_N" + + "OT_USE__skip_records\030\005 \001(\003\022\027\n\017initial_cl" + + "uster\030\006 \001(\t\022\026\n\016fetch_orphaned\030\007 \001(\010\022\035\n\025r" + + "ecords_per_partition\030\010 \001(\003\"\201\001\n\035FetchPart" + + "itionEntriesResponse\0222\n\017partition_entry\030" + + "\001 \001(\0132\031.voldemort.PartitionEntry\022\013\n\003key\030" + + "\002 \001(\014\022\037\n\005error\030\003 \001(\0132\020.voldemort.Error\"\254" + + "\001\n\035DeletePartitionEntriesRequest\022\r\n\005stor" + + "e\030\001 \002(\t\0227\n\024replica_to_partition\030\002 \003(\0132\031." + + "voldemort.PartitionTuple\022*\n\006filter\030\003 \001(\013", + "2\032.voldemort.VoldemortFilter\022\027\n\017initial_" + + "cluster\030\004 \001(\t\"P\n\036DeletePartitionEntriesR" + + "esponse\022\r\n\005count\030\001 \001(\003\022\037\n\005error\030\002 \001(\0132\020." 
+ + "voldemort.Error\"\317\001\n\035InitiateFetchAndUpda" + + "teRequest\022\017\n\007node_id\030\001 \002(\005\022\r\n\005store\030\002 \002(" + + "\t\022*\n\006filter\030\003 \001(\0132\032.voldemort.VoldemortF" + + "ilter\0227\n\024replica_to_partition\030\004 \003(\0132\031.vo" + + "ldemort.PartitionTuple\022\027\n\017initial_cluste" + + "r\030\005 \001(\t\022\020\n\010optimize\030\006 \001(\010\"1\n\033AsyncOperat" + + "ionStatusRequest\022\022\n\nrequest_id\030\001 \002(\005\"/\n\031", + "AsyncOperationStopRequest\022\022\n\nrequest_id\030" + + "\001 \002(\005\"=\n\032AsyncOperationStopResponse\022\037\n\005e" + + "rror\030\001 \001(\0132\020.voldemort.Error\"2\n\031AsyncOpe" + + "rationListRequest\022\025\n\rshow_complete\030\002 \002(\010" + + "\"R\n\032AsyncOperationListResponse\022\023\n\013reques" + + "t_ids\030\001 \003(\005\022\037\n\005error\030\002 \001(\0132\020.voldemort.E" + + "rror\":\n\016PartitionTuple\022\024\n\014replica_type\030\001" + + " \002(\005\022\022\n\npartitions\030\002 \003(\005\"e\n\026PerStorePart" + + "itionTuple\022\022\n\nstore_name\030\001 \002(\t\0227\n\024replic" + + "a_to_partition\030\002 \003(\0132\031.voldemort.Partiti", + "onTuple\"\370\001\n\031RebalancePartitionInfoMap\022\022\n" + + "\nstealer_id\030\001 \002(\005\022\020\n\010donor_id\030\002 \002(\005\022\017\n\007a" + + "ttempt\030\003 \002(\005\022C\n\030replica_to_add_partition" + + "\030\004 \003(\0132!.voldemort.PerStorePartitionTupl" + + "e\022F\n\033replica_to_delete_partition\030\005 \003(\0132!" + + ".voldemort.PerStorePartitionTuple\022\027\n\017ini" + + "tial_cluster\030\006 \002(\t\"f\n\034InitiateRebalanceN" + + "odeRequest\022F\n\030rebalance_partition_info\030\001" + + " \002(\0132$.voldemort.RebalancePartitionInfoM" + + "ap\"m\n#InitiateRebalanceNodeOnDonorReques", + "t\022F\n\030rebalance_partition_info\030\001 \003(\0132$.vo" + + "ldemort.RebalancePartitionInfoMap\"\212\001\n\034As" + + "yncOperationStatusResponse\022\022\n\nrequest_id" + + "\030\001 \001(\005\022\023\n\013description\030\002 \001(\t\022\016\n\006status\030\003 " + + "\001(\t\022\020\n\010complete\030\004 \001(\010\022\037\n\005error\030\005 \001(\0132\020.v" + + "oldemort.Error\"\'\n\026TruncateEntriesRequest" + + "\022\r\n\005store\030\001 \002(\t\":\n\027TruncateEntriesRespon" + + "se\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"*\n\017A" + + "ddStoreRequest\022\027\n\017storeDefinition\030\001 \002(\t\"" + + "3\n\020AddStoreResponse\022\037\n\005error\030\001 \001(\0132\020.vol", + "demort.Error\"\'\n\022DeleteStoreRequest\022\021\n\tst" + + "oreName\030\001 \002(\t\"6\n\023DeleteStoreResponse\022\037\n\005" + + "error\030\001 \001(\0132\020.voldemort.Error\"P\n\021FetchSt" + + "oreRequest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\tstore_" + + "dir\030\002 \002(\t\022\024\n\014push_version\030\003 \001(\003\"9\n\020SwapS" + + "toreRequest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\tstore" + + "_dir\030\002 \002(\t\"P\n\021SwapStoreResponse\022\037\n\005error" + + "\030\001 \001(\0132\020.voldemort.Error\022\032\n\022previous_sto" + + "re_dir\030\002 \001(\t\"@\n\024RollbackStoreRequest\022\022\n\n" + + "store_name\030\001 \002(\t\022\024\n\014push_version\030\002 \002(\003\"8", + "\n\025RollbackStoreResponse\022\037\n\005error\030\001 \001(\0132\020" + + ".voldemort.Error\"&\n\020RepairJobRequest\022\022\n\n" + + "store_name\030\001 \001(\t\"4\n\021RepairJobResponse\022\037\n" + + "\005error\030\001 \001(\0132\020.voldemort.Error\"=\n\024ROStor" + + 
"eVersionDirMap\022\022\n\nstore_name\030\001 \002(\t\022\021\n\tst" + + "ore_dir\030\002 \002(\t\"/\n\031GetROMaxVersionDirReque" + + "st\022\022\n\nstore_name\030\001 \003(\t\"y\n\032GetROMaxVersio" + + "nDirResponse\022:\n\021ro_store_versions\030\001 \003(\0132" + + "\037.voldemort.ROStoreVersionDirMap\022\037\n\005erro" + + "r\030\002 \001(\0132\020.voldemort.Error\"3\n\035GetROCurren", + "tVersionDirRequest\022\022\n\nstore_name\030\001 \003(\t\"}" + + "\n\036GetROCurrentVersionDirResponse\022:\n\021ro_s" + + "tore_versions\030\001 \003(\0132\037.voldemort.ROStoreV" + + "ersionDirMap\022\037\n\005error\030\002 \001(\0132\020.voldemort." + + "Error\"/\n\031GetROStorageFormatRequest\022\022\n\nst" + + "ore_name\030\001 \003(\t\"y\n\032GetROStorageFormatResp" + + "onse\022:\n\021ro_store_versions\030\001 \003(\0132\037.voldem" + + "ort.ROStoreVersionDirMap\022\037\n\005error\030\002 \001(\0132" + + "\020.voldemort.Error\"@\n\027FailedFetchStoreReq" + + "uest\022\022\n\nstore_name\030\001 \002(\t\022\021\n\tstore_dir\030\002 ", + "\002(\t\";\n\030FailedFetchStoreResponse\022\037\n\005error" + + "\030\001 \001(\0132\020.voldemort.Error\"\375\001\n\033RebalanceSt" + + "ateChangeRequest\022K\n\035rebalance_partition_" + + "info_list\030\001 \003(\0132$.voldemort.RebalancePar" + + "titionInfoMap\022\026\n\016cluster_string\030\002 \002(\t\022\025\n" + + "\rstores_string\030\003 \002(\t\022\017\n\007swap_ro\030\004 \002(\010\022\037\n" + + "\027change_cluster_metadata\030\005 \002(\010\022\036\n\026change" + + "_rebalance_state\030\006 \002(\010\022\020\n\010rollback\030\007 \002(\010" + + "\"?\n\034RebalanceStateChangeResponse\022\037\n\005erro" + + "r\030\001 \001(\0132\020.voldemort.Error\"G\n DeleteStore", + "RebalanceStateRequest\022\022\n\nstore_name\030\001 \002(" + + "\t\022\017\n\007node_id\030\002 \002(\005\"D\n!DeleteStoreRebalan" + + "ceStateResponse\022\037\n\005error\030\001 \001(\0132\020.voldemo" + + "rt.Error\"h\n\023NativeBackupRequest\022\022\n\nstore" + + "_name\030\001 \002(\t\022\022\n\nbackup_dir\030\002 \002(\t\022\024\n\014verif" + + "y_files\030\003 \002(\010\022\023\n\013incremental\030\004 \002(\010\">\n\024Re" + + "serveMemoryRequest\022\022\n\nstore_name\030\001 \002(\t\022\022" + + "\n\nsize_in_mb\030\002 \002(\003\"8\n\025ReserveMemoryRespo" + + "nse\022\037\n\005error\030\001 \001(\0132\020.voldemort.Error\"\360\016\n" + + "\025VoldemortAdminRequest\022)\n\004type\030\001 \002(\0162\033.v", + "oldemort.AdminRequestType\0223\n\014get_metadat" + + "a\030\002 \001(\0132\035.voldemort.GetMetadataRequest\0229" + + "\n\017update_metadata\030\003 \001(\0132 .voldemort.Upda" + + "teMetadataRequest\022J\n\030update_partition_en" + + "tries\030\004 \001(\0132(.voldemort.UpdatePartitionE" + + "ntriesRequest\022H\n\027fetch_partition_entries" + + "\030\005 \001(\0132\'.voldemort.FetchPartitionEntries" + + "Request\022J\n\030delete_partition_entries\030\006 \001(" + + "\0132(.voldemort.DeletePartitionEntriesRequ" + + "est\022K\n\031initiate_fetch_and_update\030\007 \001(\0132(", + ".voldemort.InitiateFetchAndUpdateRequest" + + "\022F\n\026async_operation_status\030\010 \001(\0132&.volde" + + "mort.AsyncOperationStatusRequest\022H\n\027init" + + "iate_rebalance_node\030\t \001(\0132\'.voldemort.In" + + "itiateRebalanceNodeRequest\022B\n\024async_oper" + + "ation_stop\030\n \001(\0132$.voldemort.AsyncOperat" + + "ionStopRequest\022B\n\024async_operation_list\030\013" + + " \001(\0132$.voldemort.AsyncOperationListReque" + + "st\022;\n\020truncate_entries\030\014 
\001(\0132!.voldemort" + + ".TruncateEntriesRequest\022-\n\tadd_store\030\r \001", + "(\0132\032.voldemort.AddStoreRequest\0223\n\014delete" + + "_store\030\016 \001(\0132\035.voldemort.DeleteStoreRequ" + + "est\0221\n\013fetch_store\030\017 \001(\0132\034.voldemort.Fet" + + "chStoreRequest\022/\n\nswap_store\030\020 \001(\0132\033.vol" + + "demort.SwapStoreRequest\0227\n\016rollback_stor" + + "e\030\021 \001(\0132\037.voldemort.RollbackStoreRequest" + + "\022D\n\026get_ro_max_version_dir\030\022 \001(\0132$.volde" + + "mort.GetROMaxVersionDirRequest\022L\n\032get_ro" + + "_current_version_dir\030\023 \001(\0132(.voldemort.G" + + "etROCurrentVersionDirRequest\022D\n\025fetch_pa", + "rtition_files\030\024 \001(\0132%.voldemort.FetchPar" + + "titionFilesRequest\022@\n\023update_slop_entrie" + + "s\030\026 \001(\0132#.voldemort.UpdateSlopEntriesReq" + + "uest\022>\n\022failed_fetch_store\030\030 \001(\0132\".volde" + + "mort.FailedFetchStoreRequest\022C\n\025get_ro_s" + + "torage_format\030\031 \001(\0132$.voldemort.GetROSto" + + "rageFormatRequest\022F\n\026rebalance_state_cha" + + "nge\030\032 \001(\0132&.voldemort.RebalanceStateChan" + + "geRequest\022/\n\nrepair_job\030\033 \001(\0132\033.voldemor" + + "t.RepairJobRequest\022X\n initiate_rebalance", + "_node_on_donor\030\034 \001(\0132..voldemort.Initiat" + + "eRebalanceNodeOnDonorRequest\022Q\n\034delete_s" + + "tore_rebalance_state\030\035 \001(\0132+.voldemort.D" + + "eleteStoreRebalanceStateRequest\0225\n\rnativ" + + "e_backup\030\036 \001(\0132\036.voldemort.NativeBackupR" + + "equest\0227\n\016reserve_memory\030\037 \001(\0132\037.voldemo" + + "rt.ReserveMemoryRequest*\310\005\n\020AdminRequest" + + "Type\022\020\n\014GET_METADATA\020\000\022\023\n\017UPDATE_METADAT" + + "A\020\001\022\034\n\030UPDATE_PARTITION_ENTRIES\020\002\022\033\n\027FET" + + "CH_PARTITION_ENTRIES\020\003\022\034\n\030DELETE_PARTITI", + "ON_ENTRIES\020\004\022\035\n\031INITIATE_FETCH_AND_UPDAT" + + "E\020\005\022\032\n\026ASYNC_OPERATION_STATUS\020\006\022\033\n\027INITI" + + "ATE_REBALANCE_NODE\020\007\022\030\n\024ASYNC_OPERATION_" + + "STOP\020\010\022\030\n\024ASYNC_OPERATION_LIST\020\t\022\024\n\020TRUN" + + "CATE_ENTRIES\020\n\022\r\n\tADD_STORE\020\013\022\020\n\014DELETE_" + + "STORE\020\014\022\017\n\013FETCH_STORE\020\r\022\016\n\nSWAP_STORE\020\016" + + "\022\022\n\016ROLLBACK_STORE\020\017\022\032\n\026GET_RO_MAX_VERSI" + + "ON_DIR\020\020\022\036\n\032GET_RO_CURRENT_VERSION_DIR\020\021" + + "\022\031\n\025FETCH_PARTITION_FILES\020\022\022\027\n\023UPDATE_SL" + + "OP_ENTRIES\020\024\022\026\n\022FAILED_FETCH_STORE\020\026\022\031\n\025", + "GET_RO_STORAGE_FORMAT\020\027\022\032\n\026REBALANCE_STA" + + "TE_CHANGE\020\030\022\016\n\nREPAIR_JOB\020\031\022$\n INITIATE_" + + "REBALANCE_NODE_ON_DONOR\020\032\022 \n\034DELETE_STOR" + + "E_REBALANCE_STATE\020\033\022\021\n\rNATIVE_BACKUP\020\034\022\022" + + "\n\016RESERVE_MEMORY\020\035B-\n\034voldemort.client.p" + + "rotocol.pbB\013VAdminProtoH\001" }; com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { @@ -23360,7 +23383,7 @@ public com.google.protobuf.ExtensionRegistry assignDescriptors( internal_static_voldemort_UpdateMetadataRequest_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_voldemort_UpdateMetadataRequest_descriptor, - new java.lang.String[] { "Key", "Versioned", }, + new java.lang.String[] { "MetadataEntry", }, 
voldemort.client.protocol.pb.VAdminProto.UpdateMetadataRequest.class, voldemort.client.protocol.pb.VAdminProto.UpdateMetadataRequest.Builder.class); internal_static_voldemort_UpdateMetadataResponse_descriptor = @@ -23744,7 +23767,7 @@ public com.google.protobuf.ExtensionRegistry assignDescriptors( internal_static_voldemort_RebalanceStateChangeRequest_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable( internal_static_voldemort_RebalanceStateChangeRequest_descriptor, - new java.lang.String[] { "RebalancePartitionInfoList", "ClusterString", "SwapRo", "ChangeClusterMetadata", "ChangeRebalanceState", "Rollback", }, + new java.lang.String[] { "RebalancePartitionInfoList", "ClusterString", "StoresString", "SwapRo", "ChangeClusterMetadata", "ChangeRebalanceState", "Rollback", }, voldemort.client.protocol.pb.VAdminProto.RebalanceStateChangeRequest.class, voldemort.client.protocol.pb.VAdminProto.RebalanceStateChangeRequest.Builder.class); internal_static_voldemort_RebalanceStateChangeResponse_descriptor = diff --git a/src/java/voldemort/client/rebalance/RebalanceController.java b/src/java/voldemort/client/rebalance/RebalanceController.java index fb52897487..b2789d5202 100644 --- a/src/java/voldemort/client/rebalance/RebalanceController.java +++ b/src/java/voldemort/client/rebalance/RebalanceController.java @@ -30,6 +30,7 @@ import voldemort.VoldemortException; import voldemort.client.ClientConfig; +import voldemort.client.protocol.RequestFormatType; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.rebalance.task.DonorBasedRebalanceTask; import voldemort.client.rebalance.task.RebalanceTask; @@ -58,7 +59,9 @@ public class RebalanceController { private final RebalanceClientConfig rebalanceConfig; public RebalanceController(String bootstrapUrl, RebalanceClientConfig rebalanceConfig) { - this.adminClient = new AdminClient(bootstrapUrl, rebalanceConfig, new ClientConfig()); + this.adminClient = new AdminClient(bootstrapUrl, + rebalanceConfig, + new ClientConfig().setRequestFormatType(RequestFormatType.PROTOCOL_BUFFERS)); this.rebalanceConfig = rebalanceConfig; } @@ -386,6 +389,7 @@ private void rebalancePerPartitionTransition(final OrderedClusterTransition orde // Flatten the node plans to partition plans List rebalancePartitionPlanList = rebalancePartitionsInfoList; + List allStoreDefs = orderedClusterTransition.getStoreDefs(); // Split the store definitions List readOnlyStoreDefs = StoreDefinitionUtils.filterStores(orderedClusterTransition.getStoreDefs(), true); @@ -400,9 +404,14 @@ private void rebalancePerPartitionTransition(final OrderedClusterTransition orde List filteredRebalancePartitionPlanList = RebalanceUtils.filterPartitionPlanWithStores(rebalancePartitionPlanList, readOnlyStoreDefs); + // TODO this method right now takes just the source stores definition + // the 2nd argument needs to be fixed + // ATTENTION JAY rebalanceStateChange(orderedClusterTransition.getId(), orderedClusterTransition.getCurrentCluster(), orderedClusterTransition.getTargetCluster(), + allStoreDefs, + allStoreDefs, filteredRebalancePartitionPlanList, hasReadOnlyStores, hasReadWriteStores, @@ -423,9 +432,14 @@ private void rebalancePerPartitionTransition(final OrderedClusterTransition orde filteredRebalancePartitionPlanList = RebalanceUtils.filterPartitionPlanWithStores(rebalancePartitionPlanList, readWriteStoreDefs); + // TODO this method right now takes just the source stores definition + // the 2nd argument needs to be fixed + // ATTENTION JAY
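+ // Note: until the target store definitions are plumbed through, the call + // below passes the same allStoreDefs list for both store-definition + // arguments (see the TODO above).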
rebalanceStateChange(orderedClusterTransition.getId(), orderedClusterTransition.getCurrentCluster(), orderedClusterTransition.getTargetCluster(), + allStoreDefs, + allStoreDefs, filteredRebalancePartitionPlanList, hasReadOnlyStores, hasReadWriteStores, @@ -460,6 +474,8 @@ private void rebalancePerPartitionTransition(final OrderedClusterTransition orde } /** + * TODO JAY -- This interface expects the source stores definition and + * target store definitions * * Perform a group of state change actions. Also any errors + rollback * procedures are performed at this level itself. @@ -490,6 +506,8 @@ private void rebalancePerPartitionTransition(final OrderedClusterTransition orde private void rebalanceStateChange(final int taskId, Cluster currentCluster, Cluster transitionCluster, + List existingStoreDefs, + List targetStoreDefs, List rebalancePartitionPlanList, boolean hasReadOnlyStores, boolean hasReadWriteStores, @@ -511,6 +529,8 @@ private void rebalanceStateChange(final int taskId, if(!rebalanceConfig.isShowPlanEnabled()) adminClient.rebalanceOps.rebalanceStateChange(currentCluster, transitionCluster, + existingStoreDefs, + targetStoreDefs, rebalancePartitionPlanList, false, true, @@ -523,6 +543,8 @@ private void rebalanceStateChange(final int taskId, if(!rebalanceConfig.isShowPlanEnabled()) adminClient.rebalanceOps.rebalanceStateChange(currentCluster, transitionCluster, + existingStoreDefs, + targetStoreDefs, rebalancePartitionPlanList, false, false, @@ -535,6 +557,8 @@ private void rebalanceStateChange(final int taskId, if(!rebalanceConfig.isShowPlanEnabled()) adminClient.rebalanceOps.rebalanceStateChange(currentCluster, transitionCluster, + existingStoreDefs, + targetStoreDefs, rebalancePartitionPlanList, true, true, @@ -549,6 +573,8 @@ private void rebalanceStateChange(final int taskId, if(!rebalanceConfig.isShowPlanEnabled()) adminClient.rebalanceOps.rebalanceStateChange(currentCluster, transitionCluster, + existingStoreDefs, + targetStoreDefs, rebalancePartitionPlanList, true, true, @@ -676,8 +702,15 @@ private void rebalancePerTaskTransition(final int taskId, if(hasReadOnlyStores && hasReadWriteStores && finishedReadOnlyStores) { // Case 0 - adminClient.rebalanceOps.rebalanceStateChange(null, - currentCluster, + + // TODO this method right now takes just the source stores + // definition + // the 2nd argument needs to be fixed + // ATTENTION JAY + adminClient.rebalanceOps.rebalanceStateChange(null, currentCluster, null, null, // pass + // current + // store + // def null, true, true, @@ -686,8 +719,15 @@ private void rebalancePerTaskTransition(final int taskId, false); } else if(hasReadWriteStores && finishedReadOnlyStores) { // Case 4 - adminClient.rebalanceOps.rebalanceStateChange(null, - currentCluster, + + // TODO this method right now takes just the source stores + // definition + // the 2nd argument needs to be fixed + // ATTENTION JAY + adminClient.rebalanceOps.rebalanceStateChange(null, currentCluster, null, null, // pass + // current + // store + // def null, false, true, diff --git a/src/java/voldemort/client/rebalance/RebalancePartitionsInfo.java b/src/java/voldemort/client/rebalance/RebalancePartitionsInfo.java index 330a81d021..beb69ef3c0 100644 --- a/src/java/voldemort/client/rebalance/RebalancePartitionsInfo.java +++ b/src/java/voldemort/client/rebalance/RebalancePartitionsInfo.java @@ -131,12 +131,16 @@ public static RebalancePartitionsInfo create(Map map) { List partitionList = Utils.uncheckedCast(map.get(unbalancedStore + "replicaToAddPartitionList" +
Integer.toString(replicaNo))); + // TODO there is a potential NPE hiding here that might fail + // rebalancing tests if(partitionList.size() > 0) replicaToAddPartition.put(replicaNo, partitionList); List deletePartitionList = Utils.uncheckedCast(map.get(unbalancedStore + "replicaToDeletePartitionList" + Integer.toString(replicaNo))); + // TODO there is a potential NPE hiding here that might fail + // rebalancing tests if(deletePartitionList.size() > 0) replicaToDeletePartitionList.put(replicaNo, deletePartitionList); } @@ -156,7 +160,7 @@ public static RebalancePartitionsInfo create(Map map) { attempt); } - public ImmutableMap asMap() { + public synchronized ImmutableMap asMap() { ImmutableMap.Builder builder = new ImmutableMap.Builder(); builder.put("stealerId", stealerId) @@ -199,23 +203,23 @@ public ImmutableMap asMap() { return builder.build(); } - public void setAttempt(int attempt) { + public synchronized void setAttempt(int attempt) { this.attempt = attempt; } - public int getDonorId() { + public synchronized int getDonorId() { return donorId; } - public int getAttempt() { + public synchronized int getAttempt() { return attempt; } - public int getStealerId() { + public synchronized int getStealerId() { return stealerId; } - public Cluster getInitialCluster() { + public synchronized Cluster getInitialCluster() { return initialCluster; } @@ -225,35 +229,35 @@ public Cluster getInitialCluster() { * * @return Set of store names */ - public Set getUnbalancedStoreList() { + public synchronized Set getUnbalancedStoreList() { return storeToReplicaToAddPartitionList.keySet(); } - public HashMap>> getStoreToReplicaToAddPartitionList() { + public synchronized HashMap>> getStoreToReplicaToAddPartitionList() { return this.storeToReplicaToAddPartitionList; } - public HashMap>> getStoreToReplicaToDeletePartitionList() { + public synchronized HashMap>> getStoreToReplicaToDeletePartitionList() { return this.storeToReplicaToDeletePartitionList; } - public HashMap> getReplicaToAddPartitionList(String storeName) { + public synchronized HashMap> getReplicaToAddPartitionList(String storeName) { return this.storeToReplicaToAddPartitionList.get(storeName); } - public HashMap> getReplicaToDeletePartitionList(String storeName) { + public synchronized HashMap> getReplicaToDeletePartitionList(String storeName) { return this.storeToReplicaToDeletePartitionList.get(storeName); } - public void setStoreToReplicaToAddPartitionList(HashMap>> storeToReplicaToAddPartitionList) { + public synchronized void setStoreToReplicaToAddPartitionList(HashMap>> storeToReplicaToAddPartitionList) { this.storeToReplicaToAddPartitionList = storeToReplicaToAddPartitionList; } - public void setStoreToReplicaToDeletePartitionList(HashMap>> storeToReplicaToDeletePartitionList) { + public synchronized void setStoreToReplicaToDeletePartitionList(HashMap>> storeToReplicaToDeletePartitionList) { this.storeToReplicaToDeletePartitionList = storeToReplicaToDeletePartitionList; } - public void removeStore(String storeName) { + public synchronized void removeStore(String storeName) { this.storeToReplicaToAddPartitionList.remove(storeName); this.storeToReplicaToDeletePartitionList.remove(storeName); } @@ -263,7 +267,7 @@ public void removeStore(String storeName) { * * @return List of primary partitions */ - public List getStealMasterPartitions() { + public synchronized List getStealMasterPartitions() { Iterator>> iter = storeToReplicaToAddPartitionList.values() .iterator(); List primaryPartitionsBeingMoved = Lists.newArrayList(); @@ -276,7 +280,7 
@@ public List getStealMasterPartitions() { } @Override - public String toString() { + public synchronized String toString() { StringBuffer sb = new StringBuffer(); sb.append("\nRebalancePartitionsInfo(" + getStealerId() + " [" + initialCluster.getNodeById(getStealerId()).getHost() + "] <--- " + getDonorId() @@ -307,7 +311,7 @@ public String toString() { return sb.toString(); } - public String toJsonString() { + public synchronized String toJsonString() { Map map = asMap(); StringWriter writer = new StringWriter(); @@ -317,7 +321,7 @@ public String toJsonString() { } @Override - public boolean equals(Object o) { + public synchronized boolean equals(Object o) { if(this == o) return true; if(o == null || getClass() != o.getClass()) @@ -344,7 +348,7 @@ public boolean equals(Object o) { } @Override - public int hashCode() { + public synchronized int hashCode() { int result = stealerId; result = 31 * result + donorId; result = 31 * result + initialCluster.hashCode(); diff --git a/src/java/voldemort/cluster/Zone.java b/src/java/voldemort/cluster/Zone.java index 047aa1f35a..874db6e876 100644 --- a/src/java/voldemort/cluster/Zone.java +++ b/src/java/voldemort/cluster/Zone.java @@ -18,7 +18,7 @@ public Zone(int zoneId, LinkedList proximityList) { } public Zone() { - this.zoneId = 0; + this.zoneId = DEFAULT_ZONE_ID; this.proximityList = new LinkedList(); } diff --git a/src/java/voldemort/coordinator/CoordinatorConfig.java b/src/java/voldemort/coordinator/CoordinatorConfig.java index 670f278b6d..2f2b32f1b1 100644 --- a/src/java/voldemort/coordinator/CoordinatorConfig.java +++ b/src/java/voldemort/coordinator/CoordinatorConfig.java @@ -1,3 +1,19 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + package voldemort.coordinator; import java.io.BufferedInputStream; @@ -18,18 +34,16 @@ public class CoordinatorConfig { private volatile List bootstrapURLs = null; private volatile String fatClientConfigPath = null; - private volatile int fatClientWrapperMaxPoolSize = 20; - private volatile int fatClientWrapperCorePoolSize = 20; - private volatile int fatClientWrapperKeepAliveInSecs = 60; private volatile int metadataCheckIntervalInMs = 5000; + private volatile int nettyServerPort = 8080; + private volatile int nettyServerBacklog = 1000; /* Property names for property-based configuration */ public static final String BOOTSTRAP_URLS_PROPERTY = "bootstrap_urls"; public static final String FAT_CLIENTS_CONFIG_FILE_PATH_PROPERTY = "fat_clients_config_file_path"; - public static final String FAT_CLIENT_WRAPPER_MAX_POOL_SIZE_PROPERTY = "fat_client_wrapper_max_pool_size"; - public static final String FAT_CLIENT_WRAPPER_CORE_POOL_SIZE_PROPERTY = "fat_client_wrapper_core_pool_size"; - public static final String FAT_CLIENT_WRAPPER_POOL_KEEPALIVE_IN_SECS = "fat_client_wrapper_pool_keepalive_in_secs"; public static final String METADATA_CHECK_INTERVAL_IN_MS = "metadata_check_interval_in_ms"; + public static final String NETTY_SERVER_PORT = "netty_server_port"; + public static final String NETTY_SERVER_BACKLOG = "netty_server_backlog"; /** * Instantiate the coordinator config using a properties file @@ -61,6 +75,17 @@ public CoordinatorConfig(Properties properties) { setProperties(properties); } + /** + * Dummy constructor for testing purposes + */ + public CoordinatorConfig() {} + + /** + * Set the values using the specified Properties object + * + * @param properties Properties object containing specific property values + * for the Coordinator config + */ private void setProperties(Properties properties) { Props props = new Props(properties); if(props.containsKey(BOOTSTRAP_URLS_PROPERTY)) { @@ -71,24 +96,17 @@ private void setProperties(Properties properties) { setFatClientConfigPath(props.getString(FAT_CLIENTS_CONFIG_FILE_PATH_PROPERTY)); } - if(props.containsKey(FAT_CLIENT_WRAPPER_CORE_POOL_SIZE_PROPERTY)) { - setFatClientWrapperCorePoolSize(props.getInt(FAT_CLIENT_WRAPPER_CORE_POOL_SIZE_PROPERTY, - this.fatClientWrapperCorePoolSize)); - } - - if(props.containsKey(FAT_CLIENT_WRAPPER_MAX_POOL_SIZE_PROPERTY)) { - setFatClientWrapperMaxPoolSize(props.getInt(FAT_CLIENT_WRAPPER_MAX_POOL_SIZE_PROPERTY, - this.fatClientWrapperMaxPoolSize)); + if(props.containsKey(METADATA_CHECK_INTERVAL_IN_MS)) { + setMetadataCheckIntervalInMs(props.getInt(METADATA_CHECK_INTERVAL_IN_MS, + this.metadataCheckIntervalInMs)); } - if(props.containsKey(FAT_CLIENT_WRAPPER_POOL_KEEPALIVE_IN_SECS)) { - setFatClientWrapperKeepAliveInSecs(props.getInt(FAT_CLIENT_WRAPPER_POOL_KEEPALIVE_IN_SECS, - this.fatClientWrapperKeepAliveInSecs)); + if(props.containsKey(NETTY_SERVER_PORT)) { + setServerPort(props.getInt(NETTY_SERVER_PORT, this.nettyServerPort)); } - if(props.containsKey(METADATA_CHECK_INTERVAL_IN_MS)) { - setMetadataCheckIntervalInMs(props.getInt(METADATA_CHECK_INTERVAL_IN_MS, - this.metadataCheckIntervalInMs)); + if(props.containsKey(NETTY_SERVER_BACKLOG)) { + setNettyServerBacklog(props.getInt(NETTY_SERVER_BACKLOG, this.nettyServerBacklog)); } } @@ -98,6 +116,14 @@ public String[] getBootstrapURLs() { return this.bootstrapURLs.toArray(new String[this.bootstrapURLs.size()]); } + /** + * Sets the bootstrap URLs used by the different Fat clients inside the + * Coordinator + * + * @param bootstrapUrls
list of bootstrap URLs defining which cluster to + * connect to + * @return the updated CoordinatorConfig (for method chaining) + */ public CoordinatorConfig setBootstrapURLs(List bootstrapUrls) { this.bootstrapURLs = Utils.notNull(bootstrapUrls); if(this.bootstrapURLs.size() <= 0) @@ -109,40 +135,52 @@ public String getFatClientConfigPath() { return fatClientConfigPath; } + /** + * Defines individual config for each of the fat clients managed by the + * Coordinator + * + * @param fatClientConfigPath The path of the file containing the fat client + * config in Avro format + */ public void setFatClientConfigPath(String fatClientConfigPath) { this.fatClientConfigPath = fatClientConfigPath; } - public int getFatClientWrapperMaxPoolSize() { - return fatClientWrapperMaxPoolSize; - } - - public void setFatClientWrapperMaxPoolSize(int fatClientWrapperMaxPoolSize) { - this.fatClientWrapperMaxPoolSize = fatClientWrapperMaxPoolSize; - } - - public int getFatClientWrapperCorePoolSize() { - return fatClientWrapperCorePoolSize; + public int getMetadataCheckIntervalInMs() { + return metadataCheckIntervalInMs; } - public void setFatClientWrapperCorePoolSize(int fatClientWrapperCorePoolSize) { - this.fatClientWrapperCorePoolSize = fatClientWrapperCorePoolSize; + /** + * @param metadataCheckIntervalInMs Defines the frequency with which to + * check for updates in the cluster metadata (e.g. cluster.xml and + * stores.xml) + */ + public void setMetadataCheckIntervalInMs(int metadataCheckIntervalInMs) { + this.metadataCheckIntervalInMs = metadataCheckIntervalInMs; } - public int getFatClientWrapperKeepAliveInSecs() { - return fatClientWrapperKeepAliveInSecs; + public int getServerPort() { + return nettyServerPort; } - public void setFatClientWrapperKeepAliveInSecs(int fatClientWrapperKeepAliveInSecs) { - this.fatClientWrapperKeepAliveInSecs = fatClientWrapperKeepAliveInSecs; + /** + * @param serverPort Defines the port to use while bootstrapping the Netty + * server + */ + public void setServerPort(int serverPort) { + this.nettyServerPort = serverPort; } - public int getMetadataCheckIntervalInMs() { - return metadataCheckIntervalInMs; + public int getNettyServerBacklog() { + return nettyServerBacklog; } - public void setMetadataCheckIntervalInMs(int metadataCheckIntervalInMs) { - this.metadataCheckIntervalInMs = metadataCheckIntervalInMs; + /** + * @param nettyServerBacklog Defines the netty server backlog value + * + */ + public void setNettyServerBacklog(int nettyServerBacklog) { + this.nettyServerBacklog = nettyServerBacklog; + } } diff --git a/src/java/voldemort/coordinator/CoordinatorErrorStats.java b/src/java/voldemort/coordinator/CoordinatorErrorStats.java new file mode 100644 index 0000000000..96a55ca917 --- /dev/null +++ b/src/java/voldemort/coordinator/CoordinatorErrorStats.java @@ -0,0 +1,52 @@ +package voldemort.coordinator; + +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.atomic.AtomicLong; + +import voldemort.VoldemortException; +import voldemort.annotations.jmx.JmxGetter; +import voldemort.store.InsufficientOperationalNodesException; +import voldemort.store.InsufficientZoneResponsesException; +import voldemort.store.InvalidMetadataException; +import voldemort.store.routed.PipelineRoutedStats; + +/** + * Class to keep track of all the errors in the Coordinator service + * + */ +public class CoordinatorErrorStats extends PipelineRoutedStats { + + CoordinatorErrorStats() { + super(); + this.errCountMap.put(RejectedExecutionException.class, new AtomicLong(0)); + this.errCountMap.put(IllegalArgumentException.class,
new AtomicLong(0)); + this.errCountMap.put(VoldemortException.class, new AtomicLong(0)); + } + + @Override + public boolean isSevere(Exception ve) { + if(ve instanceof InsufficientOperationalNodesException + || ve instanceof InsufficientZoneResponsesException + || ve instanceof InvalidMetadataException || ve instanceof RejectedExecutionException + || ve instanceof IllegalArgumentException || ve instanceof VoldemortException) + return true; + else + return false; + } + + @JmxGetter(name = "numRejectedExecutionExceptions", description = "Number of rejected tasks by the Fat client") + public long getNumRejectedExecutionExceptions() { + return errCountMap.get(RejectedExecutionException.class).get(); + } + + @JmxGetter(name = "numIllegalArgumentExceptions", description = "Number of bad requests received by the Coordinator") + public long getNumIllegalArgumentExceptions() { + return errCountMap.get(IllegalArgumentException.class).get(); + } + + @JmxGetter(name = "numVoldemortExceptions", description = "Number of failed Voldemort operations") + public long getNumVoldemortExceptions() { + return errCountMap.get(VoldemortException.class).get(); + } + +} diff --git a/src/java/voldemort/coordinator/CoordinatorPipelineFactory.java b/src/java/voldemort/coordinator/CoordinatorPipelineFactory.java index 1646f02130..0252539a57 100644 --- a/src/java/voldemort/coordinator/CoordinatorPipelineFactory.java +++ b/src/java/voldemort/coordinator/CoordinatorPipelineFactory.java @@ -36,9 +36,13 @@ public class CoordinatorPipelineFactory implements ChannelPipelineFactory { private boolean noop = false; private Map fatClientMap; + private CoordinatorErrorStats errorStats = null; - public CoordinatorPipelineFactory(Map fatClientMap, boolean noop) { + public CoordinatorPipelineFactory(Map fatClientMap, + CoordinatorErrorStats errorStats, + boolean noop) { this.fatClientMap = fatClientMap; + this.errorStats = errorStats; this.noop = noop; } @@ -56,7 +60,8 @@ public ChannelPipeline getPipeline() throws Exception { if(this.noop) { pipeline.addLast("handler", new NoopHttpRequestHandler()); } else { - pipeline.addLast("handler", new VoldemortHttpRequestHandler(this.fatClientMap)); + pipeline.addLast("handler", new VoldemortHttpRequestHandler(this.fatClientMap, + this.errorStats)); } return pipeline; } diff --git a/src/java/voldemort/coordinator/CoordinatorService.java b/src/java/voldemort/coordinator/CoordinatorService.java index a6a8a508bc..bede057a2e 100644 --- a/src/java/voldemort/coordinator/CoordinatorService.java +++ b/src/java/voldemort/coordinator/CoordinatorService.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -31,6 +31,7 @@ import java.util.Properties; import java.util.concurrent.Callable; import java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; @@ -40,8 +41,11 @@ import org.apache.commons.io.IOUtils; import org.apache.log4j.Logger; import org.jboss.netty.bootstrap.ServerBootstrap; +import org.jboss.netty.channel.Channel; import org.jboss.netty.channel.socket.nio.NioServerSocketChannelFactory; +import voldemort.annotations.jmx.JmxGetter; +import voldemort.annotations.jmx.JmxManaged; import voldemort.client.ClientConfig; import voldemort.client.SocketStoreClientFactory; import voldemort.client.SystemStoreRepository; @@ -52,6 +56,9 @@ import voldemort.server.VoldemortServer; import voldemort.store.StoreDefinition; import voldemort.store.metadata.MetadataStore; +import voldemort.store.stats.StoreStats; +import voldemort.store.stats.Tracked; +import voldemort.utils.JmxUtils; import voldemort.utils.SystemTime; import voldemort.utils.Utils; import voldemort.xml.StoreDefinitionsMapper; @@ -63,24 +70,33 @@ * clients and invokes the corresponding Fat client API. * */ +@JmxManaged(description = "A Coordinator Service for proxying Voldemort HTTP requests") public class CoordinatorService extends AbstractService { - private CoordinatorConfig config = null; + private CoordinatorConfig coordinatorConfig = null; - public CoordinatorService(CoordinatorConfig config) { - super(ServiceType.COORDINATOR); - this.config = config; - } - - private static boolean noop = false; - private static SocketStoreClientFactory storeClientFactory = null; - private static AsyncMetadataVersionManager asyncMetadataManager = null; - private static SchedulerService schedulerService = null; + private boolean noop = false; + private SocketStoreClientFactory storeClientFactory = null; + private AsyncMetadataVersionManager asyncMetadataManager = null; + private SchedulerService schedulerService = null; private static final Logger logger = Logger.getLogger(CoordinatorService.class); - private static Map fatClientMap = null; + private Map fatClientMap = null; public final static Schema CLIENT_CONFIGS_AVRO_SCHEMA = Schema.parse("{ \"name\": \"clientConfigs\", \"type\":\"array\"," + "\"items\": { \"name\": \"clientConfig\", \"type\": \"map\", \"values\":\"string\" }}}"); private static final String STORE_NAME_KEY = "store_name"; + protected ThreadPoolExecutor workerPool = null; + private final CoordinatorErrorStats errorStats; + private final StoreStats coordinatorPerfStats; + private ServerBootstrap bootstrap = null; + private Channel nettyServerChannel = null; + + public CoordinatorService(CoordinatorConfig config) { + super(ServiceType.COORDINATOR); + this.coordinatorConfig = config; + this.coordinatorPerfStats = new StoreStats(); + this.errorStats = new CoordinatorErrorStats(); + RESTErrorHandler.setErrorStatsHandler(errorStats); + } /** * Initializes all the Fat clients (1 per store) for the cluster that this @@ -97,8 +113,8 @@ private void initializeFatClients() { List storeDefList = storeMapper.readStoreList(new StringReader(storesXml), false); - Map fatClientConfigMap = readClientConfig(this.config.getFatClientConfigPath(), - this.config.getBootstrapURLs()); + Map fatClientConfigMap = readClientConfig(this.coordinatorConfig.getFatClientConfigPath(), + this.coordinatorConfig.getBootstrapURLs()); // For now Simply create the map of store definition to // FatClientWrappers // TODO: After the fat client 
improvements is done, modify this to @@ -111,10 +127,12 @@ private void initializeFatClients() { logger.info("Creating a Fat client wrapper for store: " + storeName); logger.info("Using config: " + fatClientConfigMap.get(storeName)); fatClientMap.put(storeName, new FatClientWrapper(storeName, - this.config, + this.coordinatorConfig, fatClientConfigMap.get(storeName), storesXml, - clusterXml)); + clusterXml, + this.errorStats, + this.coordinatorPerfStats)); } } @@ -123,7 +141,7 @@ protected void startInner() { // Initialize the Voldemort Metadata ClientConfig clientConfig = new ClientConfig(); - clientConfig.setBootstrapUrls(this.config.getBootstrapURLs()); + clientConfig.setBootstrapUrls(this.coordinatorConfig.getBootstrapURLs()); storeClientFactory = new SocketStoreClientFactory(clientConfig); initializeFatClients(); @@ -145,6 +163,8 @@ public Void call() throws Exception { }; + // For now track changes in cluster.xml only + // TODO: Modify this to track stores.xml in the future asyncMetadataManager = new AsyncMetadataVersionManager(sysRepository, rebootstrapCallback, null); @@ -153,18 +173,37 @@ public Void call() throws Exception { schedulerService.schedule(asyncMetadataManager.getClass().getName(), asyncMetadataManager, new Date(), - this.config.getMetadataCheckIntervalInMs()); + this.coordinatorConfig.getMetadataCheckIntervalInMs()); // Configure the server. - ServerBootstrap bootstrap = new ServerBootstrap(new NioServerSocketChannelFactory(Executors.newCachedThreadPool(), - Executors.newCachedThreadPool())); - bootstrap.setOption("backlog", 1000); + this.workerPool = (ThreadPoolExecutor) Executors.newCachedThreadPool(); + this.bootstrap = new ServerBootstrap(new NioServerSocketChannelFactory(Executors.newCachedThreadPool(), + workerPool)); + this.bootstrap.setOption("backlog", this.coordinatorConfig.getNettyServerBacklog()); + this.bootstrap.setOption("child.tcpNoDelay", true); + this.bootstrap.setOption("child.keepAlive", true); + this.bootstrap.setOption("child.reuseAddress", true); // Set up the event pipeline factory. - bootstrap.setPipelineFactory(new CoordinatorPipelineFactory(fatClientMap, noop)); + this.bootstrap.setPipelineFactory(new CoordinatorPipelineFactory(this.fatClientMap, + this.errorStats, + noop)); + + // Register the Mbean + // Netty Queue stats + JmxUtils.registerMbean(this, + JmxUtils.createObjectName(JmxUtils.getPackageName(this.getClass()), + JmxUtils.getClassName(this.getClass()))); + + // Error stats Mbean + JmxUtils.registerMbean(this.errorStats, + JmxUtils.createObjectName(JmxUtils.getPackageName(this.errorStats.getClass()), + JmxUtils.getClassName(this.errorStats.getClass()))); // Bind and start to accept incoming connections. 
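+ // bind() returns the server Channel; the reference is kept below so that + // stopInner() can close it on shutdown and release the listening socket.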
- bootstrap.bind(new InetSocketAddress(8080)); + this.nettyServerChannel = this.bootstrap.bind(new InetSocketAddress(this.coordinatorConfig.getServerPort())); + + logger.info("Coordinator service started on port " + this.coordinatorConfig.getServerPort()); } /** @@ -205,14 +244,14 @@ private static Map readClientConfig(String configFilePath, throw new Exception("Illegal Store Name !!!"); } - ClientConfig config = new ClientConfig(props); - config.setBootstrapUrls(bootstrapURLs) - .setEnableCompressionLayer(false) - .setEnableSerializationLayer(false) - .enableDefaultClient(true) - .setEnableLazy(false); + ClientConfig fatClientConfig = new ClientConfig(props); + fatClientConfig.setBootstrapUrls(bootstrapURLs) + .setEnableCompressionLayer(false) + .setEnableSerializationLayer(false) + .enableDefaultClient(true) + .setEnableLazy(false); - storeNameConfigMap.put(storeName, config); + storeNameConfigMap.put(storeName, fatClientConfig); } } @@ -231,7 +270,17 @@ private static Map readClientConfig(String configFilePath, } @Override - protected void stopInner() {} + protected void stopInner() { + if(this.nettyServerChannel != null) { + this.nettyServerChannel.close(); + } + + JmxUtils.unregisterMbean(JmxUtils.createObjectName(JmxUtils.getPackageName(this.getClass()), + JmxUtils.getClassName(this.getClass()))); + + JmxUtils.unregisterMbean(JmxUtils.createObjectName(JmxUtils.getPackageName(this.errorStats.getClass()), + JmxUtils.getClassName(this.errorStats.getClass()))); + } public static void main(String[] args) throws Exception { CoordinatorConfig config = null; @@ -264,4 +313,59 @@ public void run() { } }); } + + @JmxGetter(name = "numberOfActiveThreads", description = "The number of active Netty worker threads.") + public int getNumberOfActiveThreads() { + return this.workerPool.getActiveCount(); + } + + @JmxGetter(name = "numberOfThreads", description = "The total number of Netty worker threads, active and idle.") + public int getNumberOfThreads() { + return this.workerPool.getPoolSize(); + } + + @JmxGetter(name = "queuedRequests", description = "Number of requests in the Netty worker queue waiting to execute.") + public int getQueuedRequests() { + return this.workerPool.getQueue().size(); + } + + @JmxGetter(name = "averageGetCompletionTimeInMs", description = "The avg. time in ms for GET calls to complete.") + public double getAverageGetCompletionTimeInMs() { + return this.coordinatorPerfStats.getAvgTimeInMs(Tracked.GET); + } + + @JmxGetter(name = "averagePutCompletionTimeInMs", description = "The avg. time in ms for PUT calls to complete.") + public double getAveragePutCompletionTimeInMs() { + return this.coordinatorPerfStats.getAvgTimeInMs(Tracked.PUT); + } + + @JmxGetter(name = "averageGetAllCompletionTimeInMs", description = "The avg. time in ms for GET_ALL calls to complete.") + public double getAverageGetAllCompletionTimeInMs() { + return this.coordinatorPerfStats.getAvgTimeInMs(Tracked.GET_ALL); + } + + @JmxGetter(name = "averageDeleteCompletionTimeInMs", description = "The avg.
time in ms for DELETE calls to complete.") + public double getAverageDeleteCompletionTimeInMs() { + return this.coordinatorPerfStats.getAvgTimeInMs(Tracked.DELETE); + } + + @JmxGetter(name = "q99GetLatencyInMs", description = "The 99th percentile latency in ms for GET calls.") + public long getQ99GetLatency() { + return this.coordinatorPerfStats.getQ99LatencyInMs(Tracked.GET); + } + + @JmxGetter(name = "q99PutLatencyInMs", description = "The 99th percentile latency in ms for PUT calls.") + public long getQ99PutLatency() { + return this.coordinatorPerfStats.getQ99LatencyInMs(Tracked.PUT); + } + + @JmxGetter(name = "q99GetAllLatencyInMs", description = "The 99th percentile latency in ms for GET_ALL calls.") + public long getQ99GetAllLatency() { + return this.coordinatorPerfStats.getQ99LatencyInMs(Tracked.GET_ALL); + } + + @JmxGetter(name = "q99DeleteLatencyInMs", description = "The 99th percentile latency in ms for DELETE calls.") + public long getQ99DeleteLatency() { + return this.coordinatorPerfStats.getQ99LatencyInMs(Tracked.DELETE); + } } diff --git a/src/java/voldemort/coordinator/CoordinatorUtils.java b/src/java/voldemort/coordinator/CoordinatorUtils.java new file mode 100644 index 0000000000..d8003ca655 --- /dev/null +++ b/src/java/voldemort/coordinator/CoordinatorUtils.java @@ -0,0 +1,46 @@ +package voldemort.coordinator; + +import org.codehaus.jackson.map.ObjectMapper; + +import voldemort.versioning.VectorClock; + +public class CoordinatorUtils { + + /** + * Function to serialize the given Vector clock into a string. If something + * goes wrong, it returns an empty string. + * + * @param vc The Vector clock to serialize + * @return The string (JSON) version of the specified Vector clock + */ + public static String getSerializedVectorClock(VectorClock vc) { + VectorClockWrapper vcWrapper = new VectorClockWrapper(vc); + ObjectMapper mapper = new ObjectMapper(); + String serializedVC = ""; + try { + serializedVC = mapper.writeValueAsString(vcWrapper); + } catch(Exception e) { + e.printStackTrace(); + } + return serializedVC; + } + + /** + * Function to deserialize the given (JSON) string back into a Vector + * clock. Returns null if the input is null or if deserialization fails. + * + * @param serializedVC The serialized (JSON) form of the Vector clock + * @return The corresponding VectorClock, or null on failure + */ + public static VectorClock deserializeVectorClock(String serializedVC) { + VectorClock vc = null; + + if(serializedVC == null) { + return null; + } + + ObjectMapper mapper = new ObjectMapper(); + + try { + VectorClockWrapper vcWrapper = mapper.readValue(serializedVC, VectorClockWrapper.class); + vc = new VectorClock(vcWrapper.getVersions(), vcWrapper.getTimestamp()); + } catch(Exception e) { + e.printStackTrace(); + } + + return vc; + } +} diff --git a/src/java/voldemort/coordinator/DynamicTimeoutStoreClient.java b/src/java/voldemort/coordinator/DynamicTimeoutStoreClient.java index 17f3d71af5..2c72b3e62a 100644 --- a/src/java/voldemort/coordinator/DynamicTimeoutStoreClient.java +++ b/src/java/voldemort/coordinator/DynamicTimeoutStoreClient.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -29,6 +29,7 @@ import voldemort.store.CompositeVersionedPutVoldemortRequest; import voldemort.store.CompositeVoldemortRequest; import voldemort.store.InvalidMetadataException; +import voldemort.store.Store; import voldemort.store.StoreTimeoutException; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.VectorClock; @@ -42,6 +43,8 @@ * features: 1) Per call timeout facility 2) Ability to disable resolution per * call * + * TODO: Merge this with DefaultStoreClient eventually.
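+ * + * A rough usage sketch (illustrative only -- the request construction is an + * assumption; any CompositeVoldemortRequest carrying a key and a per-call + * timeout in ms will do): + * + * CompositeVoldemortRequest<ByteArray, byte[]> req = ...; // key + timeout + * Versioned<byte[]> value = client.getWithCustomTimeout(req);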
+ * + * @param <K> Type of the Key + * @param <V> Type of the Value */ @@ -68,6 +71,17 @@ public DynamicTimeoutStoreClient(String storeName, bootStrap(clusterXml, storesXml); } + /** + * Dummy constructor for Unit test purposes + * + * @param customStore A custom store object to use for performing the + * operations + */ + public DynamicTimeoutStoreClient(Store customStore) { + this.store = customStore; + this.metadataRefreshAttempts = 1; + } + + // Bootstrap using the given cluster xml and stores xml // The super class bootStrap() method is used to handle the // InvalidMetadataException @@ -76,6 +90,13 @@ public void bootStrap(String customClusterXml, String customStoresXml) { this.store = factory.getRawStore(storeName, null, customStoresXml, customClusterXml, null); } + /** + * Performs a get operation with the specified composite request object + * + * @param requestWrapper A composite request object containing the key (and + * / or default value) and timeout. + * @return The Versioned value corresponding to the key + */ public Versioned getWithCustomTimeout(CompositeVoldemortRequest requestWrapper) { validateTimeout(requestWrapper.getRoutingTimeoutInMs()); for(int attempts = 0; attempts < this.metadataRefreshAttempts; attempts++) { @@ -92,14 +113,21 @@ public Versioned getWithCustomTimeout(CompositeVoldemortRequest request + " metadata refresh attempts failed."); } + /** + * Performs a put operation with the specified composite request object + * + * @param requestWrapper A composite request object containing the key and + * value + * @return Version of the value for the successful put + */ public Version putWithCustomTimeout(CompositeVoldemortRequest requestWrapper) { validateTimeout(requestWrapper.getRoutingTimeoutInMs()); Versioned versioned; long startTime = System.currentTimeMillis(); // We use the full timeout for doing the Get.
In this, we're being - // optimistic that the subsequent put might be faster all the steps - // might finish within the alloted time + // optimistic that the subsequent put might be faster such that all the + // steps might finish within the alloted time versioned = getWithCustomTimeout(requestWrapper); long endTime = System.currentTimeMillis(); @@ -119,6 +147,15 @@ public Version putWithCustomTimeout(CompositeVoldemortRequest requestWrapp (requestWrapper.getRoutingTimeoutInMs() - (endTime - startTime)))); } + /** + * Performs a Versioned put operation with the specified composite request + * object + * + * @param requestWrapper Composite request object containing the key and the + * versioned object + * @return Version of the value for the successful put + * @throws ObsoleteVersionException + */ public Version putVersionedWithCustomTimeout(CompositeVoldemortRequest requestWrapper) throws ObsoleteVersionException { validateTimeout(requestWrapper.getRoutingTimeoutInMs()); @@ -136,6 +173,14 @@ public Version putVersionedWithCustomTimeout(CompositeVoldemortRequest req + " metadata refresh attempts failed."); } + /** + * Performs a get all operation with the specified composite request object + * + * @param requestWrapper Composite request object containing a reference to + * the Iterable keys + * + * @return Map of the keys to the corresponding versioned values + */ public Map> getAllWithCustomTimeout(CompositeVoldemortRequest requestWrapper) { validateTimeout(requestWrapper.getRoutingTimeoutInMs()); Map>> items = null; @@ -161,6 +206,13 @@ public Map> getAllWithCustomTimeout(CompositeVoldemortRequest deleteRequestObject) { validateTimeout(deleteRequestObject.getRoutingTimeoutInMs()); if(deleteRequestObject.getVersion() == null) { @@ -194,7 +246,11 @@ public boolean deleteWithCustomTimeout(CompositeVoldemortRequest deleteReq return store.delete(deleteRequestObject); } - // Make sure that the timeout specified is valid + /** + * Function to check that the timeout specified is valid + * + * @param opTimeoutInMs The specified timeout in milliseconds + */ private void validateTimeout(long opTimeoutInMs) { if(opTimeoutInMs <= 0) { throw new IllegalArgumentException("Illegal parameter: Timeout is too low: " diff --git a/src/java/voldemort/coordinator/FatClientWrapper.java b/src/java/voldemort/coordinator/FatClientWrapper.java index 81f0e08b31..c7944402ec 100644 --- a/src/java/voldemort/coordinator/FatClientWrapper.java +++ b/src/java/voldemort/coordinator/FatClientWrapper.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -16,10 +16,9 @@ package voldemort.coordinator; -import java.util.concurrent.ExecutorService; +import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.RejectedExecutionHandler; -import java.util.concurrent.SynchronousQueue; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; @@ -27,10 +26,14 @@ import org.apache.log4j.Logger; import org.jboss.netty.channel.MessageEvent; +import voldemort.annotations.jmx.JmxGetter; +import voldemort.annotations.jmx.JmxManaged; import voldemort.client.ClientConfig; import voldemort.client.SocketStoreClientFactory; import voldemort.store.CompositeVoldemortRequest; +import voldemort.store.stats.StoreStats; import voldemort.utils.ByteArray; +import voldemort.utils.JmxUtils; /** * A Wrapper class to provide asynchronous API for calling the fat client @@ -38,13 +41,17 @@ * of invoking the Fat Client methods on its own * */ +@JmxManaged(description = "A Wrapper for a Fat client in order to execute requests asynchronously") public class FatClientWrapper { - private ExecutorService fatClientExecutor; + private ThreadPoolExecutor fatClientExecutor; private SocketStoreClientFactory storeClientFactory; private DynamicTimeoutStoreClient dynamicTimeoutClient; - private final CoordinatorConfig config; + private final CoordinatorConfig coordinatorConfig; private final Logger logger = Logger.getLogger(FatClientWrapper.class); + private final String storeName; + private final CoordinatorErrorStats errorStats; + private final StoreStats coordinatorPerfStats; /** * @@ -53,25 +60,27 @@ public class FatClientWrapper { * @param clientConfig The config used to bootstrap the fat client * @param storesXml Stores XML used to bootstrap the fat client * @param clusterXml Cluster XML used to bootstrap the fat client + * @param errorStats + * @param coordinatorPerfStats */ public FatClientWrapper(String storeName, CoordinatorConfig config, ClientConfig clientConfig, String storesXml, - String clusterXml) { + String clusterXml, + CoordinatorErrorStats errorStats, + StoreStats coordinatorPerfStats) { - this.config = config; + this.coordinatorConfig = config; // TODO: Import this from Config - this.fatClientExecutor = new ThreadPoolExecutor(this.config.getFatClientWrapperCorePoolSize(), - this.config.getFatClientWrapperMaxPoolSize(), - this.config.getFatClientWrapperKeepAliveInSecs(), // Keepalive + this.fatClientExecutor = new ThreadPoolExecutor(clientConfig.getFatClientWrapperCorePoolSize(), + clientConfig.getFatClientWrapperMaxPoolSize(), + clientConfig.getFatClientWrapperKeepAliveInSecs(), // Keepalive TimeUnit.SECONDS, // Keepalive // Timeunit - new SynchronousQueue(), // Queue - // for - // pending - // tasks + new ArrayBlockingQueue(clientConfig.getFatClientWrapperMaxPoolSize(), + true), new ThreadFactory() { @@ -95,7 +104,6 @@ public void rejectedExecution(Runnable r, } }); - // this.fatClientRequestQueue = new SynchronousQueue(); this.storeClientFactory = new SocketStoreClientFactory(clientConfig); this.dynamicTimeoutClient = new DynamicTimeoutStoreClient(storeName, @@ -103,7 +111,24 @@ public void rejectedExecution(Runnable r, 1, storesXml, clusterXml); + this.errorStats = errorStats; + this.coordinatorPerfStats = coordinatorPerfStats; + this.storeName = storeName; + // Register the Mbean + JmxUtils.registerMbean(this, + JmxUtils.createObjectName(JmxUtils.getPackageName(this.getClass()), + 
JmxUtils.getClassName(this.getClass()) + + "-" + storeName)); + + } + + public void close() { + // Unregister the Mbean + JmxUtils.unregisterMbean(JmxUtils.createObjectName(JmxUtils.getPackageName(this.getClass()), + JmxUtils.getClassName(this.getClass()) + + "-" + this.storeName)); + this.storeClientFactory.close(); } /** @@ -111,22 +136,25 @@ public void rejectedExecution(Runnable r, * * @param getRequestObject Contains the key used in the get operation * @param getRequestMessageEvent MessageEvent to write the response back to + * @param startTimestampInNs The start timestamp used to measure turnaround + * time */ void submitGetRequest(final CompositeVoldemortRequest getRequestObject, - final MessageEvent getRequestMessageEvent) { + final MessageEvent getRequestMessageEvent, + long startTimestampInNs) { try { this.fatClientExecutor.submit(new HttpGetRequestExecutor(getRequestObject, getRequestMessageEvent, - this.dynamicTimeoutClient)); + this.dynamicTimeoutClient, + startTimestampInNs, + this.coordinatorPerfStats)); if(logger.isDebugEnabled()) { logger.debug("Submitted a get request"); } - // Keep track of this request for monitoring - // this.fatClientRequestQueue.add(f); } catch(RejectedExecutionException rej) { - handleRejectedException(getRequestMessageEvent); + handleRejectedException(rej, getRequestMessageEvent); } } @@ -136,50 +164,56 @@ void submitGetRequest(final CompositeVoldemortRequest getRequ * * @param getAllRequestObject Contains the keys used in the getAll operation * @param getAllRequestMessageEvent MessageEvent to write the response back * to + * @param storeName Name of the store to be specified in the response + * (header) + * @param startTimestampInNs The start timestamp used to measure turnaround + * time */ void submitGetAllRequest(final CompositeVoldemortRequest getAllRequestObject, final MessageEvent getAllRequestMessageEvent, - final String storeName) { + final String storeName, + long startTimestampInNs) { try { this.fatClientExecutor.submit(new HttpGetAllRequestExecutor(getAllRequestObject, getAllRequestMessageEvent, this.dynamicTimeoutClient, - storeName)); + storeName, + startTimestampInNs, + this.coordinatorPerfStats)); if(logger.isDebugEnabled()) { logger.debug("Submitted a get all request"); } - // Keep track of this request for monitoring - // this.fatClientRequestQueue.add(f); } catch(RejectedExecutionException rej) { - handleRejectedException(getAllRequestMessageEvent); + handleRejectedException(rej, getAllRequestMessageEvent); } } /** * Interface to perform put operation on the Fat client * - * @param key: ByteArray representation of the key to put - * @param value: value corresponding to the key to put - * @param putRequest: MessageEvent to write the response on. - * @param operationTimeoutInMs The timeout value for this operation + * @param putRequestObject Request object containing the key and value + * @param putRequestMessageEvent MessageEvent to write the response on.
+ * @param startTimestampInNs The start timestamp used to measure turnaround + * time */ void submitPutRequest(final CompositeVoldemortRequest putRequestObject, - final MessageEvent putRequest) { + final MessageEvent putRequestMessageEvent, + long startTimestampInNs) { try { this.fatClientExecutor.submit(new HttpPutRequestExecutor(putRequestObject, - putRequest, - this.dynamicTimeoutClient)); + putRequestMessageEvent, + this.dynamicTimeoutClient, + startTimestampInNs, + this.coordinatorPerfStats)); if(logger.isDebugEnabled()) { logger.debug("Submitted a put request"); } - // Keep track of this request for monitoring - // this.fatClientRequestQueue.add(f); } catch(RejectedExecutionException rej) { - handleRejectedException(putRequest); + handleRejectedException(rej, putRequestMessageEvent); } } @@ -189,37 +223,46 @@ void submitPutRequest(final CompositeVoldemortRequest putRequ * @param deleteRequestObject Contains the key and the version used in the * delete operation * @param deleteRequestEvent MessageEvent to write the response back to + * @param startTimestampInNs The start timestamp used to measure turnaround + * time */ public void submitDeleteRequest(CompositeVoldemortRequest deleteRequestObject, - MessageEvent deleteRequestEvent) { + MessageEvent deleteRequestEvent, + long startTimestampInNs) { try { this.fatClientExecutor.submit(new HttpDeleteRequestExecutor(deleteRequestObject, deleteRequestEvent, - this.dynamicTimeoutClient)); + this.dynamicTimeoutClient, + startTimestampInNs, + this.coordinatorPerfStats)); - // Keep track of this request for monitoring - // this.fatClientRequestQueue.add(f); } catch(RejectedExecutionException rej) { - handleRejectedException(deleteRequestEvent); + handleRejectedException(rej, deleteRequestEvent); } } // TODO: Add a custom HTTP Error status 429: Too many requests - private void handleRejectedException(MessageEvent getRequest) { + private void handleRejectedException(RejectedExecutionException rej, MessageEvent getRequest) { + this.errorStats.reportException(rej); logger.error("rejected !!!"); getRequest.getChannel().write(null); // Write error back to the thin // client - // String errorDescription = - // "Request queue for store " + - // this.dynamicTimeoutClient.getStoreName() - // + " is full !"); - // logger.error(errorDescription); - // RESTErrorHandler.handleError(REQUEST_TIMEOUT, - // this.getRequestMessageEvent, - // false, - // errorDescription); } + @JmxGetter(name = "numberOfActiveThreads", description = "The number of active Fat client wrapper threads.") + public int getNumberOfActiveThreads() { + return this.fatClientExecutor.getActiveCount(); + } + + @JmxGetter(name = "numberOfThreads", description = "The total number of Fat client wrapper threads, active and idle.") + public int getNumberOfThreads() { + return this.fatClientExecutor.getPoolSize(); + } + + @JmxGetter(name = "queuedRequests", description = "Number of requests in the Fat client wrapper queue waiting to execute.") + public int getQueuedRequests() { + return this.fatClientExecutor.getQueue().size(); + } } diff --git a/src/java/voldemort/coordinator/HttpDeleteRequestExecutor.java b/src/java/voldemort/coordinator/HttpDeleteRequestExecutor.java index 9009c19d16..c1b2e79aab 100644 --- a/src/java/voldemort/coordinator/HttpDeleteRequestExecutor.java +++ b/src/java/voldemort/coordinator/HttpDeleteRequestExecutor.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you 
may not * use this file except in compliance with the License. You may obtain a copy of @@ -17,17 +17,13 @@ package voldemort.coordinator; import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_LENGTH; -import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TRANSFER_ENCODING; -import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TYPE; import static org.jboss.netty.handler.codec.http.HttpResponseStatus.INTERNAL_SERVER_ERROR; import static org.jboss.netty.handler.codec.http.HttpResponseStatus.NOT_FOUND; -import static org.jboss.netty.handler.codec.http.HttpResponseStatus.OK; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.NO_CONTENT; import static org.jboss.netty.handler.codec.http.HttpResponseStatus.REQUEST_TIMEOUT; import static org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1; import org.apache.log4j.Logger; -import org.jboss.netty.channel.ChannelFuture; -import org.jboss.netty.channel.ChannelFutureListener; import org.jboss.netty.channel.MessageEvent; import org.jboss.netty.handler.codec.http.DefaultHttpResponse; import org.jboss.netty.handler.codec.http.HttpResponse; @@ -35,6 +31,8 @@ import voldemort.VoldemortException; import voldemort.store.CompositeVoldemortRequest; import voldemort.store.StoreTimeoutException; +import voldemort.store.stats.StoreStats; +import voldemort.store.stats.Tracked; import voldemort.utils.ByteArray; /** @@ -49,6 +47,8 @@ public class HttpDeleteRequestExecutor implements Runnable { DynamicTimeoutStoreClient storeClient; private final Logger logger = Logger.getLogger(HttpDeleteRequestExecutor.class); private final CompositeVoldemortRequest deleteRequestObject; + private final long startTimestampInNs; + private final StoreStats coordinatorPerfStats; /** * @@ -58,31 +58,37 @@ public class HttpDeleteRequestExecutor implements Runnable { * error * @param storeClient Reference to the fat client for performing this Delete * operation + * @param coordinatorPerfStats Stats object used to measure the turnaround + * time + * @param startTimestampInNs start timestamp of the request */ public HttpDeleteRequestExecutor(CompositeVoldemortRequest deleteRequestObject, MessageEvent requestEvent, - DynamicTimeoutStoreClient storeClient) { + DynamicTimeoutStoreClient storeClient, + long startTimestampInNs, + StoreStats coordinatorPerfStats) { this.deleteRequestMessageEvent = requestEvent; this.storeClient = storeClient; this.deleteRequestObject = deleteRequestObject; + this.startTimestampInNs = startTimestampInNs; + this.coordinatorPerfStats = coordinatorPerfStats; } public void writeResponse() { // 1. Create the Response object - HttpResponse response = new DefaultHttpResponse(HTTP_1_1, OK); + HttpResponse response = new DefaultHttpResponse(HTTP_1_1, NO_CONTENT); // 2. Set the right headers - response.setHeader(CONTENT_TYPE, "binary"); - response.setHeader(CONTENT_TRANSFER_ENCODING, "binary"); response.setHeader(CONTENT_LENGTH, "0"); - // Write the response to the Netty Channel - ChannelFuture future = this.deleteRequestMessageEvent.getChannel().write(response); - - // Close the non-keep-alive connection after the write operation is - // done. 
- future.addListener(ChannelFutureListener.CLOSE); + // Update the stats + if(this.coordinatorPerfStats != null) { + long durationInNs = System.nanoTime() - startTimestampInNs; + this.coordinatorPerfStats.recordTime(Tracked.DELETE, durationInNs); + } + // Write the response to the Netty Channel + this.deleteRequestMessageEvent.getChannel().write(response); } @Override @@ -94,7 +100,6 @@ public void run() { } else { RESTErrorHandler.handleError(NOT_FOUND, this.deleteRequestMessageEvent, - false, "Requested Key with the specified version does not exist"); } @@ -103,14 +108,12 @@ public void run() { logger.error(errorDescription); RESTErrorHandler.handleError(REQUEST_TIMEOUT, this.deleteRequestMessageEvent, - false, errorDescription); } catch(VoldemortException ve) { ve.printStackTrace(); String errorDescription = "Voldemort Exception: " + ve.getMessage(); RESTErrorHandler.handleError(INTERNAL_SERVER_ERROR, this.deleteRequestMessageEvent, - false, errorDescription); } } diff --git a/src/java/voldemort/coordinator/HttpGetAllRequestExecutor.java b/src/java/voldemort/coordinator/HttpGetAllRequestExecutor.java index 7ef491e677..c8a774b989 100644 --- a/src/java/voldemort/coordinator/HttpGetAllRequestExecutor.java +++ b/src/java/voldemort/coordinator/HttpGetAllRequestExecutor.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -44,8 +44,6 @@ import org.codehaus.jackson.map.ObjectMapper; import org.jboss.netty.buffer.ChannelBuffer; import org.jboss.netty.buffer.ChannelBuffers; -import org.jboss.netty.channel.ChannelFuture; -import org.jboss.netty.channel.ChannelFutureListener; import org.jboss.netty.channel.MessageEvent; import org.jboss.netty.handler.codec.http.DefaultHttpResponse; import org.jboss.netty.handler.codec.http.HttpResponse; @@ -53,6 +51,8 @@ import voldemort.VoldemortException; import voldemort.store.CompositeVoldemortRequest; import voldemort.store.StoreTimeoutException; +import voldemort.store.stats.StoreStats; +import voldemort.store.stats.Tracked; import voldemort.utils.ByteArray; import voldemort.versioning.VectorClock; import voldemort.versioning.Versioned; @@ -70,6 +70,8 @@ public class HttpGetAllRequestExecutor implements Runnable { private final Logger logger = Logger.getLogger(HttpGetRequestExecutor.class); private final CompositeVoldemortRequest getAllRequestObject; private final String storeName; + private final long startTimestampInNs; + private final StoreStats coordinatorPerfStats; /** * @@ -79,15 +81,24 @@ public class HttpGetAllRequestExecutor implements Runnable { * error * @param storeClient Reference to the fat client for performing this Get * operation + * @param storeName Name of the store intended to be included in the + * response (content-location) + * @param coordinatorPerfStats Stats object used to measure the turnaround + * time + * @param startTimestampInNs start timestamp of the request */ public HttpGetAllRequestExecutor(CompositeVoldemortRequest getAllRequestObject, MessageEvent requestMessageEvent, DynamicTimeoutStoreClient storeClient, - String storeName) { + String storeName, + long startTimestampInNs, + StoreStats coordinatorPerfStats) { this.getRequestMessageEvent = requestMessageEvent; this.storeClient = storeClient; this.getAllRequestObject = getAllRequestObject; this.storeName = storeName; + this.startTimestampInNs = startTimestampInNs; + 
this.coordinatorPerfStats = coordinatorPerfStats; } public void writeResponse(Map> responseVersioned) { @@ -160,13 +171,14 @@ public void writeResponse(Map> responseVersioned) { response.setContent(responseContent); response.setHeader(CONTENT_LENGTH, response.getContent().readableBytes()); - // Write the response to the Netty Channel - ChannelFuture future = this.getRequestMessageEvent.getChannel().write(response); - - // Close the non-keep-alive connection after the write operation is - // done. - future.addListener(ChannelFutureListener.CLOSE); + // Update the stats + if(this.coordinatorPerfStats != null) { + long durationInNs = System.nanoTime() - startTimestampInNs; + this.coordinatorPerfStats.recordTime(Tracked.GET_ALL, durationInNs); + } + // Write the response to the Netty Channel + this.getRequestMessageEvent.getChannel().write(response); } @Override @@ -176,7 +188,6 @@ public void run() { if(responseVersioned == null) { RESTErrorHandler.handleError(NOT_FOUND, this.getRequestMessageEvent, - false, "Requested Key does not exist"); } writeResponse(responseVersioned); @@ -184,22 +195,17 @@ public void run() { String errorDescription = "GETALL Failed !!! Illegal Arguments : " + illegalArgsException.getMessage(); logger.error(errorDescription); - RESTErrorHandler.handleError(BAD_REQUEST, - this.getRequestMessageEvent, - false, - errorDescription); + RESTErrorHandler.handleError(BAD_REQUEST, this.getRequestMessageEvent, errorDescription); } catch(StoreTimeoutException timeoutException) { String errorDescription = "GET Request timed out: " + timeoutException.getMessage(); logger.error(errorDescription); RESTErrorHandler.handleError(REQUEST_TIMEOUT, this.getRequestMessageEvent, - false, errorDescription); } catch(VoldemortException ve) { String errorDescription = "Voldemort Exception: " + ve.getMessage(); RESTErrorHandler.handleError(INTERNAL_SERVER_ERROR, this.getRequestMessageEvent, - false, errorDescription); } } diff --git a/src/java/voldemort/coordinator/HttpGetRequestExecutor.java b/src/java/voldemort/coordinator/HttpGetRequestExecutor.java index 61c4a543c8..02a1de9a58 100644 --- a/src/java/voldemort/coordinator/HttpGetRequestExecutor.java +++ b/src/java/voldemort/coordinator/HttpGetRequestExecutor.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -27,12 +27,7 @@ import static org.jboss.netty.handler.codec.http.HttpResponseStatus.REQUEST_TIMEOUT; import static org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1; -import java.io.IOException; - import org.apache.log4j.Logger; -import org.codehaus.jackson.JsonGenerationException; -import org.codehaus.jackson.map.JsonMappingException; -import org.codehaus.jackson.map.ObjectMapper; import org.jboss.netty.buffer.ChannelBuffer; import org.jboss.netty.buffer.ChannelBuffers; import org.jboss.netty.channel.MessageEvent; @@ -42,6 +37,8 @@ import voldemort.VoldemortException; import voldemort.store.CompositeVoldemortRequest; import voldemort.store.StoreTimeoutException; +import voldemort.store.stats.StoreStats; +import voldemort.store.stats.Tracked; import voldemort.utils.ByteArray; import voldemort.versioning.VectorClock; import voldemort.versioning.Versioned; @@ -59,6 +56,20 @@ public class HttpGetRequestExecutor implements Runnable { DynamicTimeoutStoreClient storeClient; private final Logger logger = Logger.getLogger(HttpGetRequestExecutor.class); private final CompositeVoldemortRequest getRequestObject; + private final long startTimestampInNs; + private final StoreStats coordinatorPerfStats; + + /** + * Dummy constructor invoked during a Noop Get operation + * + * @param requestEvent MessageEvent used to write the response + */ + public HttpGetRequestExecutor(MessageEvent requestEvent) { + this.getRequestMessageEvent = requestEvent; + this.getRequestObject = null; + this.startTimestampInNs = 0; + this.coordinatorPerfStats = null; + } /** * @@ -68,13 +79,20 @@ public class HttpGetRequestExecutor implements Runnable { * error * @param storeClient Reference to the fat client for performing this Get * operation + * @param coordinatorPerfStats Stats object used to measure the turnaround + * time + * @param startTimestampInNs start timestamp of the request */ public HttpGetRequestExecutor(CompositeVoldemortRequest getRequestObject, MessageEvent requestEvent, - DynamicTimeoutStoreClient storeClient) { + DynamicTimeoutStoreClient storeClient, + long startTimestampInNs, + StoreStats coordinatorPerfStats) { this.getRequestMessageEvent = requestEvent; this.storeClient = storeClient; this.getRequestObject = getRequestObject; + this.startTimestampInNs = startTimestampInNs; + this.coordinatorPerfStats = coordinatorPerfStats; } public void writeResponse(Versioned responseVersioned) { @@ -87,18 +105,7 @@ public void writeResponse(Versioned responseVersioned) { this.responseContent.writeBytes(value); VectorClock vc = (VectorClock) responseVersioned.getVersion(); - VectorClockWrapper vcWrapper = new VectorClockWrapper(vc); - ObjectMapper mapper = new ObjectMapper(); - String eTag = ""; - try { - eTag = mapper.writeValueAsString(vcWrapper); - } catch(JsonGenerationException e) { - e.printStackTrace(); - } catch(JsonMappingException e) { - e.printStackTrace(); - } catch(IOException e) { - e.printStackTrace(); - } + String eTag = CoordinatorUtils.getSerializedVectorClock(vc); if(logger.isDebugEnabled()) { logger.debug("ETAG : " + eTag); @@ -120,6 +127,12 @@ public void writeResponse(Versioned responseVersioned) { logger.debug("Response = " + response); } + // Update the stats + if(this.coordinatorPerfStats != null) { + long durationInNs = System.nanoTime() - startTimestampInNs; + this.coordinatorPerfStats.recordTime(Tracked.GET, durationInNs); + } + // Write the response to the Netty Channel this.getRequestMessageEvent.getChannel().write(response); } @@ -134,7 +147,6 @@ public 
void run() { } else { RESTErrorHandler.handleError(NOT_FOUND, this.getRequestMessageEvent, - false, "Requested Key does not exist"); } if(logger.isDebugEnabled()) { @@ -146,22 +158,17 @@ public void run() { String errorDescription = "GET Failed !!! Illegal Arguments : " + illegalArgsException.getMessage(); logger.error(errorDescription); - RESTErrorHandler.handleError(BAD_REQUEST, - this.getRequestMessageEvent, - false, - errorDescription); + RESTErrorHandler.handleError(BAD_REQUEST, this.getRequestMessageEvent, errorDescription); } catch(StoreTimeoutException timeoutException) { String errorDescription = "GET Request timed out: " + timeoutException.getMessage(); logger.error(errorDescription); RESTErrorHandler.handleError(REQUEST_TIMEOUT, this.getRequestMessageEvent, - false, errorDescription); } catch(VoldemortException ve) { String errorDescription = "Voldemort Exception: " + ve.getMessage(); RESTErrorHandler.handleError(INTERNAL_SERVER_ERROR, this.getRequestMessageEvent, - false, errorDescription); } } diff --git a/src/java/voldemort/coordinator/HttpPutRequestExecutor.java b/src/java/voldemort/coordinator/HttpPutRequestExecutor.java index ebbc7acc0d..e0ad0f98da 100644 --- a/src/java/voldemort/coordinator/HttpPutRequestExecutor.java +++ b/src/java/voldemort/coordinator/HttpPutRequestExecutor.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -17,10 +17,10 @@ package voldemort.coordinator; import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_LENGTH; -import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.CONTENT_TYPE; +import static org.jboss.netty.handler.codec.http.HttpHeaders.Names.ETAG; import static org.jboss.netty.handler.codec.http.HttpResponseStatus.BAD_REQUEST; +import static org.jboss.netty.handler.codec.http.HttpResponseStatus.CREATED; import static org.jboss.netty.handler.codec.http.HttpResponseStatus.INTERNAL_SERVER_ERROR; -import static org.jboss.netty.handler.codec.http.HttpResponseStatus.OK; import static org.jboss.netty.handler.codec.http.HttpResponseStatus.PRECONDITION_FAILED; import static org.jboss.netty.handler.codec.http.HttpResponseStatus.REQUEST_TIMEOUT; import static org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1; @@ -33,8 +33,11 @@ import voldemort.VoldemortException; import voldemort.store.CompositeVoldemortRequest; import voldemort.store.StoreTimeoutException; +import voldemort.store.stats.StoreStats; +import voldemort.store.stats.Tracked; import voldemort.utils.ByteArray; import voldemort.versioning.ObsoleteVersionException; +import voldemort.versioning.VectorClock; /** * A Runnable class that uses the specified Fat client to perform a Voldemort @@ -48,10 +51,19 @@ public class HttpPutRequestExecutor implements Runnable { DynamicTimeoutStoreClient storeClient; private final Logger logger = Logger.getLogger(HttpPutRequestExecutor.class); private final CompositeVoldemortRequest putRequestObject; + private final long startTimestampInNs; + private final StoreStats coordinatorPerfStats; + /** + * Dummy constructor invoked during a Noop Put operation + * + * @param requestEvent MessageEvent used to write the response + */ public HttpPutRequestExecutor(MessageEvent requestEvent) { this.putRequestMessageEvent = requestEvent; this.putRequestObject = null; + this.startTimestampInNs = 0; + this.coordinatorPerfStats = null; }
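The executors around this point move vector clocks in and out of HTTP headers via the new CoordinatorUtils helpers: the GET path serializes the response version into the ETag header, and the PUT path below does the same for the version of a successful put. What follows is a minimal, self-contained sketch of that round trip, assuming only the two helpers defined in the new CoordinatorUtils.java earlier in this patch; the class name and the node id 0 are made up for illustration:

    import voldemort.coordinator.CoordinatorUtils;
    import voldemort.versioning.VectorClock;

    public class VectorClockHeaderSketch {

        public static void main(String[] args) {
            // Build a clock with a single entry for a (hypothetical) node 0
            VectorClock original = new VectorClock();
            original = original.incremented(0, System.currentTimeMillis());

            // Serialize to the JSON form carried in the ETag response header ...
            String header = CoordinatorUtils.getSerializedVectorClock(original);

            // ... and parse it back, as the coordinator does for incoming
            // X-VOLD-Vector-Clock request headers
            VectorClock parsed = CoordinatorUtils.deserializeVectorClock(header);

            System.out.println(header);
            System.out.println(original.equals(parsed)); // expected: true
        }
    }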
/** @@ -62,25 +74,44 @@ public HttpPutRequestExecutor(MessageEvent requestEvent) { * error * @param storeClient Reference to the fat client for performing this put * operation + * @param coordinatorPerfStats Stats object used to measure the turnaround + * time + * @param startTimestampInNs start timestamp of the request */ public HttpPutRequestExecutor(CompositeVoldemortRequest putRequestObject, MessageEvent requestEvent, - DynamicTimeoutStoreClient storeClient) { + DynamicTimeoutStoreClient storeClient, + long startTimestampInNs, + StoreStats coordinatorPerfStats) { this.putRequestMessageEvent = requestEvent; this.storeClient = storeClient; this.putRequestObject = putRequestObject; + this.startTimestampInNs = startTimestampInNs; + this.coordinatorPerfStats = coordinatorPerfStats; } - public void writeResponse() { + public void writeResponse(VectorClock successfulPutVC) { // 1. Create the Response object - HttpResponse response = new DefaultHttpResponse(HTTP_1_1, OK); + HttpResponse response = new DefaultHttpResponse(HTTP_1_1, CREATED); - // 2. Set the right headers - response.setHeader(CONTENT_TYPE, "application/json"); + String eTag = CoordinatorUtils.getSerializedVectorClock(successfulPutVC); - // 3. Copy the data into the payload + if(logger.isDebugEnabled()) { + logger.debug("ETAG : " + eTag); + } + + // 2. Set the right headers + response.setHeader(ETAG, eTag); response.setHeader(CONTENT_LENGTH, 0); + // TODO: return the Version back to the client + + // Update the stats + if(this.coordinatorPerfStats != null) { + long durationInNs = System.nanoTime() - startTimestampInNs; + this.coordinatorPerfStats.recordTime(Tracked.PUT, durationInNs); + } + + // Write the response to the Netty Channel this.putRequestMessageEvent.getChannel().write(response); } @@ -89,26 +120,27 @@ public void writeResponse() { public void run() { try { - this.storeClient.putWithCustomTimeout(putRequestObject); + VectorClock successfulPutVC = null; + if(putRequestObject.getValue() != null) { + successfulPutVC = (VectorClock) this.storeClient.putVersionedWithCustomTimeout(putRequestObject); + } else { + successfulPutVC = (VectorClock) this.storeClient.putWithCustomTimeout(putRequestObject); + } if(logger.isDebugEnabled()) { logger.debug("PUT successful !"); } - writeResponse(); + writeResponse(successfulPutVC); } catch(IllegalArgumentException illegalArgsException) { String errorDescription = "PUT Failed !!! Illegal Arguments : " + illegalArgsException.getMessage(); logger.error(errorDescription); - RESTErrorHandler.handleError(BAD_REQUEST, - this.putRequestMessageEvent, - false, - errorDescription); + RESTErrorHandler.handleError(BAD_REQUEST, this.putRequestMessageEvent, errorDescription); } catch(ObsoleteVersionException oe) { String errorDescription = "PUT Failed !!!
Obsolete version exception: " + oe.getMessage(); RESTErrorHandler.handleError(PRECONDITION_FAILED, this.putRequestMessageEvent, - false, errorDescription); } catch(StoreTimeoutException timeoutException) { @@ -116,14 +148,12 @@ public void run() { logger.error(errorDescription); RESTErrorHandler.handleError(REQUEST_TIMEOUT, this.putRequestMessageEvent, - false, errorDescription); } catch(VoldemortException ve) { String errorDescription = "Voldemort Exception: " + ve.getMessage(); RESTErrorHandler.handleError(INTERNAL_SERVER_ERROR, this.putRequestMessageEvent, - false, errorDescription); } } diff --git a/src/java/voldemort/coordinator/NoopHttpRequestHandler.java b/src/java/voldemort/coordinator/NoopHttpRequestHandler.java index 7b0574453a..79a07b4a30 100644 --- a/src/java/voldemort/coordinator/NoopHttpRequestHandler.java +++ b/src/java/voldemort/coordinator/NoopHttpRequestHandler.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of @@ -22,8 +22,7 @@ import org.jboss.netty.handler.codec.http.HttpRequest; import voldemort.common.VoldemortOpCode; -import voldemort.store.CompositeGetVoldemortRequest; -import voldemort.utils.ByteArray; +import voldemort.versioning.VectorClock; import voldemort.versioning.Versioned; /** @@ -43,21 +42,16 @@ public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Ex switch(operationType) { case VoldemortOpCode.GET_OP_CODE: - HttpGetRequestExecutor getExecutor = new HttpGetRequestExecutor(new CompositeGetVoldemortRequest(null, - 0l, - false), - e, - null); + HttpGetRequestExecutor getExecutor = new HttpGetRequestExecutor(e); Versioned responseVersioned = null; - byte[] nullByteArray = new byte[1]; - nullByteArray[0] = 0; - responseVersioned = new Versioned(nullByteArray); + byte[] sampleByteArray = "a".getBytes(); + responseVersioned = new Versioned(sampleByteArray); getExecutor.writeResponse(responseVersioned); break; case VoldemortOpCode.PUT_OP_CODE: HttpPutRequestExecutor putRequestExecutor = new HttpPutRequestExecutor(e); - putRequestExecutor.writeResponse(); + putRequestExecutor.writeResponse(new VectorClock()); break; default: System.err.println("Illegal operation."); diff --git a/src/java/voldemort/coordinator/RESTErrorHandler.java b/src/java/voldemort/coordinator/RESTErrorHandler.java index f83e923248..d57b835aa7 100644 --- a/src/java/voldemort/coordinator/RESTErrorHandler.java +++ b/src/java/voldemort/coordinator/RESTErrorHandler.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -20,14 +20,14 @@ import static org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1; import org.jboss.netty.buffer.ChannelBuffers; -import org.jboss.netty.channel.ChannelFuture; -import org.jboss.netty.channel.ChannelFutureListener; import org.jboss.netty.channel.MessageEvent; import org.jboss.netty.handler.codec.http.DefaultHttpResponse; import org.jboss.netty.handler.codec.http.HttpResponse; import org.jboss.netty.handler.codec.http.HttpResponseStatus; import org.jboss.netty.util.CharsetUtil; +import voldemort.VoldemortException; + /** * A Generic class used to propagate the error back to the client over the Netty * channel @@ -35,10 +35,15 @@ */ public class RESTErrorHandler { - public static void handleError(HttpResponseStatus status, - MessageEvent e, - boolean keepAlive, - String message) { + static CoordinatorErrorStats errorStats; + + public static void setErrorStatsHandler(CoordinatorErrorStats errorStatsObj) { + errorStats = errorStatsObj; + } + + public static void handleError(HttpResponseStatus status, MessageEvent e, String message) { + errorStats.reportException(new VoldemortException()); + // 1. Create the Response object HttpResponse response = new DefaultHttpResponse(HTTP_1_1, status); @@ -47,12 +52,6 @@ public static void handleError(HttpResponseStatus status, + message + "\r\n", CharsetUtil.UTF_8)); // Write the response to the Netty Channel - ChannelFuture future = e.getChannel().write(response); - - // Close the non-keep-alive connection after the write operation is - // done. - if(!keepAlive) { - future.addListener(ChannelFutureListener.CLOSE); - } + e.getChannel().write(response); } } diff --git a/src/java/voldemort/coordinator/VoldemortHttpRequestHandler.java b/src/java/voldemort/coordinator/VoldemortHttpRequestHandler.java index 0af1a280c0..a6b9553a4a 100644 --- a/src/java/voldemort/coordinator/VoldemortHttpRequestHandler.java +++ b/src/java/voldemort/coordinator/VoldemortHttpRequestHandler.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2013 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -16,7 +16,6 @@ package voldemort.coordinator; -import static org.jboss.netty.handler.codec.http.HttpHeaders.isKeepAlive; import static org.jboss.netty.handler.codec.http.HttpResponseStatus.BAD_REQUEST; import java.io.IOException; @@ -35,19 +34,19 @@ import org.jboss.netty.channel.MessageEvent; import org.jboss.netty.channel.SimpleChannelUpstreamHandler; import org.jboss.netty.handler.codec.http.HttpChunk; -import org.jboss.netty.handler.codec.http.HttpChunkTrailer; import org.jboss.netty.handler.codec.http.HttpMethod; import org.jboss.netty.handler.codec.http.HttpRequest; -import org.jboss.netty.util.CharsetUtil; import voldemort.common.VoldemortOpCode; import voldemort.store.CompositeDeleteVoldemortRequest; import voldemort.store.CompositeGetAllVoldemortRequest; import voldemort.store.CompositeGetVoldemortRequest; import voldemort.store.CompositePutVoldemortRequest; +import voldemort.store.CompositeVersionedPutVoldemortRequest; import voldemort.store.CompositeVoldemortRequest; import voldemort.utils.ByteArray; import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; /** * A class to handle the HTTP request and execute the same on behalf of the thin @@ -60,8 +59,6 @@ public class VoldemortHttpRequestHandler extends SimpleChannelUpstreamHandler { public HttpRequest request; private boolean readingChunks; - /** Buffer that stores the response content */ - private final StringBuilder buf = new StringBuilder(); private Map fatClientMap; private final Logger logger = Logger.getLogger(VoldemortHttpRequestHandler.class); public static final String X_VOLD_REQUEST_TIMEOUT_MS = "X-VOLD-Request-Timeout-ms"; @@ -70,20 +67,25 @@ public class VoldemortHttpRequestHandler extends SimpleChannelUpstreamHandler { public static final String CUSTOM_RESOLVING_STRATEGY = "custom"; public static final String DEFAULT_RESOLVING_STRATEGY = "timestamp"; + private CoordinatorErrorStats errorStats = null; + // Implicit constructor defined for the derived classes public VoldemortHttpRequestHandler() {} - public VoldemortHttpRequestHandler(Map fatClientMap) { + public VoldemortHttpRequestHandler(Map fatClientMap, + CoordinatorErrorStats errorStats) { this.fatClientMap = fatClientMap; + this.errorStats = errorStats; } /** - * Function to parse the HTTP headers and build a Voldemort request object + * Function to parse (and validate) the HTTP headers and build a Voldemort + * request object * * @param requestURI URI of the REST request * @param httpMethod Message Event object used to write the response to * @param e The REST (Voldemort) operation type - * @return true if a valid request was received. 
False otherwise + * @return A composite request object corresponding to the incoming request */ private CompositeVoldemortRequest parseRequest(String requestURI, MessageEvent e, @@ -153,19 +155,23 @@ private CompositeVoldemortRequest parseRequest(String request return null; } byte[] putValue = readValue(content); - requestWrapper = new CompositePutVoldemortRequest(putKey, - putValue, - operationTimeoutInMs); + VectorClock putOpVectorClock = getVectorClock(this.request.getHeader(X_VOLD_VECTOR_CLOCK)); + if(putOpVectorClock != null && putOpVectorClock.getEntries().size() > 0) { + requestWrapper = new CompositeVersionedPutVoldemortRequest(putKey, + new Versioned(putValue, + putOpVectorClock), + operationTimeoutInMs); + } else { + requestWrapper = new CompositePutVoldemortRequest(putKey, + putValue, + operationTimeoutInMs); + } break; case VoldemortOpCode.DELETE_OP_CODE: - VectorClock vc = getVectorClock(this.request.getHeader(X_VOLD_VECTOR_CLOCK)); - if(vc == null) { - // handleBadRequest(e, - // "Incorrect vector clock specified in the request"); - } + VectorClock deleteOpVectorClock = getVectorClock(this.request.getHeader(X_VOLD_VECTOR_CLOCK)); requestWrapper = new CompositeDeleteVoldemortRequest(keyList.get(0), - vc, + deleteOpVectorClock, operationTimeoutInMs); break; @@ -192,6 +198,8 @@ public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Ex readingChunks = true; } else { + long startTimeStampInNs = System.nanoTime(); + CompositeVoldemortRequest requestObject = parseRequest(requestURI, e, this.request.getMethod()); @@ -205,11 +213,13 @@ public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Ex } if(storeName == null || fatClientWrapper == null) { + this.errorStats.reportException(new IllegalArgumentException()); handleBadRequest(e, "Invalid store name. 
Critical error."); return; } if(requestObject == null) { + this.errorStats.reportException(new IllegalArgumentException()); handleBadRequest(e, "Illegal request."); return; } @@ -219,28 +229,28 @@ public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Ex if(logger.isDebugEnabled()) { logger.debug("Incoming get request"); } - fatClientWrapper.submitGetRequest(requestObject, e); + fatClientWrapper.submitGetRequest(requestObject, e, startTimeStampInNs); break; case VoldemortOpCode.GET_ALL_OP_CODE: - fatClientWrapper.submitGetAllRequest(requestObject, e, storeName); + fatClientWrapper.submitGetAllRequest(requestObject, + e, + storeName, + startTimeStampInNs); break; case VoldemortOpCode.PUT_OP_CODE: if(logger.isDebugEnabled()) { logger.debug("Incoming put request"); } - fatClientWrapper.submitPutRequest(requestObject, e); + fatClientWrapper.submitPutRequest(requestObject, e, startTimeStampInNs); break; case VoldemortOpCode.DELETE_OP_CODE: - fatClientWrapper.submitDeleteRequest(requestObject, e); + fatClientWrapper.submitDeleteRequest(requestObject, e, startTimeStampInNs); break; default: String errorMessage = "Illegal operation."; logger.error(errorMessage); - RESTErrorHandler.handleError(BAD_REQUEST, - e, - isKeepAlive(request), - errorMessage); + RESTErrorHandler.handleError(BAD_REQUEST, e, errorMessage); return; } @@ -249,23 +259,7 @@ public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Ex HttpChunk chunk = (HttpChunk) e.getMessage(); if(chunk.isLast()) { readingChunks = false; - buf.append("END OF CONTENT\r\n"); - - HttpChunkTrailer trailer = (HttpChunkTrailer) chunk; - if(!trailer.getHeaderNames().isEmpty()) { - buf.append("\r\n"); - for(String name: trailer.getHeaderNames()) { - for(String value: trailer.getHeaders(name)) { - buf.append("TRAILING HEADER: " + name + " = " + value + "\r\n"); - } - } - buf.append("\r\n"); - } - - } else { - buf.append("CHUNK: " + chunk.getContent().toString(CharsetUtil.UTF_8) + "\r\n"); } - } } @@ -278,6 +272,11 @@ public void messageReceived(ChannelHandlerContext ctx, MessageEvent e) throws Ex */ private VectorClock getVectorClock(String vectorClockHeader) { VectorClock vc = null; + + if(vectorClockHeader == null) { + return null; + } + ObjectMapper mapper = new ObjectMapper(); if(logger.isDebugEnabled()) { logger.debug("Received vector clock : " + vectorClockHeader); @@ -312,7 +311,7 @@ private VectorClock getVectorClock(String vectorClockHeader) { private void handleBadRequest(MessageEvent e, String msg) { String errorMessage = msg; logger.error(errorMessage); - RESTErrorHandler.handleError(BAD_REQUEST, e, false, errorMessage); + RESTErrorHandler.handleError(BAD_REQUEST, e, errorMessage); } /** diff --git a/src/java/voldemort/routing/ConsistentRoutingStrategy.java b/src/java/voldemort/routing/ConsistentRoutingStrategy.java index fa226c6ca5..cb7f48d6ef 100644 --- a/src/java/voldemort/routing/ConsistentRoutingStrategy.java +++ b/src/java/voldemort/routing/ConsistentRoutingStrategy.java @@ -113,12 +113,12 @@ public List routeRequest(byte[] key) { preferenceList.add(partitionToNode[partition]); } if(logger.isDebugEnabled()) { - StringBuilder nodeList = new StringBuilder(); + List nodeIdList = new ArrayList(); for(int partition: partitionList) { - nodeList.append(partitionToNode[partition].getId() + ","); + nodeIdList.add(partitionToNode[partition].getId()); } - logger.debug("Key " + ByteUtils.toHexString(key) + " mapped to Nodes [" + nodeList - + "] Partitions [" + partitionList + "]"); + logger.debug("Key " + 
ByteUtils.toHexString(key) + " mapped to Nodes " + nodeIdList + + " Partitions " + partitionList); } return preferenceList; } diff --git a/src/java/voldemort/routing/RoutingStrategy.java b/src/java/voldemort/routing/RoutingStrategy.java index 827d861657..07c7b9050e 100644 --- a/src/java/voldemort/routing/RoutingStrategy.java +++ b/src/java/voldemort/routing/RoutingStrategy.java @@ -49,6 +49,11 @@ public interface RoutingStrategy { /** * Get the partition list for the given key. * + * TODO: The naming of this method is confusing.. it is simply a wrapper + * around {@link RoutingStrategy#getReplicatingPartitionList(int)} that + * takes a key. So, would be good to rename this also as + * getReplicatingPartitionList + * * @param key The key the operation is operating on * @return The partition list for the given key */ diff --git a/src/java/voldemort/utils/StoreInstance.java b/src/java/voldemort/routing/StoreRoutingPlan.java similarity index 69% rename from src/java/voldemort/utils/StoreInstance.java rename to src/java/voldemort/routing/StoreRoutingPlan.java index bc731b9371..8e890e7c31 100644 --- a/src/java/voldemort/utils/StoreInstance.java +++ b/src/java/voldemort/routing/StoreRoutingPlan.java @@ -14,7 +14,7 @@ * the License. */ -package voldemort.utils; +package voldemort.routing; import java.util.ArrayList; import java.util.HashMap; @@ -24,9 +24,13 @@ import voldemort.VoldemortException; import voldemort.cluster.Cluster; -import voldemort.routing.RoutingStrategyFactory; -import voldemort.routing.RoutingStrategyType; +import voldemort.cluster.Node; import voldemort.store.StoreDefinition; +import voldemort.utils.ByteUtils; +import voldemort.utils.ClusterUtils; +import voldemort.utils.NodeUtils; +import voldemort.utils.Pair; +import voldemort.utils.Utils; import com.google.common.collect.Lists; @@ -34,25 +38,23 @@ /** * This class wraps up a Cluster object and a StoreDefinition. The methods are - * effectively helper or util style methods for analyzing partitions and so on - * which are a function of both Cluster and StoreDefinition. + * effectively helper or util style methods for querying the routing plan that + * will be generated for a given routing strategy upon store and cluster + * topology information. */ -public class StoreInstance { - - // TODO: (refactor) Improve upon the name "StoreInstance". Object-oriented - // meaning of 'instance' is too easily confused with system notion of an - // "instance of a cluster" (the intended usage in this class name). +public class StoreRoutingPlan { private final Cluster cluster; private final StoreDefinition storeDefinition; - private final Map partitionIdToNodeIdMap; + private final RoutingStrategy routingStrategy; - public StoreInstance(Cluster cluster, StoreDefinition storeDefinition) { + public StoreRoutingPlan(Cluster cluster, StoreDefinition storeDefinition) { this.cluster = cluster; this.storeDefinition = storeDefinition; - - partitionIdToNodeIdMap = ClusterUtils.getCurrentPartitionMapping(cluster); + this.partitionIdToNodeIdMap = ClusterUtils.getCurrentPartitionMapping(cluster); + this.routingStrategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, + cluster); } public Cluster getCluster() { @@ -69,19 +71,28 @@ public StoreDefinition getStoreDefinition() { * @param masterPartitionId * @return List of partition IDs that replicate the master partition ID. 
*/ - public List getReplicationPartitionList(int masterPartitionId) { - return new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, cluster) - .getReplicatingPartitionList(masterPartitionId); + public List getReplicatingPartitionList(int masterPartitionId) { + return this.routingStrategy.getReplicatingPartitionList(masterPartitionId); } /** * Determines list of partition IDs that replicate the key. * * @param key - * @return List of partition IDs that replicate the partition ID. + * @return List of partition IDs that replicate the given key */ - public List getReplicationPartitionList(final byte[] key) { - return getReplicationPartitionList(getMasterPartitionId(key)); + public List getReplicatingPartitionList(final byte[] key) { + return this.routingStrategy.getPartitionList(key); + } + + /** + * Determines the list of nodes that the key replicates to + * + * @param key + * @return list of nodes that key replicates to + */ + public List getReplicationNodeList(final byte[] key) { + return NodeUtils.getNodeIds(this.routingStrategy.routeRequest(key)); } /** @@ -91,8 +102,7 @@ public List getReplicationPartitionList(final byte[] key) { * @return */ public int getMasterPartitionId(final byte[] key) { - return new RoutingStrategyFactory().updateRoutingStrategy(storeDefinition, cluster) - .getMasterPartition(key); + return this.routingStrategy.getMasterPartition(key); } /** @@ -113,8 +123,11 @@ public int getNodeIdForPartitionId(int partitionId) { * @return partitionId if found, otherwise null. */ public Integer getNodesPartitionIdForKey(int nodeId, final byte[] key) { - List partitionIds = getReplicationPartitionList(key); + // this is all the partitions the key replicates to. + List partitionIds = getReplicatingPartitionList(key); for(Integer partitionId: partitionIds) { + // check which of the replicating partitions belongs to the node in + // question if(getNodeIdForPartitionId(partitionId) == nodeId) { return partitionId; } @@ -147,8 +160,88 @@ private List getNodeIdListForPartitionIdList(List partitionIds return nodeIds; } + /** + * Returns the list of node ids this partition replicates to. 
+ * + * TODO ideally the {@link RoutingStrategy} should house a routeRequest(int + * partition) method + * + * @param partitionId + * @return + * @throws VoldemortException + */ public List getReplicationNodeList(int partitionId) throws VoldemortException { - return getNodeIdListForPartitionIdList(getReplicationPartitionList(partitionId)); + return getNodeIdListForPartitionIdList(getReplicatingPartitionList(partitionId)); + } + + /** + * Given a key that belongs to a given node, returns a number n (< zone + * replication factor), such that the given node holds the key as the nth + * replica of the given zone + * + * e.g. if the method returns 1, then the given node hosts the key as the + * zone secondary in the given zone + * + * @param zoneId + * @param nodeId + * @param key + * @return + */ + public int getZoneReplicaType(int zoneId, int nodeId, byte[] key) { + List replicatingNodes = this.routingStrategy.routeRequest(key); + int zoneReplicaType = -1; + for(Node node: replicatingNodes) { + // bump up the replica number once you encounter a node in the given + // zone + if(node.getZoneId() == zoneId) { + zoneReplicaType++; + } + // we are done when we find the given node + if(node.getId() == nodeId) { + return zoneReplicaType; + } + } + if(zoneReplicaType > -1) { + throw new VoldemortException("Node " + nodeId + " not a replica for the key " + + ByteUtils.toHexString(key) + " in given zone " + zoneId); + } else { + throw new VoldemortException("Could not find any replicas for the key " + + ByteUtils.toHexString(key) + " in given zone " + zoneId); + } + } + + /** + * Given a key and a replica type n (< zone replication factor), figure out + * the node that contains the key as the nth replica in the given zone. + * + * @param zoneId + * @param zoneReplicaType + * @param key + * @return + */ + public int getZoneReplicaNode(int zoneId, int zoneReplicaType, byte[] key) { + List replicatingNodes = this.routingStrategy.routeRequest(key); + int zoneReplicaTypeCounter = -1; + for(Node node: replicatingNodes) { + // bump up the counter if we encounter a replica in the given zone + if(node.getZoneId() == zoneId) { + zoneReplicaTypeCounter++; + } + // when the counter matches up with the replica number we need, we + // are done. + if(zoneReplicaTypeCounter == zoneReplicaType) { + return node.getId(); + } + } + if(zoneReplicaTypeCounter == -1) { + throw new VoldemortException("Could not find any replicas for the key " + + ByteUtils.toHexString(key) + " in given zone " + zoneId); + } else { + throw new VoldemortException("Could not find " + (zoneReplicaType + 1) + + " replicas for the key " + ByteUtils.toHexString(key) + + " in given zone " + zoneId + ".
Only found " + + (zoneReplicaTypeCounter + 1)); + } } // TODO: (refactor) Move from static methods to non-static methods that use @@ -212,9 +305,9 @@ public static boolean checkKeyBelongsToPartition(int nodeId, cluster) .getPartitionList(key); List nodePartitions = cluster.getNodeById(nodeId).getPartitionIds(); - checkResult = StoreInstance.checkKeyBelongsToPartition(keyPartitions, - nodePartitions, - replicaToPartitionList); + checkResult = StoreRoutingPlan.checkKeyBelongsToPartition(keyPartitions, + nodePartitions, + replicaToPartitionList); } return checkResult; } @@ -266,9 +359,9 @@ public static List checkKeyBelongsToPartition(byte[] key, for(Pair>> stealNodeToMap: stealerNodeToMappingTuples) { List nodePartitions = cluster.getNodeById(stealNodeToMap.getFirst()) .getPartitionIds(); - if(StoreInstance.checkKeyBelongsToPartition(keyPartitions, - nodePartitions, - stealNodeToMap.getSecond())) { + if(StoreRoutingPlan.checkKeyBelongsToPartition(keyPartitions, + nodePartitions, + stealNodeToMap.getSecond())) { nodesToPush.add(stealNodeToMap.getFirst()); } } diff --git a/src/java/voldemort/server/VoldemortConfig.java b/src/java/voldemort/server/VoldemortConfig.java index d7eb7bdfb6..83a86e3223 100644 --- a/src/java/voldemort/server/VoldemortConfig.java +++ b/src/java/voldemort/server/VoldemortConfig.java @@ -202,6 +202,7 @@ public class VoldemortConfig implements Serializable { private long streamMaxReadBytesPerSec; private long streamMaxWriteBytesPerSec; + private boolean multiVersionStreamingPutsEnabled; private int gossipIntervalMs; private String failureDetectorImplementation; @@ -223,6 +224,10 @@ public class VoldemortConfig implements Serializable { private int maxParallelStoresRebalancing; private boolean rebalancingOptimization; private boolean usePartitionScanForRebalance; + private int maxProxyPutThreads; + @Deprecated + // Should be removed once the proxy put implementation is stable. + private boolean proxyPutsDuringRebalance; public VoldemortConfig(Properties props) { this(new Props(props)); @@ -349,6 +354,8 @@ public VoldemortConfig(Props props) { this.streamMaxReadBytesPerSec = props.getBytes("stream.read.byte.per.sec", 10 * 1000 * 1000); this.streamMaxWriteBytesPerSec = props.getBytes("stream.write.byte.per.sec", 10 * 1000 * 1000); + this.multiVersionStreamingPutsEnabled = props.getBoolean("use.multi.version.streaming.puts", + true); this.socketTimeoutMs = props.getInt("socket.timeout.ms", 5000); this.socketBufferSize = (int) props.getBytes("socket.buffer.size", 64 * 1024); @@ -462,6 +469,8 @@ public VoldemortConfig(Props props) { this.rebalancingOptimization = props.getBoolean("rebalancing.optimization", true); this.usePartitionScanForRebalance = props.getBoolean("use.partition.scan.for.rebalance", true); + this.maxProxyPutThreads = props.getInt("max.proxy.put.threads", 1); + this.proxyPutsDuringRebalance = props.getBoolean("proxy.puts.during.rebalance", true); this.failureDetectorImplementation = props.getString("failuredetector.implementation", FailureDetectorConfig.DEFAULT_IMPLEMENTATION_CLASS_NAME); @@ -1452,6 +1461,25 @@ public long getSlopMaxWriteBytesPerSec() { return slopMaxWriteBytesPerSec; } + /** + * If true, multiple successive versions of the same key, will be atomically + * written to storage in a single operation. Currently not supported for + * MySqlStorageEngine + * + *

+ * <ul> + * <li>Property : "use.multi.version.streaming.puts"</li> + * <li>Default : true</li> + * </ul>
+ * + */ + public void setMultiVersionStreamingPutsEnabled(boolean multiVersionStreamingPutsEnabled) { + this.multiVersionStreamingPutsEnabled = multiVersionStreamingPutsEnabled; + } + + public boolean getMultiVersionStreamingPutsEnabled() { + return this.multiVersionStreamingPutsEnabled; + } + /** * Controls the rate at which the {@link StreamingSlopPusherJob} will send * slop writes over the wire @@ -2664,6 +2692,40 @@ public boolean usePartitionScanForRebalance() { return usePartitionScanForRebalance; } + /** + * Total number of threads needed to issue proxy puts during rebalancing + * + *
+ * <ul>
+ * <li>Property : "max.proxy.put.threads"</li>
+ * <li>Default : 1</li>
+ * </ul>
+ */ + public void setMaxProxyPutThreads(int maxProxyPutThreads) { + this.maxProxyPutThreads = maxProxyPutThreads; + } + + public int getMaxProxyPutThreads() { + return this.maxProxyPutThreads; + } + + /** + * If set to true, the puts to the new replicas will be relayed back to the + * original donor nodes, such that they exist if rebalance were to abort in + * the middle for some reason. + * + *
+ * <ul>
+ * <li>Property : "proxy.puts.during.rebalance"</li>
+ * <li>Default : true</li>
+ * </ul>
+ */ + public void setProxyPutsDuringRebalance(boolean proxyPutsDuringRebalance) { + this.proxyPutsDuringRebalance = proxyPutsDuringRebalance; + } + + public boolean getProxyPutsDuringRebalance() { + return this.proxyPutsDuringRebalance; + } + /** * Enables fast, efficient range scans to be used for rebalancing * diff --git a/src/java/voldemort/server/http/gui/ReadOnlyStoreManagementServlet.java b/src/java/voldemort/server/http/gui/ReadOnlyStoreManagementServlet.java index d21094905f..a9bface915 100644 --- a/src/java/voldemort/server/http/gui/ReadOnlyStoreManagementServlet.java +++ b/src/java/voldemort/server/http/gui/ReadOnlyStoreManagementServlet.java @@ -196,7 +196,8 @@ private void doSwap(HttpServletRequest req, HttpServletResponse resp) throws IOE String storeName = getRequired(req, "store"); if(metadataStore != null - && !metadataStore.getServerState().equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { + && !metadataStore.getServerStateUnlocked() + .equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { throw new ServletException("Voldemort server not in normal state"); } diff --git a/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java b/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java index 109afc6276..db0313e8fb 100644 --- a/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/AdminServiceRequestHandler.java @@ -38,9 +38,11 @@ import voldemort.client.protocol.pb.VAdminProto; import voldemort.client.protocol.pb.VAdminProto.RebalancePartitionInfoMap; import voldemort.client.protocol.pb.VAdminProto.VoldemortAdminRequest; +import voldemort.client.protocol.pb.VProto.KeyedVersions; import voldemort.client.rebalance.RebalancePartitionsInfo; import voldemort.cluster.Cluster; import voldemort.common.nio.ByteBufferBackedInputStream; +import voldemort.routing.StoreRoutingPlan; import voldemort.server.StoreRepository; import voldemort.server.VoldemortConfig; import voldemort.server.protocol.RequestHandler; @@ -55,6 +57,7 @@ import voldemort.store.StoreOperationFailureException; import voldemort.store.backup.NativeBackupable; import voldemort.store.metadata.MetadataStore; +import voldemort.store.mysql.MysqlStorageEngine; import voldemort.store.readonly.FileFetcher; import voldemort.store.readonly.ReadOnlyStorageConfiguration; import voldemort.store.readonly.ReadOnlyStorageEngine; @@ -68,7 +71,6 @@ import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; import voldemort.utils.ReflectUtils; -import voldemort.utils.StoreInstance; import voldemort.utils.Utils; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.VectorClock; @@ -321,12 +323,14 @@ public VAdminProto.RebalanceStateChangeResponse handleRebalanceStateChange(VAdmi Cluster cluster = new ClusterMapper().readCluster(new StringReader(request.getClusterString())); + List storeDefs = new StoreDefinitionsMapper().readStoreList(new StringReader(request.getStoresString())); boolean swapRO = request.getSwapRo(); boolean changeClusterMetadata = request.getChangeClusterMetadata(); boolean changeRebalanceState = request.getChangeRebalanceState(); boolean rollback = request.getRollback(); rebalancer.rebalanceStateChange(cluster, + storeDefs, rebalancePartitionsInfo, swapRO, changeClusterMetadata, @@ -378,7 +382,7 @@ public VAdminProto.AsyncOperationStatusResponse handleRebalanceNode(VAdminProto. 
+ metadataStore.getNodeId()); // We should be in rebalancing state to run this function - if(!metadataStore.getServerState() + if(!metadataStore.getServerStateUnlocked() .equals(MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER)) { response.setError(ProtoUtils.encodeError(errorCodeMapper, new VoldemortException("Voldemort server " @@ -548,42 +552,57 @@ public StreamRequestHandler handleFetchPartitionEntries(VAdminProto.FetchPartiti if(fetchValues) { if(storageEngine.isPartitionScanSupported() && !fetchOrphaned) return new PartitionScanFetchEntriesRequestHandler(request, - metadataStore, - errorCodeMapper, - voldemortConfig, - storeRepository, - networkClassLoader); + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader); else return new FullScanFetchEntriesRequestHandler(request, - metadataStore, - errorCodeMapper, - voldemortConfig, - storeRepository, - networkClassLoader); + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader); } else { if(storageEngine.isPartitionScanSupported() && !fetchOrphaned) return new PartitionScanFetchKeysRequestHandler(request, - metadataStore, - errorCodeMapper, - voldemortConfig, - storeRepository, - networkClassLoader); + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader); else return new FullScanFetchKeysRequestHandler(request, - metadataStore, - errorCodeMapper, - voldemortConfig, - storeRepository, - networkClassLoader); + metadataStore, + errorCodeMapper, + voldemortConfig, + storeRepository, + networkClassLoader); } } public StreamRequestHandler handleUpdatePartitionEntries(VAdminProto.UpdatePartitionEntriesRequest request) { - return new UpdatePartitionEntriesStreamRequestHandler(request, - errorCodeMapper, - voldemortConfig, - storeRepository, - networkClassLoader); + StorageEngine storageEngine = AdminServiceRequestHandler.getStorageEngine(storeRepository, + request.getStore()); + if(!voldemortConfig.getMultiVersionStreamingPutsEnabled() + || storageEngine instanceof MysqlStorageEngine) { + // TODO This check is ugly. Need some generic capability to check + // which storage engine supports which operations. 
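The instanceof special-case flagged by the TODO above could eventually become a capability query on the engine itself. A hypothetical sketch of that direction; nothing below exists in the codebase at the time of this patch, and the names are illustrative only:

    // Hypothetical capability interface suggested by the TODO; not part of this patch.
    interface CapabilityAwareStorageEngine {

        enum StorageCapability {
            MULTI_VERSION_PUT,
            PARTITION_SCAN
        }

        /** True if the engine natively supports the given operation. */
        boolean supports(StorageCapability capability);
    }

With such an interface, the dispatch below would test supports(MULTI_VERSION_PUT) instead of naming MysqlStorageEngine directly.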
+ return new UpdatePartitionEntriesStreamRequestHandler(request, + errorCodeMapper, + voldemortConfig, + storageEngine, + storeRepository, + networkClassLoader); + } else { + return new BufferedUpdatePartitionEntriesStreamRequestHandler(request, + errorCodeMapper, + voldemortConfig, + storageEngine, + storeRepository, + networkClassLoader); + } } public VAdminProto.AsyncOperationListResponse handleAsyncOperationList(VAdminProto.AsyncOperationListRequest request) { @@ -708,7 +727,8 @@ public VAdminProto.SwapStoreResponse handleSwapROStore(VAdminProto.SwapStoreRequ final String storeName = request.getStoreName(); VAdminProto.SwapStoreResponse.Builder response = VAdminProto.SwapStoreResponse.newBuilder(); - if(!metadataStore.getServerState().equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { + if(!metadataStore.getServerStateUnlocked() + .equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { response.setError(ProtoUtils.encodeError(errorCodeMapper, new VoldemortException("Voldemort server " + metadataStore.getNodeId() @@ -1088,12 +1108,12 @@ public VAdminProto.DeletePartitionEntriesResponse handleDeletePartitionEntries(V ByteArray key = entry.getFirst(); Versioned value = entry.getSecond(); throttler.maybeThrottle(key.length() + valueSize(value)); - if(StoreInstance.checkKeyBelongsToPartition(metadataStore.getNodeId(), - key.get(), - replicaToPartitionList, - request.hasInitialCluster() ? new ClusterMapper().readCluster(new StringReader(request.getInitialCluster())) - : metadataStore.getCluster(), - metadataStore.getStoreDef(storeName)) + if(StoreRoutingPlan.checkKeyBelongsToPartition(metadataStore.getNodeId(), + key.get(), + replicaToPartitionList, + request.hasInitialCluster() ? new ClusterMapper().readCluster(new StringReader(request.getInitialCluster())) + : metadataStore.getCluster(), + metadataStore.getStoreDef(storeName)) && filter.accept(key, value)) { if(storageEngine.delete(key, value.getVersion())) { deleteSuccess++; @@ -1124,23 +1144,32 @@ public VAdminProto.DeletePartitionEntriesResponse handleDeletePartitionEntries(V public VAdminProto.UpdateMetadataResponse handleUpdateMetadata(VAdminProto.UpdateMetadataRequest request) { VAdminProto.UpdateMetadataResponse.Builder response = VAdminProto.UpdateMetadataResponse.newBuilder(); + metadataStore.writeLock.lock(); try { - ByteArray key = ProtoUtils.decodeBytes(request.getKey()); - String keyString = ByteUtils.getString(key.get(), "UTF-8"); - if(MetadataStore.METADATA_KEYS.contains(keyString)) { - Versioned versionedValue = ProtoUtils.decodeVersioned(request.getVersioned()); - logger.info("Updating metadata for key '" + keyString + "'"); - metadataStore.put(new ByteArray(ByteUtils.getBytes(keyString, "UTF-8")), - versionedValue, - null); - logger.info("Successfully updated metadata for key '" + keyString + "'"); + for(KeyedVersions keyValue: request.getMetadataEntryList()) { + + try { + ByteArray key = ProtoUtils.decodeBytes(keyValue.getKey()); + String keyString = ByteUtils.getString(key.get(), "UTF-8"); + if(MetadataStore.METADATA_KEYS.contains(keyString)) { + Versioned versionedValue = ProtoUtils.decodeVersionedMetadataKeyValue(keyValue); + logger.info("Updating metadata for key '" + keyString + "'"); + metadataStore.put(new ByteArray(ByteUtils.getBytes(keyString, "UTF-8")), + versionedValue, + null); + logger.info("Successfully updated metadata for key '" + keyString + "'"); + } + } catch(VoldemortException e) { + response.setError(ProtoUtils.encodeError(errorCodeMapper, e)); + logger.error("handleUpdateMetadata failed for 
request(" + request.toString() + + ")", e); + } } - } catch(VoldemortException e) { - response.setError(ProtoUtils.encodeError(errorCodeMapper, e)); - logger.error("handleUpdateMetadata failed for request(" + request.toString() + ")", e); - } - return response.build(); + return response.build(); + } finally { + metadataStore.writeLock.unlock(); + } } public VAdminProto.GetMetadataResponse handleGetMetadata(VAdminProto.GetMetadataRequest request) { @@ -1190,7 +1219,8 @@ public VAdminProto.DeleteStoreResponse handleDeleteStore(VAdminProto.DeleteStore VAdminProto.DeleteStoreResponse.Builder response = VAdminProto.DeleteStoreResponse.newBuilder(); // don't try to delete a store in the middle of rebalancing - if(!metadataStore.getServerState().equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { + if(!metadataStore.getServerStateUnlocked() + .equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { response.setError(ProtoUtils.encodeError(errorCodeMapper, new VoldemortException("Voldemort server is not in normal state"))); return response.build(); @@ -1267,7 +1297,8 @@ public VAdminProto.AddStoreResponse handleAddStore(VAdminProto.AddStoreRequest r VAdminProto.AddStoreResponse.Builder response = VAdminProto.AddStoreResponse.newBuilder(); // don't try to add a store when not in normal state - if(!metadataStore.getServerState().equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { + if(!metadataStore.getServerStateUnlocked() + .equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { response.setError(ProtoUtils.encodeError(errorCodeMapper, new VoldemortException("Voldemort server is not in normal state"))); return response.build(); diff --git a/src/java/voldemort/server/protocol/admin/AsyncOperationService.java b/src/java/voldemort/server/protocol/admin/AsyncOperationService.java index de0ad16603..fda2f186bf 100644 --- a/src/java/voldemort/server/protocol/admin/AsyncOperationService.java +++ b/src/java/voldemort/server/protocol/admin/AsyncOperationService.java @@ -1,12 +1,12 @@ /* * Copyright 2008-2010 LinkedIn, Inc - * + * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the @@ -16,10 +16,13 @@ package voldemort.server.protocol.admin; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; -import com.google.common.collect.ImmutableSet; import org.apache.log4j.Logger; import voldemort.VoldemortException; @@ -29,11 +32,14 @@ import voldemort.common.service.SchedulerService; import voldemort.common.service.ServiceType; +import com.google.common.collect.ImmutableSet; + /** * Asynchronous job scheduler for admin service operations. - * - * TODO: requesting a unique id, then creating an operation with that id seems like a bad API design. - * + * + * TODO: requesting a unique id, then creating an operation with that id seems + * like a bad API design. 
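The API wart called out in the TODO is the two-step dance a caller must perform: reserve an id, then build the operation around that same id and hand both back to the service. A sketch of the pattern; the submitOperation signature and the AsyncOperation override set are assumptions based on typical usage, not shown in this hunk:

    import voldemort.server.protocol.admin.AsyncOperation;
    import voldemort.server.protocol.admin.AsyncOperationService;

    public class AsyncOpExample {

        // Reserve an id first, then hand the service an operation built around it.
        static int submit(AsyncOperationService service) {
            final int requestId = service.getUniqueRequestId();
            service.submitOperation(requestId, new AsyncOperation(requestId, "example-job") {

                @Override
                public void operate() throws Exception {
                    // long-running work goes here
                }

                @Override
                public void stop() {
                    // cooperative cancellation hook
                }
            });
            return requestId;
        }
    }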
+ * */ @JmxManaged(description = "Asynchronous operation execution") public class AsyncOperationService extends AbstractService { @@ -96,6 +102,19 @@ public String getStatus(int id) { } } + public List getMatchingAsyncOperationList(String jobDescPattern, boolean showCompleted) { + List operationIds = getAsyncOperationList(showCompleted); + List matchingOperationIds = new ArrayList(operationIds.size()); + for(Integer operationId: operationIds) { + AsyncOperation operation = operations.get(operationId); + String operationDescription = operation.getStatus().getDescription(); + if(operationDescription != null && operationDescription.indexOf(jobDescPattern) != -1) { + matchingOperationIds.add(operationId); + } + } + return matchingOperationIds; + } + @JmxOperation(description = "Retrieve all operations") public String getAllAsyncOperations() { String result; @@ -108,23 +127,25 @@ public String getAllAsyncOperations() { } /** - * Get list of asynchronous operations on this node. By default, only the pending - * operations are returned. + * Get list of asynchronous operations on this node. By default, only the + * pending operations are returned. + * * @param showCompleted Show completed operations * @return A list of operation ids. */ public List getAsyncOperationList(boolean showCompleted) { /** - * Create a copy using an immutable set to avoid a {@link java.util.ConcurrentModificationException} + * Create a copy using an immutable set to avoid a + * {@link java.util.ConcurrentModificationException} */ Set keySet = ImmutableSet.copyOf(operations.keySet()); - if (showCompleted) + if(showCompleted) return new ArrayList(keySet); List keyList = new ArrayList(); - for (int key: keySet) { - if (!operations.get(key).getStatus().isComplete()) + for(int key: keySet) { + if(!operations.get(key).getStatus().isComplete()) keyList.add(key); } return keyList; @@ -142,7 +163,7 @@ public AsyncOperationStatus getOperationStatus(int requestId) { public String stopAsyncOperation(int requestId) { try { stopOperation(requestId); - } catch (VoldemortException e) { + } catch(VoldemortException e) { return e.getMessage(); } @@ -158,6 +179,7 @@ public void stopOperation(int requestId) { /** * Generate a unique request id + * * @return A new, guaranteed unique, request id */ public int getUniqueRequestId() { diff --git a/src/java/voldemort/server/protocol/admin/BufferedUpdatePartitionEntriesStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/BufferedUpdatePartitionEntriesStreamRequestHandler.java new file mode 100644 index 0000000000..11f97871af --- /dev/null +++ b/src/java/voldemort/server/protocol/admin/BufferedUpdatePartitionEntriesStreamRequestHandler.java @@ -0,0 +1,207 @@ +package voldemort.server.protocol.admin; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.EOFException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import voldemort.VoldemortException; +import voldemort.client.protocol.pb.ProtoUtils; +import voldemort.client.protocol.pb.VAdminProto; +import voldemort.client.protocol.pb.VAdminProto.UpdatePartitionEntriesRequest; +import voldemort.server.StoreRepository; +import voldemort.server.VoldemortConfig; +import voldemort.store.ErrorCodeMapper; +import voldemort.store.StorageEngine; +import voldemort.store.StoreUtils; +import voldemort.store.stats.StreamingStats.Operation; +import voldemort.utils.ByteArray; +import voldemort.utils.ByteUtils; +import voldemort.utils.NetworkClassLoader; +import 
voldemort.utils.Utils; +import voldemort.versioning.Versioned; + +/** + * The buffering is so that if we the stream contains multiple versions for the + * same key, then we would want the storage to be updated with all the versions + * atomically, to make sure client does not read a partial set of versions at + * any point + * + */ +class BufferedUpdatePartitionEntriesStreamRequestHandler extends + UpdatePartitionEntriesStreamRequestHandler { + + private static final int VALS_BUFFER_EXPECTED_SIZE = 5; + /** + * Current key being buffered. + */ + private ByteArray currBufferedKey; + + private List> currBufferedVals; + + public BufferedUpdatePartitionEntriesStreamRequestHandler(UpdatePartitionEntriesRequest request, + ErrorCodeMapper errorCodeMapper, + VoldemortConfig voldemortConfig, + StorageEngine storageEngine, + StoreRepository storeRepository, + NetworkClassLoader networkClassLoader) { + super(request, + errorCodeMapper, + voldemortConfig, + storageEngine, + storeRepository, + networkClassLoader); + currBufferedKey = null; + currBufferedVals = new ArrayList>(VALS_BUFFER_EXPECTED_SIZE); + } + + @Override + protected void finalize() { + super.finalize(); + /* + * Also check if we have any pending values being buffered. if so, flush + * to storage. + */ + writeBufferedValsToStorageIfAny(); + } + + /** + * Persists the current set of versions buffered for the current key into + * storage, using the multiVersionPut api + * + * NOTE: Now, it could be that the stream broke off and has more pending + * versions. For now, we simply commit what we have to disk. A better design + * would rely on in-stream markers to do the flushing to storage. + */ + private void writeBufferedValsToStorage() { + long startNs = System.nanoTime(); + + List> obsoleteVals = storageEngine.multiVersionPut(currBufferedKey, + currBufferedVals); + currBufferedVals = new ArrayList>(VALS_BUFFER_EXPECTED_SIZE); + if(streamStats != null) { + streamStats.reportStorageTime(Operation.UPDATE_ENTRIES, + Utils.elapsedTimeNs(startNs, System.nanoTime())); + streamStats.reportStreamingPut(Operation.UPDATE_ENTRIES); + } + + if(logger.isTraceEnabled()) + logger.trace("updateEntries (Streaming multi-version-put) successful"); + + // log Obsolete versions in debug mode + if(logger.isDebugEnabled() && obsoleteVals.size() > 0) { + logger.debug("updateEntries (Streaming multi-version-put) rejected these versions as obsolete : " + + StoreUtils.getVersions(obsoleteVals) + " for key " + currBufferedKey); + } + + // log progress + counter++; + if(0 == counter % STAT_RECORDS_INTERVAL) { + long totalTime = (System.currentTimeMillis() - startTime) / 1000; + + logger.info("Update entries updated " + counter + " entries for store '" + + storageEngine.getName() + "' in " + totalTime + " s"); + } + + // throttling + int totalValueSize = 0; + for(Versioned value: currBufferedVals) { + totalValueSize += AdminServiceRequestHandler.valueSize(value); + } + throttler.maybeThrottle(currBufferedKey.length() + totalValueSize); + } + + private void writeBufferedValsToStorageIfAny() { + if(currBufferedVals.size() > 0) { + writeBufferedValsToStorage(); + } + } + + @Override + public StreamRequestHandlerState handleRequest(DataInputStream inputStream, + DataOutputStream outputStream) + throws IOException { + long startNs = System.nanoTime(); + if(request == null) { + int size = 0; + try { + size = inputStream.readInt(); + } catch(EOFException e) { + if(logger.isTraceEnabled()) + logger.trace("Incomplete read for message size"); + if(streamStats != null) + 
streamStats.reportNetworkTime(Operation.UPDATE_ENTRIES, + Utils.elapsedTimeNs(startNs, System.nanoTime())); + return StreamRequestHandlerState.INCOMPLETE_READ; + } + + if(size == -1) { + long totalTime = (System.currentTimeMillis() - startTime) / 1000; + logger.info("Update entries successfully updated " + counter + + " entries for store '" + storageEngine.getName() + "' in " + + totalTime + " s"); + // Write the last buffered key to storage + writeBufferedValsToStorage(); + if(logger.isTraceEnabled()) + logger.trace("Message size -1, completed partition update"); + if(streamStats != null) + streamStats.reportNetworkTime(Operation.UPDATE_ENTRIES, + Utils.elapsedTimeNs(startNs, System.nanoTime())); + return StreamRequestHandlerState.COMPLETE; + } + + if(logger.isTraceEnabled()) + logger.trace("UpdatePartitionEntriesRequest message size: " + size); + + byte[] input = new byte[size]; + + try { + ByteUtils.read(inputStream, input); + } catch(EOFException e) { + if(logger.isTraceEnabled()) + logger.trace("Incomplete read for message"); + + return StreamRequestHandlerState.INCOMPLETE_READ; + } finally { + if(streamStats != null) + streamStats.reportNetworkTime(Operation.UPDATE_ENTRIES, + Utils.elapsedTimeNs(startNs, System.nanoTime())); + } + + VAdminProto.UpdatePartitionEntriesRequest.Builder builder = VAdminProto.UpdatePartitionEntriesRequest.newBuilder(); + builder.mergeFrom(input); + request = builder.build(); + } + + VAdminProto.PartitionEntry partitionEntry = request.getPartitionEntry(); + ByteArray key = ProtoUtils.decodeBytes(partitionEntry.getKey()); + Versioned value = ProtoUtils.decodeVersioned(partitionEntry.getVersioned()); + + if(filter.accept(key, value)) { + // Check if the current key is same as the one before. + if(currBufferedKey != null && !key.equals(currBufferedKey)) { + // if not, write buffered values for the previous key to storage + writeBufferedValsToStorage(); + } + currBufferedKey = key; + currBufferedVals.add(value); + } + + request = null; + return StreamRequestHandlerState.READING; + } + + @Override + public void close(DataOutputStream outputStream) throws IOException { + writeBufferedValsToStorageIfAny(); + super.close(outputStream); + } + + @Override + public void handleError(DataOutputStream outputStream, VoldemortException e) throws IOException { + writeBufferedValsToStorageIfAny(); + super.handleError(outputStream, e); + } +} diff --git a/src/java/voldemort/server/protocol/admin/FetchStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/FetchStreamRequestHandler.java index 4bb19ce66b..41e6b8651d 100644 --- a/src/java/voldemort/server/protocol/admin/FetchStreamRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/FetchStreamRequestHandler.java @@ -30,6 +30,7 @@ import voldemort.client.protocol.pb.ProtoUtils; import voldemort.client.protocol.pb.VAdminProto; import voldemort.cluster.Cluster; +import voldemort.routing.StoreRoutingPlan; import voldemort.server.StoreRepository; import voldemort.server.VoldemortConfig; import voldemort.server.protocol.StreamRequestHandler; @@ -42,7 +43,6 @@ import voldemort.utils.ByteArray; import voldemort.utils.EventThrottler; import voldemort.utils.NetworkClassLoader; -import voldemort.utils.StoreInstance; import voldemort.utils.Time; import voldemort.xml.ClusterMapper; @@ -90,7 +90,7 @@ public abstract class FetchStreamRequestHandler implements StreamRequestHandler protected boolean fetchOrphaned; - protected final StoreInstance storeInstance; + protected final StoreRoutingPlan storeInstance; 
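The StoreInstance to StoreRoutingPlan rename threads through the rest of this patch; the class is used both as an instance (as in the field just above) and through its static helpers. A minimal sketch of the two call styles, with cluster, storeDef, key and nodeId standing in for values normally pulled from the metadata store:

    import voldemort.cluster.Cluster;
    import voldemort.routing.StoreRoutingPlan;
    import voldemort.store.StoreDefinition;

    public class RoutingPlanExample {

        static boolean keyLivesOnNode(Cluster cluster, StoreDefinition storeDef,
                                      byte[] key, int nodeId) {
            // Instance form: built once per fetch, as in FetchStreamRequestHandler above
            StoreRoutingPlan plan = new StoreRoutingPlan(cluster, storeDef);

            // Static form: one-off membership checks, as in the full-scan handlers
            return StoreRoutingPlan.checkKeyBelongsToNode(key, nodeId, cluster, storeDef);
        }
    }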
protected FetchStreamRequestHandler(VAdminProto.FetchPartitionEntriesRequest request, MetadataStore metadataStore, @@ -119,7 +119,7 @@ protected FetchStreamRequestHandler(VAdminProto.FetchPartitionEntriesRequest req } else { this.initialCluster = metadataStore.getCluster(); } - this.storeInstance = new StoreInstance(this.initialCluster, this.storeDef); + this.storeInstance = new StoreRoutingPlan(this.initialCluster, this.storeDef); this.throttler = new EventThrottler(voldemortConfig.getStreamMaxReadBytesPerSec()); if(request.hasFilter()) { diff --git a/src/java/voldemort/server/protocol/admin/FullScanFetchStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/FullScanFetchStreamRequestHandler.java index 7d63e1baab..0fbbbf77d2 100644 --- a/src/java/voldemort/server/protocol/admin/FullScanFetchStreamRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/FullScanFetchStreamRequestHandler.java @@ -23,6 +23,7 @@ import java.util.Set; import voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest; +import voldemort.routing.StoreRoutingPlan; import voldemort.server.StoreRepository; import voldemort.server.VoldemortConfig; import voldemort.store.ErrorCodeMapper; @@ -31,7 +32,6 @@ import voldemort.utils.ByteArray; import voldemort.utils.ClosableIterator; import voldemort.utils.NetworkClassLoader; -import voldemort.utils.StoreInstance; import voldemort.utils.Utils; /** @@ -102,7 +102,7 @@ private Integer getKeyPartitionId(byte[] key) { * @return true iff key is needed. */ protected boolean isKeyNeeded(byte[] key) { - if(!StoreInstance.checkKeyBelongsToPartition(nodeId, + if(!StoreRoutingPlan.checkKeyBelongsToPartition(nodeId, key, replicaToPartitionList, initialCluster, @@ -134,7 +134,7 @@ protected boolean isItemAccepted(byte[] key) { entryAccepted = true; } } else { - if(!StoreInstance.checkKeyBelongsToNode(key, nodeId, initialCluster, storeDef)) { + if(!StoreRoutingPlan.checkKeyBelongsToNode(key, nodeId, initialCluster, storeDef)) { entryAccepted = true; } } diff --git a/src/java/voldemort/server/protocol/admin/PartitionScanFetchEntriesRequestHandler.java b/src/java/voldemort/server/protocol/admin/PartitionScanFetchEntriesRequestHandler.java index 0a1ff69d7b..b496f51574 100644 --- a/src/java/voldemort/server/protocol/admin/PartitionScanFetchEntriesRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/PartitionScanFetchEntriesRequestHandler.java @@ -23,6 +23,7 @@ import voldemort.client.protocol.pb.ProtoUtils; import voldemort.client.protocol.pb.VAdminProto; import voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest; +import voldemort.routing.StoreRoutingPlan; import voldemort.server.StoreRepository; import voldemort.server.VoldemortConfig; import voldemort.store.ErrorCodeMapper; @@ -32,7 +33,6 @@ import voldemort.utils.ClosableIterator; import voldemort.utils.NetworkClassLoader; import voldemort.utils.Pair; -import voldemort.utils.StoreInstance; import voldemort.versioning.Versioned; import com.google.protobuf.Message; @@ -86,7 +86,7 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, // Check the current node contains the partition as the // requested replicatype if(!fetchedPartitions.contains(currentPartition) - && StoreInstance.checkPartitionBelongsToNode(currentPartition, + && StoreRoutingPlan.checkPartitionBelongsToNode(currentPartition, currentReplicaType, nodeId, initialCluster, diff --git a/src/java/voldemort/server/protocol/admin/PartitionScanFetchKeysRequestHandler.java 
b/src/java/voldemort/server/protocol/admin/PartitionScanFetchKeysRequestHandler.java index 351335b4fc..afdd7f789f 100644 --- a/src/java/voldemort/server/protocol/admin/PartitionScanFetchKeysRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/PartitionScanFetchKeysRequestHandler.java @@ -23,6 +23,7 @@ import voldemort.client.protocol.pb.ProtoUtils; import voldemort.client.protocol.pb.VAdminProto; import voldemort.client.protocol.pb.VAdminProto.FetchPartitionEntriesRequest; +import voldemort.routing.StoreRoutingPlan; import voldemort.server.StoreRepository; import voldemort.server.VoldemortConfig; import voldemort.store.ErrorCodeMapper; @@ -31,7 +32,6 @@ import voldemort.utils.ByteArray; import voldemort.utils.ClosableIterator; import voldemort.utils.NetworkClassLoader; -import voldemort.utils.StoreInstance; import com.google.protobuf.Message; @@ -84,7 +84,7 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, // Check the current node contains the partition as the // requested replicatype if(!fetchedPartitions.contains(currentPartition) - && StoreInstance.checkPartitionBelongsToNode(currentPartition, + && StoreRoutingPlan.checkPartitionBelongsToNode(currentPartition, currentReplicaType, nodeId, initialCluster, diff --git a/src/java/voldemort/server/protocol/admin/UpdatePartitionEntriesStreamRequestHandler.java b/src/java/voldemort/server/protocol/admin/UpdatePartitionEntriesStreamRequestHandler.java index 53ae284fdd..c8f22085c6 100644 --- a/src/java/voldemort/server/protocol/admin/UpdatePartitionEntriesStreamRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/UpdatePartitionEntriesStreamRequestHandler.java @@ -36,38 +36,38 @@ public class UpdatePartitionEntriesStreamRequestHandler implements StreamRequestHandler { - private VAdminProto.UpdatePartitionEntriesRequest request; + protected VAdminProto.UpdatePartitionEntriesRequest request; - private final VAdminProto.UpdatePartitionEntriesResponse.Builder responseBuilder = VAdminProto.UpdatePartitionEntriesResponse.newBuilder(); + protected final VAdminProto.UpdatePartitionEntriesResponse.Builder responseBuilder = VAdminProto.UpdatePartitionEntriesResponse.newBuilder(); - private final ErrorCodeMapper errorCodeMapper; + protected final ErrorCodeMapper errorCodeMapper; - private final EventThrottler throttler; + protected final EventThrottler throttler; - private final VoldemortFilter filter; + protected final VoldemortFilter filter; - private final StorageEngine storageEngine; + protected final StorageEngine storageEngine; - private int counter; + protected int counter; - private final long startTime; + protected final long startTime; - private final StreamingStats streamStats; + protected final StreamingStats streamStats; - private final Logger logger = Logger.getLogger(getClass()); + protected final Logger logger = Logger.getLogger(getClass()); - private AtomicBoolean isBatchWriteOff; + protected AtomicBoolean isBatchWriteOff; public UpdatePartitionEntriesStreamRequestHandler(UpdatePartitionEntriesRequest request, ErrorCodeMapper errorCodeMapper, VoldemortConfig voldemortConfig, + StorageEngine storageEngine, StoreRepository storeRepository, NetworkClassLoader networkClassLoader) { super(); this.request = request; this.errorCodeMapper = errorCodeMapper; - storageEngine = AdminServiceRequestHandler.getStorageEngine(storeRepository, - request.getStore()); + this.storageEngine = storageEngine; throttler = new EventThrottler(voldemortConfig.getStreamMaxReadBytesPerSec()); filter = 
(request.hasFilter()) ? AdminServiceRequestHandler.getFilterFromRequest(request.getFilter(), voldemortConfig, @@ -92,13 +92,13 @@ protected void finalize() { storageEngine.endBatchModifications(); } + @Override public StreamRequestHandlerState handleRequest(DataInputStream inputStream, DataOutputStream outputStream) throws IOException { long startNs = System.nanoTime(); if(request == null) { int size = 0; - try { size = inputStream.readInt(); } catch(EOFException e) { @@ -187,16 +187,19 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, return StreamRequestHandlerState.READING; } + @Override public StreamRequestDirection getDirection() { return StreamRequestDirection.READING; } + @Override public void close(DataOutputStream outputStream) throws IOException { ProtoUtils.writeMessage(outputStream, responseBuilder.build()); storageEngine.endBatchModifications(); isBatchWriteOff.compareAndSet(false, true); } + @Override public void handleError(DataOutputStream outputStream, VoldemortException e) throws IOException { responseBuilder.setError(ProtoUtils.encodeError(errorCodeMapper, e)); if(logger.isEnabledFor(Level.ERROR)) diff --git a/src/java/voldemort/server/protocol/admin/UpdateSlopEntriesRequestHandler.java b/src/java/voldemort/server/protocol/admin/UpdateSlopEntriesRequestHandler.java index fc7fa45ba6..3b07d265c8 100644 --- a/src/java/voldemort/server/protocol/admin/UpdateSlopEntriesRequestHandler.java +++ b/src/java/voldemort/server/protocol/admin/UpdateSlopEntriesRequestHandler.java @@ -145,7 +145,8 @@ public StreamRequestHandlerState handleRequest(DataInputStream inputStream, streamStats.reportStorageTime(Operation.SLOP_UPDATE, System.nanoTime() - startNs); if(logger.isTraceEnabled()) - logger.trace("updateSlopEntries (Streaming put) successful"); + logger.trace("updateSlopEntries (Streaming put) successful on key:" + key + + " of store: " + request.getStore()); } catch(ObsoleteVersionException e) { // log and ignore if(logger.isDebugEnabled()) diff --git a/src/java/voldemort/server/rebalance/Rebalancer.java b/src/java/voldemort/server/rebalance/Rebalancer.java index 95c6e8df90..c3cb4f221e 100644 --- a/src/java/voldemort/server/rebalance/Rebalancer.java +++ b/src/java/voldemort/server/rebalance/Rebalancer.java @@ -118,63 +118,116 @@ public synchronized void releaseRebalancingPermit(int nodeId) { * *
      * | swapRO | changeClusterMetadata | changeRebalanceState | Order |
-     * | f | t | t | cluster -> rebalance | 
+     * | f | t | t | rebalance -> cluster |
      * | f | f | t | rebalance |
      * | t | t | f | cluster -> swap |
-     * | t | t | t | cluster -> swap -> rebalance |
+     * | t | t | t | rebalance -> cluster -> swap |
      * 
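The ordering captured in the table above is what the admin handler drives; with this patch the call also carries the store definitions so that cluster.xml and stores.xml can be switched together. A sketch of the new invocation, as wired up in AdminServiceRequestHandler.handleRebalanceStateChange (variable names follow that hunk; cluster and storeDefs are decoded from the protobuf request):

    // New seven-argument form: stores.xml travels with cluster.xml.
    rebalancer.rebalanceStateChange(cluster,                  // target cluster metadata
                                    storeDefs,                // target store definitions (new)
                                    rebalancePartitionsInfo,  // per-stealer movement plans
                                    swapRO,                   // swap read-only data
                                    changeClusterMetadata,    // write cluster.xml and stores.xml
                                    changeRebalanceState,     // update rebalancing server state
                                    rollback);                // true when undoing a failed batch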
* * In general we need to do [ cluster change -> swap -> rebalance state * change ] * + * NOTE: The update of the cluster metadata and the rebalancer state is not + * "atomic". Ergo, there could theoretically be a race where a client picks + * up new cluster metadata sends a request based on that, but the proxy + * bridges have not been setup and we either miss a proxy put or return a + * null for get/getalls + * + * TODO:refactor The rollback logic here is too convoluted. Specifically, + * the independent updates to each key could be split up into their own + * methods. + * * @param cluster Cluster metadata to change * @param rebalancePartitionsInfo List of rebalance partitions info * @param swapRO Boolean to indicate swapping of RO store - * @param changeClusterMetadata Boolean to indicate a change of cluster - * metadata + * @param changeClusterAndStoresMetadata Boolean to indicate a change of + * cluster metadata * @param changeRebalanceState Boolean to indicate a change in rebalance * state * @param rollback Boolean to indicate that we are rolling back or not */ public void rebalanceStateChange(Cluster cluster, + List storeDefs, List rebalancePartitionsInfo, boolean swapRO, - boolean changeClusterMetadata, + boolean changeClusterAndStoresMetadata, boolean changeRebalanceState, boolean rollback) { Cluster currentCluster = metadataStore.getCluster(); + List currentStoreDefs = metadataStore.getStoreDefList(); - logger.info("Doing rebalance state change with options [ cluster metadata change - " - + changeClusterMetadata + " ], [ changing rebalancing state - " + logger.info("Server doing rebalance state change with options [ cluster metadata change - " + + changeClusterAndStoresMetadata + " ], [ changing rebalancing state - " + changeRebalanceState + " ], [ changing swapping RO - " + swapRO + " ], [ rollback - " + rollback + " ]"); // Variables to track what has completed List completedRebalancePartitionsInfo = Lists.newArrayList(); List swappedStoreNames = Lists.newArrayList(); - boolean completedClusterChange = false; + boolean completedClusterAndStoresChange = false; + boolean completedRebalanceSourceClusterChange = false; + Cluster previousRebalancingSourceCluster = null; + List previousRebalancingSourceStores = null; try { - // CHANGE CLUSTER METADATA - if(changeClusterMetadata) { - changeCluster(cluster); - completedClusterChange = true; - } - // SWAP RO DATA FOR ALL STORES - if(swapRO) { - swapROStores(swappedStoreNames, false); - } + /* + * Do the rebalancing state changes. It is important that this + * happens before the actual cluster metadata is changed. Here's + * what could happen otherwise. When a batch completes with + * {current_cluster c2, rebalancing_source_cluster c1} and the next + * rebalancing state changes it to {current_cluster c3, + * rebalancing_source_cluster c2} is set for the next batch, then + * there could be a window during which the state is + * {current_cluster c3, rebalancing_source_cluster c1}. On the other + * hand, when we update the rebalancing source cluster first, there + * is a window where the state is {current_cluster c2, + * rebalancing_source_cluster c2}, which still fine, because of the + * following. Successful completion of a batch means the cluster is + * finalized, so its okay to stop proxying based on {current_cluster + * c2, rebalancing_source_cluster c1}. And since the cluster + * metadata has not yet been updated to c3, the writes will happen + * based on c2. 
+ * + * + * Even if some clients have already seen the {current_cluster c3, + * rebalancing_source_cluster c2} state from other servers, the + * operation will be rejected with InvalidMetadataException since + * this server itself is not aware of C3 + */ // CHANGE REBALANCING STATE if(changeRebalanceState) { try { + previousRebalancingSourceCluster = metadataStore.getRebalancingSourceCluster(); + previousRebalancingSourceStores = metadataStore.getRebalancingSourceStores(); if(!rollback) { + + // Save up the current cluster and stores def for + // Redirecting store + changeClusterAndStores(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, + currentCluster, + // Save the current store defs + // for Redirecting store + MetadataStore.REBALANCING_SOURCE_STORES_XML, + currentStoreDefs); + + completedRebalanceSourceClusterChange = true; + for(RebalancePartitionsInfo info: rebalancePartitionsInfo) { metadataStore.addRebalancingState(info); completedRebalancePartitionsInfo.add(info); } } else { + // Reset the rebalancing source cluster back to null + + changeClusterAndStores(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, null, + // Reset the rebalancing source stores back to null + MetadataStore.REBALANCING_SOURCE_STORES_XML, + null); + + completedRebalanceSourceClusterChange = true; + for(RebalancePartitionsInfo info: rebalancePartitionsInfo) { metadataStore.deleteRebalancingState(info); completedRebalancePartitionsInfo.add(info); @@ -184,17 +237,42 @@ public void rebalanceStateChange(Cluster cluster, throw new VoldemortException(e); } } + + // CHANGE CLUSTER METADATA AND STORE METADATA + if(changeClusterAndStoresMetadata) { + logger.info("Switching cluster metadata from " + currentCluster + " to " + cluster); + logger.info("Switching stores metadata from " + currentStoreDefs + " to " + + storeDefs); + changeClusterAndStores(MetadataStore.CLUSTER_KEY, + cluster, + MetadataStore.STORES_KEY, + storeDefs); + + completedClusterAndStoresChange = true; + + } + + // SWAP RO DATA FOR ALL STORES + if(swapRO) { + swapROStores(swappedStoreNames, false); + } + } catch(VoldemortException e) { logger.error("Got exception while changing state, now rolling back changes", e); - // ROLLBACK CLUSTER CHANGE - if(completedClusterChange) { + // ROLLBACK CLUSTER AND STORES CHANGE + if(completedClusterAndStoresChange) { try { - changeCluster(currentCluster); + logger.info("Rolling back cluster.xml to " + currentCluster); + logger.info("Rolling back stores.xml to " + currentStoreDefs); + changeClusterAndStores(MetadataStore.CLUSTER_KEY, + currentCluster, + MetadataStore.STORES_KEY, + currentStoreDefs); } catch(Exception exception) { - logger.error("Error while rolling back cluster metadata to " + currentCluster, - exception); + logger.error("Error while rolling back cluster metadata to " + currentCluster + + " Stores metadata to " + currentStoreDefs, exception); } } @@ -209,7 +287,6 @@ public void rebalanceStateChange(Cluster cluster, // CHANGE BACK ALL REBALANCING STATES FOR COMPLETED ONES if(completedRebalancePartitionsInfo.size() > 0) { - if(!rollback) { for(RebalancePartitionsInfo info: completedRebalancePartitionsInfo) { try { @@ -234,6 +311,19 @@ public void rebalanceStateChange(Cluster cluster, } + // Revert changes to REBALANCING_SOURCE_CLUSTER_XML and + // REBALANCING_SOURCE_STORES_XML + if(completedRebalanceSourceClusterChange) { + logger.info("Reverting the REBALANCING_SOURCE_CLUSTER_XML back to " + + previousRebalancingSourceCluster); + logger.info("Reverting the REBALANCING_SOURCE_STORES_XML back to " + + 
previousRebalancingSourceStores); + changeClusterAndStores(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, + previousRebalancingSourceCluster, + MetadataStore.REBALANCING_SOURCE_STORES_XML, + previousRebalancingSourceStores); + } + throw e; } @@ -283,29 +373,38 @@ private void swapROStores(List swappedStoreNames, boolean useSwappedStor } /** - * Updates the cluster metadata + * Updates the cluster and store metadata atomically + * + * This is required during rebalance and expansion into a new zone since we + * have to update the store def along with the cluster def. * * @param cluster The cluster metadata information + * @param storeDefs The stores metadata information */ - private void changeCluster(final Cluster cluster) { + private void changeClusterAndStores(String clusterKey, + final Cluster cluster, + String storesKey, + final List storeDefs) { + metadataStore.writeLock.lock(); try { - metadataStore.writeLock.lock(); - try { - VectorClock updatedVectorClock = ((VectorClock) metadataStore.get(MetadataStore.CLUSTER_KEY, - null) - .get(0) - .getVersion()).incremented(0, - System.currentTimeMillis()); - logger.info("Switching metadata from " + metadataStore.getCluster() + " to " - + cluster + " [ " + updatedVectorClock + " ]"); - metadataStore.put(MetadataStore.CLUSTER_KEY, - Versioned.value((Object) cluster, updatedVectorClock)); - } finally { - metadataStore.writeLock.unlock(); - } + VectorClock updatedVectorClock = ((VectorClock) metadataStore.get(clusterKey, null) + .get(0) + .getVersion()).incremented(metadataStore.getNodeId(), + System.currentTimeMillis()); + metadataStore.put(clusterKey, Versioned.value((Object) cluster, updatedVectorClock)); + + // now put new stores + updatedVectorClock = ((VectorClock) metadataStore.get(storesKey, null) + .get(0) + .getVersion()).incremented(metadataStore.getNodeId(), + System.currentTimeMillis()); + metadataStore.put(storesKey, Versioned.value((Object) storeDefs, updatedVectorClock)); + } catch(Exception e) { - logger.info("Error while changing cluster to " + cluster); + logger.info("Error while changing cluster to " + cluster + "for key " + clusterKey); throw new VoldemortException(e); + } finally { + metadataStore.writeLock.unlock(); } } diff --git a/src/java/voldemort/server/rebalance/RebalancerState.java b/src/java/voldemort/server/rebalance/RebalancerState.java index 6eb62d2ea7..5aa5588eb1 100644 --- a/src/java/voldemort/server/rebalance/RebalancerState.java +++ b/src/java/voldemort/server/rebalance/RebalancerState.java @@ -23,10 +23,10 @@ import java.util.Map; import voldemort.client.rebalance.RebalancePartitionsInfo; +import voldemort.routing.StoreRoutingPlan; import voldemort.serialization.json.JsonReader; import voldemort.serialization.json.JsonWriter; import voldemort.store.metadata.MetadataStore; -import voldemort.utils.StoreInstance; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -103,7 +103,7 @@ public RebalancePartitionsInfo find(String storeName, // If yes, check if the key belongs to one of the partitions // being moved - if(StoreInstance.checkKeyBelongsToPartition(keyPartitions, + if(StoreRoutingPlan.checkKeyBelongsToPartition(keyPartitions, nodePartitions, info.getReplicaToAddPartitionList(storeName))) { return info; diff --git a/src/java/voldemort/server/rebalance/async/DonorBasedRebalanceAsyncOperation.java b/src/java/voldemort/server/rebalance/async/DonorBasedRebalanceAsyncOperation.java index 3d94150ce7..c29be1396a 100644 --- 
a/src/java/voldemort/server/rebalance/async/DonorBasedRebalanceAsyncOperation.java +++ b/src/java/voldemort/server/rebalance/async/DonorBasedRebalanceAsyncOperation.java @@ -38,6 +38,7 @@ import voldemort.client.protocol.admin.AdminClient; import voldemort.client.rebalance.RebalancePartitionsInfo; import voldemort.cluster.Cluster; +import voldemort.routing.StoreRoutingPlan; import voldemort.server.StoreRepository; import voldemort.server.VoldemortConfig; import voldemort.server.rebalance.Rebalancer; @@ -51,7 +52,6 @@ import voldemort.utils.ClosableIterator; import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; -import voldemort.utils.StoreInstance; import voldemort.versioning.Versioned; import com.google.common.collect.HashMultimap; @@ -330,7 +330,7 @@ private void fetchEntriesForStealers(StorageEngine st while(running.get() && keys.hasNext()) { ByteArray key = keys.next(); scanned++; - List nodeIds = StoreInstance.checkKeyBelongsToPartition(key.get(), + List nodeIds = StoreRoutingPlan.checkKeyBelongsToPartition(key.get(), optimizedStealerNodeToMappingTuples, targetCluster, storeDef); @@ -378,7 +378,7 @@ private void fetchEntriesForStealersPartitionScan(StorageEngine value = entry.getSecond(); scanned++; - List nodeIds = StoreInstance.checkKeyBelongsToPartition(key.get(), + List nodeIds = StoreRoutingPlan.checkKeyBelongsToPartition(key.get(), optimizedStealerNodeToMappingTuples, targetCluster, storeDef); diff --git a/src/java/voldemort/server/scheduler/slop/BlockingSlopPusherJob.java b/src/java/voldemort/server/scheduler/slop/BlockingSlopPusherJob.java index 774fb94b9d..47b1cc5d9f 100644 --- a/src/java/voldemort/server/scheduler/slop/BlockingSlopPusherJob.java +++ b/src/java/voldemort/server/scheduler/slop/BlockingSlopPusherJob.java @@ -83,7 +83,7 @@ public BlockingSlopPusherJob(StoreRepository storeRepo, public void run() { // don't try to run slop pusher job when rebalancing - if(metadataStore.getServerState() + if(metadataStore.getServerStateUnlocked() .equals(MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER)) { logger.error("Cannot run slop pusher job since Voldemort server is rebalancing"); return; diff --git a/src/java/voldemort/server/scheduler/slop/StreamingSlopPusherJob.java b/src/java/voldemort/server/scheduler/slop/StreamingSlopPusherJob.java index b8cbd1804a..b1d636d6c4 100644 --- a/src/java/voldemort/server/scheduler/slop/StreamingSlopPusherJob.java +++ b/src/java/voldemort/server/scheduler/slop/StreamingSlopPusherJob.java @@ -115,7 +115,7 @@ public void run() { loadMetadata(); // don't try to run slop pusher job when rebalancing - if(metadataStore.getServerState() + if(metadataStore.getServerStateUnlocked() .equals(MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER)) { logger.error("Cannot run slop pusher job since Voldemort server is rebalancing"); return; @@ -196,7 +196,8 @@ public void run() { if(logger.isTraceEnabled()) logger.trace("Pushing slop for " + versioned.getValue().getNodeId() - + " and store " + versioned.getValue().getStoreName()); + + " and store " + versioned.getValue().getStoreName() + + " of key: " + versioned.getValue().getKey()); if(failureDetector.isAvailable(node)) { SynchronousQueue> slopQueue = slopQueues.get(nodeId); diff --git a/src/java/voldemort/server/storage/RepairJob.java b/src/java/voldemort/server/storage/RepairJob.java index 6544593729..00a0ea919a 100644 --- a/src/java/voldemort/server/storage/RepairJob.java +++ b/src/java/voldemort/server/storage/RepairJob.java @@ -64,7 +64,8 @@ public void startRepairJob() { public 
void run() { // don't try to run slop pusher job when rebalancing - if(!metadataStore.getServerState().equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { + if(!metadataStore.getServerStateUnlocked() + .equals(MetadataStore.VoldemortState.NORMAL_SERVER)) { logger.error("Cannot run repair job since Voldemort server is not in normal state"); return; } @@ -97,8 +98,7 @@ public void run() { long repairSlops = 0L; long numDeletedKeys = 0; while(iterator.hasNext()) { - Pair> keyAndVal; - keyAndVal = iterator.next(); + Pair> keyAndVal = iterator.next(); List nodes = routingStrategy.routeRequest(keyAndVal.getFirst().get()); if(!hasDestination(nodes)) { @@ -111,7 +111,8 @@ public void run() { } closeIterator(iterator); localStats.put(storeDef.getName(), repairSlops); - logger.info("Completed store " + storeDef.getName()); + logger.info("Completed store " + storeDef.getName() + " #Scanned:" + + progress.get() + " #Deleted:" + numDeletedKeys); } } } catch(Exception e) { diff --git a/src/java/voldemort/server/storage/StorageService.java b/src/java/voldemort/server/storage/StorageService.java index 0fe606b502..7fb0e076c9 100644 --- a/src/java/voldemort/server/storage/StorageService.java +++ b/src/java/voldemort/server/storage/StorageService.java @@ -77,6 +77,7 @@ import voldemort.store.nonblockingstore.NonblockingStore; import voldemort.store.readonly.ReadOnlyStorageConfiguration; import voldemort.store.readonly.ReadOnlyStorageEngine; +import voldemort.store.rebalancing.ProxyPutStats; import voldemort.store.rebalancing.RebootstrappingStore; import voldemort.store.rebalancing.RedirectingStore; import voldemort.store.retention.RetentionEnforcingStore; @@ -96,6 +97,7 @@ import voldemort.utils.ByteArray; import voldemort.utils.ClosableIterator; import voldemort.utils.ConfigurationException; +import voldemort.utils.DaemonThreadFactory; import voldemort.utils.DynamicThrottleLimit; import voldemort.utils.EventThrottler; import voldemort.utils.JmxUtils; @@ -137,6 +139,8 @@ public class StorageService extends AbstractService { private final FailureDetector failureDetector; private final StoreStats storeStats; private final RoutedStoreFactory routedStoreFactory; + private final ExecutorService proxyPutWorkerPool; + private final ProxyPutStats aggregatedProxyPutStats; public StorageService(StoreRepository storeRepository, MetadataStore metadata, @@ -179,6 +183,17 @@ public StorageService(StoreRepository storeRepository, this.dynThrottleLimit = new DynamicThrottleLimit(rate); } else this.dynThrottleLimit = null; + + // create the proxy put thread pool + this.proxyPutWorkerPool = Executors.newFixedThreadPool(config.getMaxProxyPutThreads(), + new DaemonThreadFactory("voldemort-proxy-put-thread")); + this.aggregatedProxyPutStats = new ProxyPutStats(null); + if(config.isJmxEnabled()) { + JmxUtils.registerMbean(this.aggregatedProxyPutStats, + JmxUtils.createObjectName("voldemort.store.rebalancing", + "aggregate-proxy-puts")); + } + } private void initStorageConfig(String configClassName) { @@ -752,11 +767,21 @@ public void registerEngine(StorageEngine engine, } if(voldemortConfig.isEnableRebalanceService()) { + ProxyPutStats proxyPutStats = new ProxyPutStats(aggregatedProxyPutStats); + if(voldemortConfig.isJmxEnabled()) { + JmxUtils.registerMbean(proxyPutStats, + JmxUtils.createObjectName("voldemort.store.rebalancing", + engine.getName() + + "-proxy-puts")); + } store = new RedirectingStore(store, metadata, storeRepository, failureDetector, - storeFactory); + storeFactory, + 
voldemortConfig.getProxyPutsDuringRebalance(), + proxyPutWorkerPool, + proxyPutStats); if(voldemortConfig.isJmxEnabled()) { MBeanServer mbeanServer = ManagementFactory.getPlatformMBeanServer(); ObjectName name = null; @@ -999,6 +1024,8 @@ protected void stopInner() { logger.info("Closed client threadpool."); + storeFactory.close(); + if(this.failureDetector != null) { try { this.failureDetector.destroy(); @@ -1006,9 +1033,18 @@ protected void stopInner() { lastException = e; } } - logger.info("Closed failure detector."); + // shut down the proxy put thread pool + this.proxyPutWorkerPool.shutdown(); + try { + if(!this.proxyPutWorkerPool.awaitTermination(10, TimeUnit.SECONDS)) + this.proxyPutWorkerPool.shutdownNow(); + } catch(InterruptedException e) { + this.proxyPutWorkerPool.shutdownNow(); + } + logger.info("Closed proxy put thread pool."); + /* If there is an exception, throw it */ if(lastException instanceof VoldemortException) throw (VoldemortException) lastException; diff --git a/src/java/voldemort/store/AbstractStorageEngine.java b/src/java/voldemort/store/AbstractStorageEngine.java index 16819c8507..4ba9aefcf9 100644 --- a/src/java/voldemort/store/AbstractStorageEngine.java +++ b/src/java/voldemort/store/AbstractStorageEngine.java @@ -1,7 +1,12 @@ package voldemort.store; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + import voldemort.utils.ClosableIterator; import voldemort.utils.Pair; +import voldemort.versioning.Occurred; import voldemort.versioning.Versioned; public class AbstractStorageEngine extends AbstractStore implements @@ -49,9 +54,54 @@ public boolean beginBatchModifications() { return false; } + @Override + public List> multiVersionPut(K key, List> values) { + throw new UnsupportedOperationException("multiVersionPut is not supported for " + + this.getClass().getName()); + } + @Override public boolean endBatchModifications() { return false; } + /** + * Computes the final list of versions to be stored, on top of what is + * currently being stored. 
Final list is valsInStorage modified in place + * + * + * @param valuesInStorage list of versions currently in storage + * @param multiPutValues list of new versions being written to storage + * @return list of versions from multiPutVals that were rejected as obsolete + */ + protected List> resolveAndConstructVersionsToPersist(List> valuesInStorage, + List> multiPutValues) { + List> obsoleteVals = new ArrayList>(multiPutValues.size()); + // Go over all the values and determine whether the version is + // acceptable + for(Versioned value: multiPutValues) { + Iterator> iter = valuesInStorage.iterator(); + boolean obsolete = false; + // Compare the current version with a set of accepted versions + while(iter.hasNext()) { + Versioned curr = iter.next(); + Occurred occurred = value.getVersion().compare(curr.getVersion()); + if(occurred == Occurred.BEFORE) { + obsolete = true; + break; + } else if(occurred == Occurred.AFTER) { + iter.remove(); + } + } + if(obsolete) { + // add to return value if obsolete + obsoleteVals.add(value); + } else { + // else update the set of accepted versions + valuesInStorage.add(value); + } + } + + return obsoleteVals; + } } diff --git a/src/java/voldemort/store/CompositeDeleteVoldemortRequest.java b/src/java/voldemort/store/CompositeDeleteVoldemortRequest.java index 08c70a7927..4a5e7a458d 100644 --- a/src/java/voldemort/store/CompositeDeleteVoldemortRequest.java +++ b/src/java/voldemort/store/CompositeDeleteVoldemortRequest.java @@ -1,11 +1,33 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.store; import voldemort.common.VoldemortOpCode; import voldemort.versioning.Version; +/** + * A class that defines a composite delete request containing the key to delete, + * corresponding version (if present in the incoming HTTP request) and the + * timeout + * + */ public class CompositeDeleteVoldemortRequest extends CompositeVoldemortRequest { - public CompositeDeleteVoldemortRequest(K key, Version version, long timeout) { - super(key, null, null, null, version, timeout, true, VoldemortOpCode.DELETE_OP_CODE); + public CompositeDeleteVoldemortRequest(K key, Version version, long timeoutInMs) { + super(key, null, null, null, version, timeoutInMs, true, VoldemortOpCode.DELETE_OP_CODE); } } diff --git a/src/java/voldemort/store/CompositeGetAllVoldemortRequest.java b/src/java/voldemort/store/CompositeGetAllVoldemortRequest.java index 2c548b9bbd..0999581c56 100644 --- a/src/java/voldemort/store/CompositeGetAllVoldemortRequest.java +++ b/src/java/voldemort/store/CompositeGetAllVoldemortRequest.java @@ -1,16 +1,40 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.store; import voldemort.common.VoldemortOpCode; +/** + * A class that defines a composite get all request containing a reference to + * the iterable keys, a flag to indicate if the conflicts should be resolved and + * the timeout + * + */ public class CompositeGetAllVoldemortRequest extends CompositeVoldemortRequest { - public CompositeGetAllVoldemortRequest(Iterable keys, long timeout, boolean resolveConflicts) { + public CompositeGetAllVoldemortRequest(Iterable keys, + long timeoutInMs, + boolean resolveConflicts) { super(null, null, keys, null, null, - timeout, + timeoutInMs, resolveConflicts, VoldemortOpCode.GET_ALL_OP_CODE); } diff --git a/src/java/voldemort/store/CompositeGetVoldemortRequest.java b/src/java/voldemort/store/CompositeGetVoldemortRequest.java index 3826d06760..8cc9f2613a 100644 --- a/src/java/voldemort/store/CompositeGetVoldemortRequest.java +++ b/src/java/voldemort/store/CompositeGetVoldemortRequest.java @@ -1,10 +1,39 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.store; import voldemort.common.VoldemortOpCode; +/** + * A class that defines a composite get request containing the key, a flag to + * indicate whether the conflicts should be resolved and the timeout + * + */ + public class CompositeGetVoldemortRequest extends CompositeVoldemortRequest { - public CompositeGetVoldemortRequest(K key, long timeout, boolean resolveConflicts) { - super(key, null, null, null, null, timeout, resolveConflicts, VoldemortOpCode.GET_OP_CODE); + public CompositeGetVoldemortRequest(K key, long timeoutInMs, boolean resolveConflicts) { + super(key, + null, + null, + null, + null, + timeoutInMs, + resolveConflicts, + VoldemortOpCode.GET_OP_CODE); } } diff --git a/src/java/voldemort/store/CompositePutVoldemortRequest.java b/src/java/voldemort/store/CompositePutVoldemortRequest.java index e187993390..723404fb67 100644 --- a/src/java/voldemort/store/CompositePutVoldemortRequest.java +++ b/src/java/voldemort/store/CompositePutVoldemortRequest.java @@ -1,10 +1,31 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.store; import voldemort.common.VoldemortOpCode; +/** + * A class that defines a composite put request containing the key, the value + * and the timeout + * + */ public class CompositePutVoldemortRequest extends CompositeVoldemortRequest { - public CompositePutVoldemortRequest(K key, V rawValue, long timeout) { - super(key, rawValue, null, null, null, timeout, true, VoldemortOpCode.PUT_OP_CODE); + public CompositePutVoldemortRequest(K key, V rawValue, long timeoutInMs) { + super(key, rawValue, null, null, null, timeoutInMs, true, VoldemortOpCode.PUT_OP_CODE); } } diff --git a/src/java/voldemort/store/CompositeVersionedPutVoldemortRequest.java b/src/java/voldemort/store/CompositeVersionedPutVoldemortRequest.java index cc96d6e2b3..8b04b3dc66 100644 --- a/src/java/voldemort/store/CompositeVersionedPutVoldemortRequest.java +++ b/src/java/voldemort/store/CompositeVersionedPutVoldemortRequest.java @@ -1,12 +1,34 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + package voldemort.store; import voldemort.common.VoldemortOpCode; import voldemort.versioning.Versioned; +/** + * A class that defines a composite put request containing the key, the + * versioned value and the timeout + * + */ + public class CompositeVersionedPutVoldemortRequest extends CompositeVoldemortRequest { - public CompositeVersionedPutVoldemortRequest(K key, Versioned value, long timeout) { - super(key, null, null, value, null, timeout, true, VoldemortOpCode.PUT_OP_CODE); + public CompositeVersionedPutVoldemortRequest(K key, Versioned value, long timeoutInMs) { + super(key, null, null, value, null, timeoutInMs, true, VoldemortOpCode.PUT_OP_CODE); } } diff --git a/src/java/voldemort/store/CompositeVoldemortRequest.java b/src/java/voldemort/store/CompositeVoldemortRequest.java index f8e834f0a2..822251e3f5 100644 --- a/src/java/voldemort/store/CompositeVoldemortRequest.java +++ b/src/java/voldemort/store/CompositeVoldemortRequest.java @@ -1,8 +1,28 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + package voldemort.store; import voldemort.versioning.Version; import voldemort.versioning.Versioned; +/** + * A base class for the different types of Voldemort requests containing all the + * necessary components + */ public class CompositeVoldemortRequest { private final K key; @@ -10,7 +30,7 @@ public class CompositeVoldemortRequest { private final Iterable getAllIterableKeys; private final Versioned value; private Version version; - private long routingTimeout; + private long routingTimeoutInMs; private final boolean resolveConflicts; private final byte operationType; @@ -19,13 +39,13 @@ public CompositeVoldemortRequest(K key, Iterable keys, Versioned value, Version version, - long timeout, + long timeoutInMs, boolean resolveConflicts, byte operationType) { this.key = key; this.rawValue = rawValue; this.getAllIterableKeys = keys; - this.routingTimeout = timeout; + this.routingTimeoutInMs = timeoutInMs; this.value = value; this.version = version; this.resolveConflicts = resolveConflicts; @@ -49,11 +69,11 @@ public void setVersion(Version version) { } public long getRoutingTimeoutInMs() { - return routingTimeout; + return routingTimeoutInMs; } - public void setRoutingTimeoutInMs(long timeout) { - this.routingTimeout = timeout; + public void setRoutingTimeoutInMs(long timeoutInMs) { + this.routingTimeoutInMs = timeoutInMs; } public boolean resolveConflicts() { diff --git a/src/java/voldemort/store/StorageEngine.java b/src/java/voldemort/store/StorageEngine.java index 4af3046e94..101476586e 100644 --- a/src/java/voldemort/store/StorageEngine.java +++ b/src/java/voldemort/store/StorageEngine.java @@ -16,6 +16,8 @@ package voldemort.store; +import java.util.List; + import voldemort.utils.ClosableIterator; import voldemort.utils.Pair; import voldemort.versioning.Versioned; @@ -120,6 +122,16 @@ public interface StorageEngine extends Store { */ public boolean beginBatchModifications(); + /** + * Atomically update storage with the list of versioned values for the given + * key, to improve storage efficiency. + * + * @param key Key to write + * @param values List of versioned values to be written atomically. 
+ * @return list of obsolete versions that were rejected + */ + public List> multiVersionPut(K key, List> values); + /** * * @return true if the storage engine successfully returned to normal mode diff --git a/src/java/voldemort/store/bdb/BdbStorageConfiguration.java b/src/java/voldemort/store/bdb/BdbStorageConfiguration.java index aeb58cd5ba..5819bb947a 100644 --- a/src/java/voldemort/store/bdb/BdbStorageConfiguration.java +++ b/src/java/voldemort/store/bdb/BdbStorageConfiguration.java @@ -39,6 +39,7 @@ import com.google.common.collect.Maps; import com.sleepycat.je.CacheMode; +import com.sleepycat.je.CheckpointConfig; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseConfig; import com.sleepycat.je.DatabaseException; @@ -304,7 +305,7 @@ public String getEnvStatsAsString(String storeName, boolean fast) throws Excepti /** * Forceful cleanup the logs */ - @JmxOperation(description = "Forceful start the cleaner threads") + @JmxOperation(description = "Forcefully invoke the log cleaning") public void cleanLogs() { synchronized(lock) { try { @@ -317,6 +318,24 @@ public void cleanLogs() { } } + /** + * Forceful checkpointing + */ + @JmxOperation(description = "Forcefully checkpoint all the environments") + public void checkPointAllEnvironments() { + synchronized(lock) { + try { + for(Environment environment: environments.values()) { + CheckpointConfig checkPointConfig = new CheckpointConfig(); + checkPointConfig.setForce(true); + environment.checkpoint(checkPointConfig); + } + } catch(DatabaseException e) { + throw new VoldemortException(e); + } + } + } + public void close() { synchronized(lock) { try { diff --git a/src/java/voldemort/store/bdb/BdbStorageEngine.java b/src/java/voldemort/store/bdb/BdbStorageEngine.java index ca3ade7cf9..2362b8e165 100644 --- a/src/java/voldemort/store/bdb/BdbStorageEngine.java +++ b/src/java/voldemort/store/bdb/BdbStorageEngine.java @@ -338,7 +338,7 @@ else if(occurred == Occurred.AFTER) } } else { // insert - vals = new ArrayList>(); + vals = new ArrayList>(1); } // update the new value @@ -659,6 +659,68 @@ public boolean beginBatchModifications() { return false; } + @Override + public List> multiVersionPut(ByteArray key, + final List> values) + throws PersistenceFailureException { + long startTimeNs = -1; + + if(logger.isTraceEnabled()) + startTimeNs = System.nanoTime(); + + StoreUtils.assertValidKey(key); + DatabaseEntry keyEntry = new DatabaseEntry(key.get()); + DatabaseEntry valueEntry = new DatabaseEntry(); + + boolean succeeded = false; + Transaction transaction = null; + List> valuesInStorage = null; + List> obsoleteVals = null; + + try { + transaction = environment.beginTransaction(null, null); + + // do a get for the existing values + OperationStatus status = getBdbDatabase().get(transaction, + keyEntry, + valueEntry, + LockMode.RMW); + if(OperationStatus.SUCCESS == status) { + // update + valuesInStorage = StoreBinaryFormat.fromByteArray(valueEntry.getData()); + } else { + // insert + valuesInStorage = new ArrayList>(values.size()); + } + + obsoleteVals = resolveAndConstructVersionsToPersist(valuesInStorage, values); + valueEntry.setData(StoreBinaryFormat.toByteArray(valuesInStorage)); + status = getBdbDatabase().put(transaction, keyEntry, valueEntry); + + if(status != OperationStatus.SUCCESS) + throw new PersistenceFailureException("multiVersionPut operation failed with status: " + + status); + succeeded = true; + + } catch(DatabaseException e) { + this.bdbEnvironmentStats.reportException(e); + logger.error(e); + throw new 
PersistenceFailureException(e); + } finally { + if(succeeded) + attemptCommit(transaction); + else + attemptAbort(transaction); + if(logger.isTraceEnabled()) { + logger.trace("Completed PUT (" + getName() + ") to key " + key + " (keyRef: " + + System.identityHashCode(key) + " values " + values + " in " + + (System.nanoTime() - startTimeNs) + " ns at " + + System.currentTimeMillis()); + } + } + return obsoleteVals; + } + @Override public boolean endBatchModifications() { if(checkpointerOffForBatchWrites) { diff --git a/src/java/voldemort/store/bdb/PartitionPrefixedBdbStorageEngine.java b/src/java/voldemort/store/bdb/PartitionPrefixedBdbStorageEngine.java index 3784775300..bb7c9ea340 100644 --- a/src/java/voldemort/store/bdb/PartitionPrefixedBdbStorageEngine.java +++ b/src/java/voldemort/store/bdb/PartitionPrefixedBdbStorageEngine.java @@ -121,6 +121,15 @@ public boolean delete(ByteArray key, Version version) throws PersistenceFailureE return super.delete(prefixedKey, version); } + @Override + public List> multiVersionPut(ByteArray key, List> values) { + StoreUtils.assertValidKey(key); + int partition = routingStrategy.getMasterPartition(key.get()); + ByteArray prefixedKey = new ByteArray(StoreBinaryFormat.makePrefixedKey(key.get(), + partition)); + return super.multiVersionPut(prefixedKey, values); + } + @Override protected Logger getLogger() { return logger; diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToNewDup.java b/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToNewDup.java index cc38f72021..2c178bbcd0 100644 --- a/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToNewDup.java +++ b/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToNewDup.java @@ -38,7 +38,7 @@ public void transfer() throws Exception { while(cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { scanCount++; if(scanCount % 1000000 == 0) - logger.info("Converted " + scanCount + "entries in " + logger.info("Converted " + scanCount + " entries in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); // read the value as a versioned Object @@ -80,7 +80,7 @@ public void transfer() throws Exception { } keyCount++; } - logger.info("Completed " + scanCount + "entries and " + keyCount + " keys in " + logger.info("Completed " + scanCount + " entries and " + keyCount + " keys in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToPidScan.java b/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToPidScan.java index 3ec76da48b..8a50b122ca 100644 --- a/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToPidScan.java +++ b/src/java/voldemort/store/bdb/dataconversion/BdbConvertBaseToPidScan.java @@ -42,7 +42,7 @@ public void transfer() throws Exception { while(cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { scanCount++; if(scanCount % 1000000 == 0) - logger.info("Converted " + scanCount + "entries in " + logger.info("Converted " + scanCount + " entries in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); // read the value as a versioned Object @@ -90,7 +90,7 @@ public void transfer() throws Exception { } keyCount++; } - logger.info("Completed " + scanCount + "entries and " + keyCount + " keys in " + logger.info("Completed " + scanCount + " entries and " + keyCount + " keys in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } diff --git 
a/src/java/voldemort/store/bdb/dataconversion/BdbConvertNewDupToPidScan.java b/src/java/voldemort/store/bdb/dataconversion/BdbConvertNewDupToPidScan.java index cfb21077af..954b523976 100644 --- a/src/java/voldemort/store/bdb/dataconversion/BdbConvertNewDupToPidScan.java +++ b/src/java/voldemort/store/bdb/dataconversion/BdbConvertNewDupToPidScan.java @@ -55,10 +55,10 @@ public void transfer() throws Exception { } if(scanCount % 1000000 == 0) - logger.info("Reverted " + scanCount + "entries in " + logger.info("Reverted " + scanCount + " entries in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } - logger.info("Converted " + scanCount + "entries and " + keyCount + " keys in " + logger.info("Converted " + scanCount + " entries and " + keyCount + " keys in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbRevertNewDupToBase.java b/src/java/voldemort/store/bdb/dataconversion/BdbRevertNewDupToBase.java index c755476ecc..5f323ec62c 100644 --- a/src/java/voldemort/store/bdb/dataconversion/BdbRevertNewDupToBase.java +++ b/src/java/voldemort/store/bdb/dataconversion/BdbRevertNewDupToBase.java @@ -52,10 +52,10 @@ public void transfer() throws Exception { scanCount++; } if(scanCount % 1000000 == 0) - logger.info("Reverted " + scanCount + "entries in " + logger.info("Reverted " + scanCount + " entries in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } - logger.info("Reverted " + scanCount + "entries and " + keyCount + " keys in " + logger.info("Reverted " + scanCount + " entries and " + keyCount + " keys in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToBase.java b/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToBase.java index c391899112..f026a3c350 100644 --- a/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToBase.java +++ b/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToBase.java @@ -54,10 +54,10 @@ public void transfer() throws Exception { scanCount++; } if(scanCount % 1000000 == 0) - logger.info("Reverted " + scanCount + "entries in " + logger.info("Reverted " + scanCount + " entries in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } - logger.info("Reverted " + scanCount + "entries and " + keyCount + " keys in " + logger.info("Reverted " + scanCount + " entries and " + keyCount + " keys in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } diff --git a/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToNewDup.java b/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToNewDup.java index 624abee439..ede0e168bf 100644 --- a/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToNewDup.java +++ b/src/java/voldemort/store/bdb/dataconversion/BdbRevertPidScanToNewDup.java @@ -47,10 +47,10 @@ public void transfer() throws Exception { } if(scanCount % 1000000 == 0) - logger.info("Reverted " + scanCount + "entries in " + logger.info("Reverted " + scanCount + " entries in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } - logger.info("Reverted " + scanCount + "entries and " + keyCount + " keys in " + logger.info("Reverted " + scanCount + " entries and " + keyCount + " keys in " + (System.currentTimeMillis() - startTime) / 1000 + " secs"); } diff --git a/src/java/voldemort/store/memory/InMemoryStorageEngine.java b/src/java/voldemort/store/memory/InMemoryStorageEngine.java index 
f53d0ef60b..5f9e027bea 100644 --- a/src/java/voldemort/store/memory/InMemoryStorageEngine.java +++ b/src/java/voldemort/store/memory/InMemoryStorageEngine.java @@ -24,6 +24,8 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import org.apache.log4j.Logger; + import voldemort.VoldemortException; import voldemort.annotations.concurrency.NotThreadsafe; import voldemort.store.AbstractStorageEngine; @@ -39,11 +41,13 @@ /** * A simple non-persistent, in-memory store. Useful for unit testing. * + * TODO Rewrite this class using striped locks for more granular locking. * */ public class InMemoryStorageEngine extends AbstractStorageEngine { - private final ConcurrentMap>> map; + private static final Logger logger = Logger.getLogger(InMemoryStorageEngine.class); + protected final ConcurrentMap>> map; public InMemoryStorageEngine(String name) { super(name); @@ -55,7 +59,7 @@ public InMemoryStorageEngine(String name, ConcurrentMap>> m this.map = Utils.notNull(map); } - public void deleteAll() { + public synchronized void deleteAll() { this.map.clear(); } @@ -64,35 +68,35 @@ public boolean delete(K key) { } @Override - public boolean delete(K key, Version version) { + public synchronized boolean delete(K key, Version version) { StoreUtils.assertValidKey(key); - if(version == null) - return map.remove(key) != null; - List> values = map.get(key); if(values == null) { return false; } - synchronized(values) { - boolean deletedSomething = false; - Iterator> iterator = values.iterator(); - while(iterator.hasNext()) { - Versioned item = iterator.next(); - if(item.getVersion().compare(version) == Occurred.BEFORE) { - iterator.remove(); - deletedSomething = true; - } - } - if(values.size() == 0) { - // If this remove fails, then another delete operation got - // there before this one - if(!map.remove(key, values)) - return false; - } - return deletedSomething; + if(version == null) { + map.remove(key); + return true; } + + boolean deletedSomething = false; + Iterator> iterator = values.iterator(); + while(iterator.hasNext()) { + Versioned item = iterator.next(); + if(item.getVersion().compare(version) == Occurred.BEFORE) { + iterator.remove(); + deletedSomething = true; + } + } + if(values.size() == 0) { + // if there are no more versions left, also remove the key from the + // map + map.remove(key); + } + + return deletedSomething; } @Override @@ -101,13 +105,12 @@ public List getVersions(K key) { } @Override - public List> get(K key, T transform) throws VoldemortException { + public synchronized List> get(K key, T transform) throws VoldemortException { StoreUtils.assertValidKey(key); List> results = map.get(key); if(results == null) { return new ArrayList>(0); - } - synchronized(results) { + } else { return new ArrayList>(results); } } @@ -120,48 +123,47 @@ public Map>> getAll(Iterable keys, Map transforms) } @Override - public void put(K key, Versioned value, T transforms) throws VoldemortException { + public synchronized void put(K key, Versioned value, T transforms) throws VoldemortException { StoreUtils.assertValidKey(key); - - Version version = value.getVersion(); - boolean success = false; - while(!success) { - List> items = map.get(key); - // If we have no value, optimistically try to add one - if(items == null) { - items = new ArrayList>(); - items.add(new Versioned(value.getValue(), version)); - success = map.putIfAbsent(key, items) == null; - } else { - synchronized(items) { - // if this check fails, items has been removed from the map - // by delete, so 
we try again. - if(map.get(key) != items) - continue; - - // Check for existing versions - remember which items to - // remove in case of success - List> itemsToRemove = new ArrayList>(items.size()); - for(Versioned versioned: items) { - Occurred occurred = value.getVersion().compare(versioned.getVersion()); - if(occurred == Occurred.BEFORE) { - throw new ObsoleteVersionException("Obsolete version for key '" + key - + "': " + value.getVersion()); - } else if(occurred == Occurred.AFTER) { - itemsToRemove.add(versioned); - } - } - items.removeAll(itemsToRemove); - items.add(value); - } - success = true; + List> items = map.get(key); + // If we have no value, add the current value + if(items == null) { + items = new ArrayList>(); + } + // Check for existing versions - remember which items to + // remove in case of success + List> itemsToRemove = new ArrayList>(items.size()); + for(Versioned versioned: items) { + Occurred occurred = value.getVersion().compare(versioned.getVersion()); + if(occurred == Occurred.BEFORE) { + throw new ObsoleteVersionException("Obsolete version for key '" + key + "': " + + value.getVersion()); + } else if(occurred == Occurred.AFTER) { + itemsToRemove.add(versioned); } } + items.removeAll(itemsToRemove); + items.add(value); + map.put(key, items); + } + + @Override + public synchronized List> multiVersionPut(K key, final List> values) { + StoreUtils.assertValidKey(key); + List> obsoleteVals = null; + List> valuesInStorage = null; + valuesInStorage = map.get(key); + if(valuesInStorage == null) { + valuesInStorage = new ArrayList>(values.size()); + } + obsoleteVals = resolveAndConstructVersionsToPersist(valuesInStorage, values); + map.put(key, valuesInStorage); + return obsoleteVals; } @Override public ClosableIterator>> entries() { - return new InMemoryIterator(map); + return new InMemoryIterator(map, this); } @Override @@ -181,7 +183,7 @@ public ClosableIterator keys(int partition) { } @Override - public void truncate() { + public synchronized void truncate() { map.clear(); } @@ -208,15 +210,23 @@ public String toString(int size) { return builder.toString(); } + /** + * This class relies on the concurrent hash map's iterator to return a + * weakly consistent view of the data in the map. 
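+ * Entries added or removed while iterating may or may not be reflected,
+ * and the iterator never throws ConcurrentModificationException.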
+ */ @NotThreadsafe - private static class InMemoryIterator implements ClosableIterator>> { + private static class InMemoryIterator implements + ClosableIterator>> { private final Iterator>>> iterator; private K currentKey; private Iterator> currentValues; + private InMemoryStorageEngine inMemoryStorageEngine; - public InMemoryIterator(ConcurrentMap>> map) { + public InMemoryIterator(ConcurrentMap>> map, + InMemoryStorageEngine inMemoryStorageEngine) { this.iterator = map.entrySet().iterator(); + this.inMemoryStorageEngine = inMemoryStorageEngine; } @Override @@ -244,7 +254,7 @@ public Pair> next() { Entry>> entry = iterator.next(); List> list = entry.getValue(); - synchronized(list) { + synchronized(this.inMemoryStorageEngine) { // okay we may have gotten an empty list, if so try // again if(list.size() == 0) @@ -269,6 +279,5 @@ public void remove() { public void close() { // nothing to do here } - } } diff --git a/src/java/voldemort/store/metadata/MetadataStore.java b/src/java/voldemort/store/metadata/MetadataStore.java index c6f1f1b46f..743a547634 100644 --- a/src/java/voldemort/store/metadata/MetadataStore.java +++ b/src/java/voldemort/store/metadata/MetadataStore.java @@ -78,6 +78,8 @@ public class MetadataStore extends AbstractStorageEngine GOSSIP_KEYS = ImmutableSet.of(CLUSTER_KEY, STORES_KEY); @@ -85,7 +87,9 @@ public class MetadataStore extends AbstractStorageEngine OPTIONAL_KEYS = ImmutableSet.of(SERVER_STATE_KEY, NODE_ID_KEY, - REBALANCING_STEAL_INFO); + REBALANCING_STEAL_INFO, + REBALANCING_SOURCE_CLUSTER_XML, + REBALANCING_SOURCE_STORES_XML); public static final Set METADATA_KEYS = ImmutableSet.builder() .addAll(REQUIRED_KEYS) @@ -167,25 +171,32 @@ public String getName() { * @param value */ @SuppressWarnings("unchecked") - public synchronized void put(String key, Versioned value) { - if(METADATA_KEYS.contains(key)) { + public void put(String key, Versioned value) { + // acquire write lock + writeLock.lock(); - // try inserting into inner store first - putInner(key, convertObjectToString(key, value)); + try { + if(METADATA_KEYS.contains(key)) { - // cache all keys if innerStore put succeeded - metadataCache.put(key, value); + // try inserting into inner store first + putInner(key, convertObjectToString(key, value)); - // do special stuff if needed - if(CLUSTER_KEY.equals(key)) { - updateRoutingStrategies((Cluster) value.getValue(), getStoreDefList()); - } else if(STORES_KEY.equals(key)) { - updateRoutingStrategies(getCluster(), (List) value.getValue()); - } else if(SYSTEM_STORES_KEY.equals(key)) - throw new VoldemortException("Cannot overwrite system store definitions"); + // cache all keys if innerStore put succeeded + metadataCache.put(key, value); - } else { - throw new VoldemortException("Unhandled Key:" + key + " for MetadataStore put()"); + // do special stuff if needed + if(CLUSTER_KEY.equals(key)) { + updateRoutingStrategies((Cluster) value.getValue(), getStoreDefList()); + } else if(STORES_KEY.equals(key)) { + updateRoutingStrategies(getCluster(), (List) value.getValue()); + } else if(SYSTEM_STORES_KEY.equals(key)) + throw new VoldemortException("Cannot overwrite system store definitions"); + + } else { + throw new VoldemortException("Unhandled Key:" + key + " for MetadataStore put()"); + } + } finally { + writeLock.unlock(); } } @@ -197,13 +208,19 @@ public synchronized void put(String key, Versioned value) { * @param value */ public void put(String key, Object value) { - if(METADATA_KEYS.contains(key)) { - VectorClock version = (VectorClock) get(key, 
null).get(0).getVersion(); - put(key, - new Versioned(value, version.incremented(getNodeId(), - System.currentTimeMillis()))); - } else { - throw new VoldemortException("Unhandled Key:" + key + " for MetadataStore put()"); + // acquire write lock + writeLock.lock(); + try { + if(METADATA_KEYS.contains(key)) { + VectorClock version = (VectorClock) get(key, null).get(0).getVersion(); + put(key, + new Versioned(value, version.incremented(getNodeId(), + System.currentTimeMillis()))); + } else { + throw new VoldemortException("Unhandled Key:" + key + " for MetadataStore put()"); + } + } finally { + writeLock.unlock(); } } @@ -216,16 +233,22 @@ public void put(String key, Object value) { * @throws VoldemortException */ @Override - public synchronized void put(ByteArray keyBytes, Versioned valueBytes, byte[] transforms) + public void put(ByteArray keyBytes, Versioned valueBytes, byte[] transforms) throws VoldemortException { - String key = ByteUtils.getString(keyBytes.get(), "UTF-8"); - Versioned value = new Versioned(ByteUtils.getString(valueBytes.getValue(), - "UTF-8"), - valueBytes.getVersion()); + // acquire write lock + writeLock.lock(); + try { + String key = ByteUtils.getString(keyBytes.get(), "UTF-8"); + Versioned value = new Versioned(ByteUtils.getString(valueBytes.getValue(), + "UTF-8"), + valueBytes.getVersion()); - Versioned valueObject = convertStringToObject(key, value); + Versioned valueObject = convertStringToObject(key, value); - this.put(key, valueObject); + this.put(key, valueObject); + } finally { + writeLock.unlock(); + } } @Override @@ -245,9 +268,16 @@ public Object getCapability(StoreCapabilityType capability) { * @throws VoldemortException */ @Override - public synchronized List> get(ByteArray keyBytes, byte[] transforms) + public List> get(ByteArray keyBytes, byte[] transforms) throws VoldemortException { + // acquire read lock + + readLock.lock(); try { + // get a read lock this prevents any sort of interleaving\ + // especially critical during reebalance when we set the new cluster + // and store xml + String key = ByteUtils.getString(keyBytes.get(), "UTF-8"); if(METADATA_KEYS.contains(key)) { @@ -272,69 +302,136 @@ public synchronized List> get(ByteArray keyBytes, byte[] trans + ByteUtils.getString(keyBytes.get(), "UTF-8") + " delete config/.temp config/.version directories and restart.", e); + } finally { + readLock.unlock(); } } public List> get(String key, String transforms) throws VoldemortException { - return get(new ByteArray(ByteUtils.getBytes(key, "UTF-8")), - transforms == null ? null : ByteUtils.getBytes(transforms, "UTF-8")); + // acquire read lock + readLock.lock(); + try { + return get(new ByteArray(ByteUtils.getBytes(key, "UTF-8")), + transforms == null ? 
null : ByteUtils.getBytes(transforms, "UTF-8")); + } finally { + readLock.unlock(); + } } @JmxOperation(description = "Clean all rebalancing server/cluster states from this node.", impact = MBeanOperationInfo.ACTION) - public synchronized void cleanAllRebalancingState() { - for(String key: OPTIONAL_KEYS) { - if(!key.equals(NODE_ID_KEY)) - innerStore.delete(key, - getVersions(new ByteArray(ByteUtils.getBytes(key, "UTF-8"))).get(0)); - } + public void cleanAllRebalancingState() { + // acquire write lock + writeLock.lock(); + try { + for(String key: OPTIONAL_KEYS) { + if(!key.equals(NODE_ID_KEY)) + innerStore.delete(key, + getVersions(new ByteArray(ByteUtils.getBytes(key, "UTF-8"))).get(0)); + } - init(getNodeId()); + init(getNodeId()); + } finally { + writeLock.unlock(); + } } @Override public List getVersions(ByteArray key) { - List> values = get(key, null); - List versions = new ArrayList(values.size()); - for(Versioned value: values) { - versions.add(value.getVersion()); + // acquire read lock + readLock.lock(); + try { + List> values = get(key, null); + List versions = new ArrayList(values.size()); + for(Versioned value: values) { + versions.add(value.getVersion()); + } + return versions; + } finally { + readLock.unlock(); } - return versions; } public Cluster getCluster() { - return (Cluster) metadataCache.get(CLUSTER_KEY).getValue(); + // acquire read lock + readLock.lock(); + try { + return (Cluster) metadataCache.get(CLUSTER_KEY).getValue(); + } finally { + readLock.unlock(); + + } } @SuppressWarnings("unchecked") public List getStoreDefList() { - return (List) metadataCache.get(STORES_KEY).getValue(); + // acquire read lock + readLock.lock(); + try { + return (List) metadataCache.get(STORES_KEY).getValue(); + } finally { + readLock.unlock(); + + } } @SuppressWarnings("unchecked") public List getSystemStoreDefList() { - return (List) metadataCache.get(SYSTEM_STORES_KEY).getValue(); + // acquire read lock + readLock.lock(); + try { + return (List) metadataCache.get(SYSTEM_STORES_KEY).getValue(); + } finally { + readLock.unlock(); + } } public int getNodeId() { - return (Integer) (metadataCache.get(NODE_ID_KEY).getValue()); + // acquire read lock + readLock.lock(); + try { + return (Integer) (metadataCache.get(NODE_ID_KEY).getValue()); + } finally { + readLock.unlock(); + } } public StoreDefinition getStoreDef(String storeName) { - List storeDefs = getStoreDefList(); - for(StoreDefinition storeDef: storeDefs) { - if(storeDef.getName().equals(storeName)) - return storeDef; + // acquire read lock + readLock.lock(); + try { + + List storeDefs = getStoreDefList(); + for(StoreDefinition storeDef: storeDefs) { + if(storeDef.getName().equals(storeName)) + return storeDef; + } + + throw new VoldemortException("Store " + storeName + " not found in MetadataStore"); + } finally { + readLock.unlock(); } + } - throw new VoldemortException("Store " + storeName + " not found in MetadataStore"); + public VoldemortState getServerStateLocked() { + // acquire read lock + readLock.lock(); + try { + return VoldemortState.valueOf(metadataCache.get(SERVER_STATE_KEY).getValue().toString()); + } finally { + readLock.unlock(); + + } } - public VoldemortState getServerState() { + public VoldemortState getServerStateUnlocked() { + return VoldemortState.valueOf(metadataCache.get(SERVER_STATE_KEY).getValue().toString()); + } public RebalancerState getRebalancerState() { + // acquire read lock readLock.lock(); try { return (RebalancerState) metadataCache.get(REBALANCING_STEAL_INFO).getValue(); @@ -343,21 +440,49 @@ 
public RebalancerState getRebalancerState() { } } + public Cluster getRebalancingSourceCluster() { + // acquire read lock + readLock.lock(); + try { + return (Cluster) metadataCache.get(REBALANCING_SOURCE_CLUSTER_XML).getValue(); + } finally { + readLock.unlock(); + } + } + + @SuppressWarnings("unchecked") + public List getRebalancingSourceStores() { + // acquire read lock + readLock.lock(); + try { + return (List) metadataCache.get(REBALANCING_SOURCE_STORES_XML) + .getValue(); + } finally { + readLock.unlock(); + } + } + /* * First check in the map of regular stores. If not present, check in the * system stores map. */ @SuppressWarnings("unchecked") public RoutingStrategy getRoutingStrategy(String storeName) { - Map routingStrategyMap = (Map) metadataCache.get(ROUTING_STRATEGY_KEY) - .getValue(); - RoutingStrategy strategy = routingStrategyMap.get(storeName); - if(strategy == null) { - Map systemRoutingStrategyMap = (Map) metadataCache.get(SYSTEM_ROUTING_STRATEGY_KEY) - .getValue(); - strategy = systemRoutingStrategyMap.get(storeName); + // acquire read lock + readLock.lock(); + try { + Map routingStrategyMap = (Map) metadataCache.get(ROUTING_STRATEGY_KEY) + .getValue(); + RoutingStrategy strategy = routingStrategyMap.get(storeName); + if(strategy == null) { + Map systemRoutingStrategyMap = (Map) metadataCache.get(SYSTEM_ROUTING_STRATEGY_KEY) + .getValue(); + strategy = systemRoutingStrategyMap.get(storeName); + } + return strategy; + } finally { + readLock.unlock(); } - return strategy; } /** @@ -382,33 +507,39 @@ private HashMap makeStoreDefinitionMap(List storeDefs) { - VectorClock clock = new VectorClock(); - if(metadataCache.containsKey(ROUTING_STRATEGY_KEY)) - clock = (VectorClock) metadataCache.get(ROUTING_STRATEGY_KEY).getVersion(); - - logger.info("Updating routing strategy for all stores"); - HashMap storeDefMap = makeStoreDefinitionMap(storeDefs); - HashMap routingStrategyMap = createRoutingStrategyMap(cluster, - storeDefMap); - this.metadataCache.put(ROUTING_STRATEGY_KEY, - new Versioned(routingStrategyMap, - clock.incremented(getNodeId(), - System.currentTimeMillis()))); - - for(String storeName: storeNameTolisteners.keySet()) { - RoutingStrategy updatedRoutingStrategy = routingStrategyMap.get(storeName); - if(updatedRoutingStrategy != null) { - try { - for(MetadataStoreListener listener: storeNameTolisteners.get(storeName)) { - listener.updateRoutingStrategy(updatedRoutingStrategy); - listener.updateStoreDefinition(storeDefMap.get(storeName)); + // acquire write lock + writeLock.lock(); + try { + VectorClock clock = new VectorClock(); + if(metadataCache.containsKey(ROUTING_STRATEGY_KEY)) + clock = (VectorClock) metadataCache.get(ROUTING_STRATEGY_KEY).getVersion(); + + logger.info("Updating routing strategy for all stores"); + HashMap storeDefMap = makeStoreDefinitionMap(storeDefs); + HashMap routingStrategyMap = createRoutingStrategyMap(cluster, + storeDefMap); + this.metadataCache.put(ROUTING_STRATEGY_KEY, + new Versioned(routingStrategyMap, + clock.incremented(getNodeId(), + System.currentTimeMillis()))); + + for(String storeName: storeNameTolisteners.keySet()) { + RoutingStrategy updatedRoutingStrategy = routingStrategyMap.get(storeName); + if(updatedRoutingStrategy != null) { + try { + for(MetadataStoreListener listener: storeNameTolisteners.get(storeName)) { + listener.updateRoutingStrategy(updatedRoutingStrategy); + listener.updateStoreDefinition(storeDefMap.get(storeName)); + } + } catch(Exception e) { + if(logger.isEnabledFor(Level.WARN)) + logger.warn(e, e); } - } 
catch(Exception e) { - if(logger.isEnabledFor(Level.WARN)) - logger.warn(e, e); } - } + } + } finally { + writeLock.unlock(); } } @@ -429,6 +560,7 @@ private void initSystemRoutingStrategies(Cluster cluster) { * @param stealInfo The steal information to add */ public void addRebalancingState(final RebalancePartitionsInfo stealInfo) { + // acquire write lock writeLock.lock(); try { // Move into rebalancing state @@ -460,6 +592,7 @@ public void addRebalancingState(final RebalancePartitionsInfo stealInfo) { * @param stealInfo The steal information to delete */ public void deleteRebalancingState(RebalancePartitionsInfo stealInfo) { + // acquire write lock writeLock.lock(); try { RebalancerState rebalancerState = getRebalancerState(); @@ -514,8 +647,14 @@ public boolean delete(ByteArray key, Version version) throws VoldemortException public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { - StoreUtils.assertValidKeys(keys); - return StoreUtils.getAll(this, keys, transforms); + // acquire read lock + readLock.lock(); + try { + StoreUtils.assertValidKeys(keys); + return StoreUtils.getAll(this, keys, transforms); + } finally { + readLock.unlock(); + } } /** @@ -524,6 +663,7 @@ public Map>> getAll(Iterable keys, private void init(int nodeId) { logger.info("metadata init()."); + writeLock.lock(); // Required keys initCache(CLUSTER_KEY); initCache(STORES_KEY); @@ -544,9 +684,13 @@ private void init(int nodeId) { initCache(REBALANCING_STEAL_INFO, new RebalancerState(new ArrayList())); initCache(SERVER_STATE_KEY, VoldemortState.NORMAL_SERVER.toString()); + initCache(REBALANCING_SOURCE_CLUSTER_XML, null); + initCache(REBALANCING_SOURCE_STORES_XML, null); // set transient values updateRoutingStrategies(getCluster(), getStoreDefList()); + + writeLock.unlock(); } private synchronized void initCache(String key) { @@ -594,7 +738,7 @@ private HashMap createRoutingStrategyMap(Cluster cluste */ @SuppressWarnings("unchecked") private Versioned convertObjectToString(String key, Versioned value) { - String valueStr = value.getValue().toString(); + String valueStr = ""; if(CLUSTER_KEY.equals(key)) { valueStr = clusterMapper.writeCluster((Cluster) value.getValue()); @@ -605,6 +749,14 @@ private Versioned convertObjectToString(String key, Versioned va valueStr = rebalancerState.toJsonString(); } else if(SERVER_STATE_KEY.equals(key) || NODE_ID_KEY.equals(key)) { valueStr = value.getValue().toString(); + } else if(REBALANCING_SOURCE_CLUSTER_XML.equals(key)) { + if(value.getValue() != null) { + valueStr = clusterMapper.writeCluster((Cluster) value.getValue()); + } + } else if(REBALANCING_SOURCE_STORES_XML.equals(key)) { + if(value.getValue() != null) { + valueStr = storeMapper.writeStoreList((List) value.getValue()); + } } else { throw new VoldemortException("Unhandled key:'" + key + "' for Object to String serialization."); @@ -641,6 +793,14 @@ private Versioned convertStringToObject(String key, Versioned va } else { valueObject = new RebalancerState(Arrays.asList(RebalancePartitionsInfo.create(valueString))); } + } else if(REBALANCING_SOURCE_CLUSTER_XML.equals(key)) { + if(value.getValue() != null && value.getValue().length() > 0) { + valueObject = clusterMapper.readCluster(new StringReader(value.getValue())); + } + } else if(REBALANCING_SOURCE_STORES_XML.equals(key)) { + if(value.getValue() != null && value.getValue().length() > 0) { + valueObject = storeMapper.readStoreList(new StringReader(value.getValue())); + } } else { throw new VoldemortException("Unhandled key:'" + key + "' for String to 
Object serialization."); diff --git a/src/java/voldemort/store/nonblockingstore/ThreadPoolBasedNonblockingStoreImpl.java b/src/java/voldemort/store/nonblockingstore/ThreadPoolBasedNonblockingStoreImpl.java index 1fc45756cd..e9bd28ea13 100644 --- a/src/java/voldemort/store/nonblockingstore/ThreadPoolBasedNonblockingStoreImpl.java +++ b/src/java/voldemort/store/nonblockingstore/ThreadPoolBasedNonblockingStoreImpl.java @@ -127,10 +127,8 @@ public void run() { try { Object result = request.request(innerStore); - if(callback != null) { - long diff = System.nanoTime() - start; - + long diff = Utils.elapsedTimeNs(start, System.nanoTime()); if(diff <= timeoutNs) { try { callback.requestComplete(result, diff / Time.NS_PER_MS); diff --git a/src/java/voldemort/store/rebalancing/AsyncProxyPutTask.java b/src/java/voldemort/store/rebalancing/AsyncProxyPutTask.java new file mode 100644 index 0000000000..46396037e3 --- /dev/null +++ b/src/java/voldemort/store/rebalancing/AsyncProxyPutTask.java @@ -0,0 +1,117 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.store.rebalancing; + +import org.apache.log4j.Logger; + +import voldemort.cluster.Node; +import voldemort.store.Store; +import voldemort.store.UnreachableStoreException; +import voldemort.store.metadata.MetadataStore; +import voldemort.utils.ByteArray; +import voldemort.utils.ByteUtils; +import voldemort.versioning.ObsoleteVersionException; +import voldemort.versioning.Versioned; + +/** + * Task that issues the proxy put against the old replica, based on the old + * cluster metadata. This is best effort async replication. Failures will be + * logged and the server log will be post processed in case the rebalancing + * fails and we move back to old topology + * + * NOTE : There is no need for any special ordering of the proxy puts in the + * async thread pool (although the threadpool will have a queue of pending proxy + * puts internally), since a later version being proxy put before an earlier + * version would simply result in an OVE for the earlier proxy put. 
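+ * (For example, if proxy puts for versions v1 < v2 of one key are queued
+ * out of order and v2 reaches the proxy node first, the late v1 put simply
+ * fails there with an ObsoleteVersionException, which is harmless.)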
Online + * traffic will not be affected since the proxy node is not a replica and hence + * no client will be reading from it (if we are at all wondering about read + * consistency) + * + */ +public class AsyncProxyPutTask implements Runnable { + + private final static Logger logger = Logger.getLogger(AsyncProxyPutTask.class); + + private final RedirectingStore redirectingStore; + private final ByteArray key; + private final Versioned value; + private final byte[] transforms; + private final int destinationNode; + private final MetadataStore metadata; + + AsyncProxyPutTask(RedirectingStore redirectingStore, + ByteArray key, + Versioned value, + byte[] transforms, + int destinationNode) { + this.key = key; + this.value = value; + this.transforms = transforms; + this.redirectingStore = redirectingStore; + this.destinationNode = destinationNode; + this.metadata = redirectingStore.getMetadataStore(); + } + + @Override + public void run() { + Node proxyNode = metadata.getCluster().getNodeById(destinationNode); + long startNs = System.nanoTime(); + try { + // TODO there are no retries now if the node we want to write to is + // unavailable + redirectingStore.checkNodeAvailable(proxyNode); + Store socketStore = redirectingStore.getRedirectingSocketStore(redirectingStore.getName(), + destinationNode); + + socketStore.put(key, value, transforms); + redirectingStore.recordSuccess(proxyNode, startNs); + redirectingStore.reportProxyPutSuccess(); + if(logger.isTraceEnabled()) { + logger.trace("Proxy write for store " + redirectingStore.getName() + " key " + + ByteUtils.toHexString(key.get()) + " to destinationNode:" + + destinationNode); + } + } catch(UnreachableStoreException e) { + redirectingStore.recordException(proxyNode, startNs, e); + logFailedProxyPutIfNeeded(e); + } catch(ObsoleteVersionException ove) { + /* + * Proxy puts can get an OVE if somehow there are two stealers for + * the same proxy node and the other stealer's proxy put already got + * tothe proxy node.. This will not result from online put winning, + * since we don't issue proxy puts if the proxy node is still a + * replica + */ + logFailedProxyPutIfNeeded(ove); + } catch(Exception e) { + // Just log the key.. Not sure having values in the log is a good + // idea. + logFailedProxyPutIfNeeded(e); + } + } + + private void logFailedProxyPutIfNeeded(Exception e) { + redirectingStore.reportProxyPutFailure(); + // only log OVE if trace debugging is on. + if(e instanceof ObsoleteVersionException && !logger.isTraceEnabled()) { + return; + } + logger.error("Exception in proxy put for proxyNode: " + destinationNode + " from node:" + + metadata.getNodeId() + " on key " + ByteUtils.toHexString(key.get()) + + " Version:" + value.getVersion(), e); + } +} diff --git a/src/java/voldemort/store/rebalancing/ProxyPutStats.java b/src/java/voldemort/store/rebalancing/ProxyPutStats.java new file mode 100644 index 0000000000..9f485d6c09 --- /dev/null +++ b/src/java/voldemort/store/rebalancing/ProxyPutStats.java @@ -0,0 +1,72 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package voldemort.store.rebalancing; + +import java.util.concurrent.atomic.AtomicLong; + +import voldemort.annotations.jmx.JmxGetter; + +/** + * Statistics on Proxy puts issued from the redirecting store + * + */ +public class ProxyPutStats { + + private AtomicLong numProxyPutFailures; + + private AtomicLong numPendingProxyPuts; + + private ProxyPutStats parent; + + public ProxyPutStats(ProxyPutStats parent) { + this.numPendingProxyPuts = new AtomicLong(); + this.numProxyPutFailures = new AtomicLong(); + this.parent = parent; + } + + public void reportProxyPutSubmission() { + this.numPendingProxyPuts.incrementAndGet(); + if(this.parent != null) { + this.parent.reportProxyPutSubmission(); + } + } + + public void reportProxyPutCompletion() { + this.numPendingProxyPuts.decrementAndGet(); + if(this.parent != null) { + this.parent.reportProxyPutCompletion(); + } + } + + public void reportProxyPutFailure() { + this.reportProxyPutCompletion(); + this.numProxyPutFailures.incrementAndGet(); + if(this.parent != null) { + this.parent.reportProxyPutFailure(); + } + } + + @JmxGetter(name = "numProxyPutFailures") + public long getNumProxyPutFailures() { + return numProxyPutFailures.get(); + } + + @JmxGetter(name = "numPendingProxyPuts") + public long getNumPendingProxyPuts() { + return numPendingProxyPuts.get(); + } +} diff --git a/src/java/voldemort/store/rebalancing/RedirectingStore.java b/src/java/voldemort/store/rebalancing/RedirectingStore.java index 73c2bc1871..dcb4cbe084 100644 --- a/src/java/voldemort/store/rebalancing/RedirectingStore.java +++ b/src/java/voldemort/store/rebalancing/RedirectingStore.java @@ -18,6 +18,7 @@ import java.util.List; import java.util.Map; +import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.log4j.Logger; @@ -26,19 +27,23 @@ import voldemort.annotations.jmx.JmxGetter; import voldemort.annotations.jmx.JmxSetter; import voldemort.client.protocol.RequestFormatType; -import voldemort.client.rebalance.RebalancePartitionsInfo; +import voldemort.cluster.Cluster; import voldemort.cluster.Node; import voldemort.cluster.failuredetector.FailureDetector; +import voldemort.routing.StoreRoutingPlan; import voldemort.server.RequestRoutingType; import voldemort.server.StoreRepository; import voldemort.store.DelegatingStore; import voldemort.store.Store; +import voldemort.store.StoreDefinition; import voldemort.store.StoreUtils; import voldemort.store.UnreachableStoreException; import voldemort.store.metadata.MetadataStore; import voldemort.store.metadata.MetadataStore.VoldemortState; +import voldemort.store.readonly.ReadOnlyStorageConfiguration; import voldemort.store.socket.SocketStoreFactory; import voldemort.utils.ByteArray; +import voldemort.utils.ByteUtils; import voldemort.utils.Time; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.Version; @@ -52,10 +57,28 @@ /** * The RedirectingStore extends {@link DelegatingStore} *

- * If current server_state is {@link VoldemortState#REBALANCING_MASTER_SERVER}
- * then before serving any client request do a remote get() call, put it locally - * ignoring any {@link ObsoleteVersionException} and then serve the client - * requests. This piece of code is run on the stealer nodes. + * If current server_state is {@link VoldemortState#REBALANCING_MASTER_SERVER} + * then handle incoming requests in the following way, if the key belongs to a + * partition that this server is currently stealing. Such a server has what we + * call a 'proxy node', which is the server which owned that partition as the + * exact same type of replica in the same zone, as per the old cluster topology. + * + * 1. getVersions/get
+ * If the server contains the key locally, then serve it directly. Else, fetch + * from proxy node, update local storage and then serve it off that. + * + * 2. getAll
+ * Similarly, for keys that exist locally, serve it off directly. Else, + * fetch-update the missing keys from proxyNode and then serve them off local + * storage. + * + * 3. put
+ * First write it to local storage, then submit a async put() to the proxy node, + * so we can safely abort the rebalancing if we have to. + * + * 4. delete
+ * :) :) :) + * */ public class RedirectingStore extends DelegatingStore { @@ -65,18 +88,29 @@ public class RedirectingStore extends DelegatingStore private final SocketStoreFactory storeFactory; private FailureDetector failureDetector; private AtomicBoolean isRedirectingStoreEnabled; + private boolean isProxyPutEnabled; + private final ExecutorService proxyPutWorkerPool; + + // statistics on proxy put tasks + private final ProxyPutStats proxyPutStats; public RedirectingStore(Store innerStore, MetadataStore metadata, StoreRepository storeRepository, FailureDetector detector, - SocketStoreFactory storeFactory) { + SocketStoreFactory storeFactory, + boolean isProxyPutEnabled, + ExecutorService proxyPutWorkerPool, + ProxyPutStats proxyPutStats) { super(innerStore); this.metadata = metadata; this.storeRepository = storeRepository; this.storeFactory = storeFactory; this.failureDetector = detector; this.isRedirectingStoreEnabled = new AtomicBoolean(true); + this.isProxyPutEnabled = isProxyPutEnabled; + this.proxyPutWorkerPool = proxyPutWorkerPool; + this.proxyPutStats = proxyPutStats; } @JmxSetter(name = "setRedirectingStoreEnabled", description = "Enable the redirecting store for this store") @@ -91,84 +125,256 @@ public boolean getIsRedirectingStoreEnabled() { return this.isRedirectingStoreEnabled.get(); } - @Override - public void put(ByteArray key, Versioned value, byte[] transforms) + /** + * If needed, satisfies the get request by redirecting calls to the remote + * proxy node. Also updates local storage accordingly. + * + * @param key + * @param transforms + * @return + * @throws VoldemortException + */ + private List> redirectingGet(ByteArray key, byte[] transforms) throws VoldemortException { - RebalancePartitionsInfo stealInfo = redirectingKey(key); - /** - * If I am rebalancing for this key, try to do remote get() , put it + * If I am rebalancing for this key, try to do remote get(), put it * locally first to get the correct version ignoring any * {@link ObsoleteVersionException} */ - if(stealInfo != null) - proxyGetAndLocalPut(key, stealInfo.getDonorId(), transforms); - - getInnerStore().put(key, value, transforms); - } + Integer redirectNode = getProxyNode(key.get()); + if(redirectNode != null) { + // First, attempt a local get + List> vals = getInnerStore().get(key, transforms); + // If found, return + if(!vals.isEmpty()) { + /* + * There is a subtle race here if the underlying storage does + * not implement multiVersionPut(), since the we could read some + * versions of the key without all of it being transferred over + * by the background fetch. 
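+ * (For example, if versions {v1, v2} of a key are being moved over and
+ * only v1 has landed so far, this local get would return just v1 and
+ * serve it as if it were the complete version set.)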
This is not a problem if we do bulk + * atomic writes of multiple versions of the same key into + * storage a.k.a multiVersionPut + */ + return vals; + } - private RebalancePartitionsInfo redirectingKey(ByteArray key) { - if(VoldemortState.REBALANCING_MASTER_SERVER.equals(metadata.getServerState()) - && isRedirectingStoreEnabled.get()) { - return metadata.getRebalancerState().find(getName(), - metadata.getRoutingStrategy(getName()) - .getPartitionList(key.get()), - metadata.getCluster() - .getNodeById(metadata.getNodeId()) - .getPartitionIds()); + if(logger.isTraceEnabled()) { + logger.trace("Proxying GET on stealer:" + metadata.getNodeId() + " for key " + + ByteUtils.toHexString(key.get()) + " to node:" + redirectNode); + } + proxyGetAndLocalPut(key, redirectNode, transforms); } - return null; + return getInnerStore().get(key, transforms); } - @Override - public List> get(ByteArray key, byte[] transforms) throws VoldemortException { - RebalancePartitionsInfo stealInfo = redirectingKey(key); - + /** + * If needed, satisfies the getVersions request by redirecting calls to the + * remote proxy node. Also updates local storage accordingly. + * + * @param key + * @param transforms + * @return + * @throws VoldemortException + */ + private List redirectingGetVersions(ByteArray key) { /** * If I am rebalancing for this key, try to do remote get(), put it * locally first to get the correct version ignoring any - * {@link ObsoleteVersionException} + * {@link ObsoleteVersionException}. */ - if(stealInfo != null) { - proxyGetAndLocalPut(key, stealInfo.getDonorId(), transforms); - } + Integer redirectNode = getProxyNode(key.get()); + if(redirectNode != null) { + // First, attempt a local getVersions() + List versions = getInnerStore().getVersions(key); + // If found some versions, return + if(!versions.isEmpty()) { + // Same caveat here as in redirectingGet(). Need multiVersionPut + // support in storage to avoid seeing partial versions + return versions; + } - return getInnerStore().get(key, transforms); + if(logger.isTraceEnabled()) { + logger.trace("Proxying GETVERSIONS on stealer:" + metadata.getNodeId() + + " for key " + ByteUtils.toHexString(key.get()) + " to node:" + + redirectNode); + } + proxyGetAndLocalPut(key, redirectNode, null); + } + return getInnerStore().getVersions(key); } - @Override - public List getVersions(ByteArray key) { - RebalancePartitionsInfo stealInfo = redirectingKey(key); + /** + * If needed, satisfies the getAll request by redirecting calls to the + * remote proxy node. Also updates local storage accordingly. + * + * + * @param keys + * @param transforms + * @return + * @throws VoldemortException + */ + private Map>> redirectingGetAll(Iterable keys, + Map transforms) + throws VoldemortException { + + // first determine how many keys are already present locally. + Map>> localVals = getInnerStore().getAll(keys, transforms); + Map keyToProxyNodeMap = Maps.newHashMapWithExpectedSize(Iterables.size(keys)); + for(ByteArray key: keys) { + // Relies on inner getAll() to not return an entry for the key in + // the result hashmap, in case the key does not exist on storage + if(localVals.containsKey(key)) { + // if you have it locally, move to next key + continue; + } + Integer redirectNode = getProxyNode(key.get()); + /* + * Else check if we are rebalancing for the key.. Intuitively, if we + * don't have the key, then we must be rebalancing for that key, + * right? Otherwise the key should have been here? Wrong, what if + * this is a non-existent key. 
We can't really confirm key does not + * exist, without going to the proxy node.. + */ + if(redirectNode != null) { + /* + * If we are indeed rebalancing for the key, then a proxy fetch + * will make things certain. + */ + keyToProxyNodeMap.put(key, redirectNode); + } + } + // If all keys were present locally, return. If not, do proxy fetch + if(!keyToProxyNodeMap.isEmpty()) { + if(logger.isTraceEnabled()) { + String keyStr = ""; + for(ByteArray key: keys) + keyStr += key + " "; + logger.trace("Proxying GETALL on stealer:" + metadata.getNodeId() + " for keys " + + keyStr); + } + // Issue proxy fetches for non-rebalancing keys that did not exist + // locally + proxyGetAllAndLocalPut(keyToProxyNodeMap, transforms); + // Now, issue a getAll for those keys alone + Map>> proxyFetchedVals = getInnerStore().getAll(keyToProxyNodeMap.keySet(), + transforms); + // Merge the results + for(Map.Entry>> entry: proxyFetchedVals.entrySet()) { + localVals.put(entry.getKey(), entry.getValue()); + } + } + return localVals; + } + + /** + * This is slightly different from other redirecting*** methods in that, + * this updates the remote proxy node, with this put request, so we can + * switch back to the old cluster topology if needed + * + * @param key + * @param value + * @param transforms + * @throws VoldemortException + */ + private void redirectingPut(ByteArray key, Versioned value, byte[] transforms) + throws VoldemortException { + Cluster currentCluster = metadata.getCluster(); + // TODO:refactor O(n) linear lookup of storedef here. Ideally should be + // a hash lookup. + StoreDefinition storeDef = metadata.getStoreDef(getName()); + /* + * defensively, error out if this is a read-only store and someone is + * doing puts against it. We don't to do extra work and fill the log + * with errors in that case. + */ + if(storeDef.getType().compareTo(ReadOnlyStorageConfiguration.TYPE_NAME) == 0) { + throw new UnsupportedOperationException("put() not supported on read-only store"); + } + StoreRoutingPlan currentRoutingPlan = new StoreRoutingPlan(currentCluster, storeDef); + Integer redirectNode = getProxyNode(currentRoutingPlan, storeDef, key.get()); /** - * If I am rebalancing for this key, try to do remote get(), put it - * locally first to get the correct version ignoring any - * {@link ObsoleteVersionException}. + * If I am rebalancing for this key, try to do remote get() if this node + * does not have the key , put it locally first to get the correct + * version ignoring any {@link ObsoleteVersionException} */ - if(stealInfo != null) { - proxyGetAndLocalPut(key, stealInfo.getDonorId(), null); + if(redirectNode != null) { + /* + * first check if the key exists locally. If so, it means, it has + * been moved over (either by a proxy fetch or background fetch) and + * we are good simply issuing the put on top of that. + */ + List> vals = getInnerStore().get(key, transforms); + if(vals.isEmpty()) { + // if not, then go proxy fetch it + if(logger.isTraceEnabled()) { + logger.trace("Proxying GET (before PUT) on stealer:" + metadata.getNodeId() + + " for key " + ByteUtils.toHexString(key.get()) + " to node:" + + redirectNode); + } + proxyGetAndLocalPut(key, redirectNode, transforms); + } } - return getInnerStore().getVersions(key); + // Here we are sure that the current node has caught up with the proxy + // for this key. Moving on to the put logic. 
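+ // Note: any proxy put issued below is async and best effort; the client
+ // ack does not wait for the old replica to be updated.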
+ // put the data locally, if this step fails, there will be no proxy puts + getInnerStore().put(key, value, transforms); + + // submit an async task to issue proxy puts to the redirectNode + // NOTE : if the redirect node is also a current replica for the key (we + // could have a situation where the online replicated write could lose + // out to the proxy put and hence fail the client operation with an + // OVE). So do not send proxy puts in those cases. + if(isProxyPutEnabled && redirectNode != null + && !currentRoutingPlan.getReplicationNodeList(key.get()).contains(redirectNode)) { + AsyncProxyPutTask asyncProxyPutTask = new AsyncProxyPutTask(this, + key, + value, + transforms, + redirectNode); + proxyPutStats.reportProxyPutSubmission(); + proxyPutWorkerPool.submit(asyncProxyPutTask); + } + } + + @Override + public List> get(ByteArray key, byte[] transforms) throws VoldemortException { + if(isServerRebalancing()) { + return redirectingGet(key, transforms); + } else { + return getInnerStore().get(key, transforms); + } + } + + @Override + public List getVersions(ByteArray key) { + if(isServerRebalancing()) { + return redirectingGetVersions(key); + } else { + return getInnerStore().getVersions(key); + } } @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { - Map rebalancePartitionsInfoPerKey = Maps.newHashMapWithExpectedSize(Iterables.size(keys)); - for(ByteArray key: keys) { - RebalancePartitionsInfo info = redirectingKey(key); - if(info != null) { - rebalancePartitionsInfoPerKey.put(key, info); - } + if(isServerRebalancing()) { + return redirectingGetAll(keys, transforms); + } else { + return getInnerStore().getAll(keys, transforms); } + } - if(!rebalancePartitionsInfoPerKey.isEmpty()) { - proxyGetAllAndLocalPut(rebalancePartitionsInfoPerKey, transforms); + @Override + public void put(ByteArray key, Versioned value, byte[] transforms) + throws VoldemortException { + if(isServerRebalancing()) { + redirectingPut(key, value, transforms); + } else { + getInnerStore().put(key, value, transforms); } - - return getInnerStore().getAll(keys, transforms); } /** @@ -194,78 +400,195 @@ public boolean delete(ByteArray key, Version version) throws VoldemortException return getInnerStore().delete(key, version); } + public boolean isServerRebalancing() { + return VoldemortState.REBALANCING_MASTER_SERVER.equals(metadata.getServerStateUnlocked()); + } + /** - * Performs a back-door proxy get to - * {@link voldemort.client.rebalance.RebalancePartitionsInfo#getDonorId() - * getDonorId} + * Checks if the server has to do any proxying of gets/puts to another + * server, as a part of an ongoing rebalance operation. + * + * Basic idea : Any given node which is a stealer of a partition, as the ith + * replica of a given zone, will proxy to the old ith replica of the + * partition in the given zone, as per the source cluster metadata. + * Exception : if this amounts to proxying to itself. + * + * Note on Zone Expansion : For zone expansion, there will be no proxying + * within the new zone. This is a practical assumption since if we fail, we + * fallback to a cluster topology without the new zone. As a result, reads + * from the new zone are not guaranteed to return some values during the + * course of zone expansion. This is a also reasonable since any + * organization undertaking such effort would need to have the data in place + * in the new zone, before the client apps are moved over. 
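Put differently, proxy-node selection is a lookup by (zone, replica rank): determine this node's rank for the key under the current topology, then ask the old topology which node held the same rank in the same zone. A toy version with the old routing plan reduced to a plain map; the real code goes through StoreRoutingPlan:

import java.util.Map;

class ProxyNodeLookupSketch {

    /**
     * oldPlan maps "zoneId:replicaRank" -> nodeId for one key, a toy stand-in
     * for oldRoutingPlan.getZoneReplicaNode(zone, rank, key).
     */
    static Integer proxyNodeFor(Map<String, Integer> oldPlan,
                                int zoneId, int replicaRank, int thisNodeId) {
        Integer old = oldPlan.get(zoneId + ":" + replicaRank);
        if(old == null)       // zone (or rank) absent in old topology: zone expansion, no proxying
            return null;
        if(old == thisNodeId) // proxying to ourselves is meaningless effort
            return null;
        return old;
    }

    public static void main(String[] args) {
        Map<String, Integer> oldPlan = Map.of("0:0", 1, "0:1", 6);
        System.out.println(proxyNodeFor(oldPlan, 0, 0, 5)); // 1: node 5 is the new zone-0 primary
        System.out.println(proxyNodeFor(oldPlan, 0, 1, 6)); // null: node 6 kept its rank
        System.out.println(proxyNodeFor(oldPlan, 1, 0, 7)); // null: zone 1 is new, no proxy bridges
    }
}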
+ * + * TODO:refactor Add helper methods to StoreRoutingPlan to simplify this + * code + * + * @param currentRoutingPlan routing plan object based on cluster's current + * topology + * @param storeDef definition of the store being redirected + * @param key to decide where to proxy to + * @return Null if no proxying is required else node id of the server to + * proxy to + */ + private Integer getProxyNode(StoreRoutingPlan currentRoutingPlan, + StoreDefinition storeDef, + byte[] key) { + // get out if redirecting is disabled. + if(!isRedirectingStoreEnabled.get()) { + return null; + } + + // TODO a better design would be to get these state changes from + // metadata listener callbacks, so we need not allocate these objects + // all the time + Cluster sourceCluster = metadata.getRebalancingSourceCluster(); + + // Logic to get the old storedef + List sourceStoreDefs = metadata.getRebalancingSourceStores(); + + if(sourceCluster == null) { + /* + * This is more for defensive coding purposes. The update of the + * source cluster key happens before the server is put in + * REBALANCING mode and is reset to null after the server goes back + * to NORMAL mode. + */ + + if(logger.isTraceEnabled()) { + + logger.trace("Old Cluster is null.. bail"); + } + return null; + } + if(sourceStoreDefs == null) { + /* + * similar to the above for stores xml + */ + + if(logger.isTraceEnabled()) { + + logger.trace("Old stores def is null.. bail"); + } + return null; + } + + StoreDefinition sourceStoreDef = null; + sourceStoreDef = StoreUtils.getStoreDef(sourceStoreDefs, storeDef.getName()); + + Integer nodeId = metadata.getNodeId(); + Integer zoneId = currentRoutingPlan.getCluster().getNodeById(nodeId).getZoneId(); + + // Use the old store definition to get the routing object + StoreRoutingPlan oldRoutingPlan = new StoreRoutingPlan(sourceCluster, sourceStoreDef); + // Check the current node's relationship to the key. + int zoneReplicaType = currentRoutingPlan.getZoneReplicaType(zoneId, nodeId, key); + // Determine which node held the key with the same relationship in the + // old cluster. That is your man! + Integer redirectNodeId; + try { + redirectNodeId = oldRoutingPlan.getZoneReplicaNode(zoneId, zoneReplicaType, key); + } catch(VoldemortException ve) { + /* + * If the zone does not exist, as in the case of Zone Expansion, + * there will be no proxy bridges built. The only other time an + * exception can be thrown here is when the replicaType is invalid. 
+ * But that would mean we are changing say a 2/1/1 store to 3/2/2, + * which Voldemort currently does not support anyway + */ + return null; + } + // Unless he is the same as this node (where this is meaningless effort) + if(redirectNodeId == nodeId) { + return null; + } + return redirectNodeId; + } + + /** + * Wrapper around + * {@link RedirectingStore#getProxyNode(StoreRoutingPlan, StoreDefinition, byte[])} + * + * @param key + * @return + */ + private Integer getProxyNode(byte[] key) { + Cluster currentCluster = metadata.getCluster(); + StoreDefinition storeDef = metadata.getStoreDef(getName()); + // TODO Ideally, this object construction should be done only when + // metadata changes using a listener mechanism + StoreRoutingPlan currentRoutingPlan = new StoreRoutingPlan(currentCluster, storeDef); + return getProxyNode(currentRoutingPlan, storeDef, key); + } + + /** + * Performs a back-door proxy get to proxy node * * @param key Key - * @param donorNodeId donor node id - * @throws ProxyUnreachableException if donor node can't be reached + * @param proxyNodeId proxy node id + * @throws ProxyUnreachableException if proxy node can't be reached */ - private List> proxyGet(ByteArray key, int donorNodeId, byte[] transform) { - Node donorNode = metadata.getCluster().getNodeById(donorNodeId); - checkNodeAvailable(donorNode); + private List> proxyGet(ByteArray key, int proxyNodeId, byte[] transform) { + Node proxyNode = metadata.getCluster().getNodeById(proxyNodeId); + checkNodeAvailable(proxyNode); long startNs = System.nanoTime(); try { Store redirectingStore = getRedirectingSocketStore(getName(), - donorNodeId); + proxyNodeId); List> values = redirectingStore.get(key, transform); - recordSuccess(donorNode, startNs); + recordSuccess(proxyNode, startNs); return values; } catch(UnreachableStoreException e) { - recordException(donorNode, startNs, e); - throw new ProxyUnreachableException("Failed to reach proxy node " + donorNode, e); + recordException(proxyNode, startNs, e); + throw new ProxyUnreachableException("Failed to reach proxy node " + proxyNode, e); } } - private void checkNodeAvailable(Node donorNode) { - if(!failureDetector.isAvailable(donorNode)) - throw new ProxyUnreachableException("Failed to reach proxy node " + donorNode + protected void checkNodeAvailable(Node proxyNode) { + if(!failureDetector.isAvailable(proxyNode)) + throw new ProxyUnreachableException("Failed to reach proxy node " + proxyNode + " is marked down by failure detector."); } /** - * Performs a back-door proxy get to - * {@link voldemort.client.rebalance.RebalancePartitionsInfo#getDonorId() - * getDonorId} + * Performs a back-door proxy getAll * - * @param rebalancePartitionsInfoPerKey Map of keys to corresponding - * partition info + * @param keyToProxyNodeMap Map of keys to corresponding proxy nodes housing + * the keys in source cluster * @param transforms Map of keys to their corresponding transforms - * @throws ProxyUnreachableException if donor node can't be reached + * @throws ProxyUnreachableException if proxy node can't be reached */ - private Map>> proxyGetAll(Map rebalancePartitionsInfoPerKey, + private Map>> proxyGetAll(Map keyToProxyNodeMap, Map transforms) throws VoldemortException { - Multimap donorNodeToKeys = HashMultimap.create(); + Multimap proxyNodeToKeys = HashMultimap.create(); int numKeys = 0; - // Transform the map of key to plan to a map of donor node id to keys - for(Map.Entry entry: rebalancePartitionsInfoPerKey.entrySet()) { + // Transform the map of key to plan to a map of proxy node 
id to keys + for(Map.Entry entry: keyToProxyNodeMap.entrySet()) { numKeys++; - donorNodeToKeys.put(entry.getValue().getDonorId(), entry.getKey()); + proxyNodeToKeys.put(entry.getValue(), entry.getKey()); } Map>> gatherMap = Maps.newHashMapWithExpectedSize(numKeys); - for(int donorNodeId: donorNodeToKeys.keySet()) { - Node donorNode = metadata.getCluster().getNodeById(donorNodeId); - checkNodeAvailable(donorNode); + for(int proxyNodeId: proxyNodeToKeys.keySet()) { + Node proxyNode = metadata.getCluster().getNodeById(proxyNodeId); + checkNodeAvailable(proxyNode); long startNs = System.nanoTime(); try { Map>> resultsForNode = getRedirectingSocketStore(getName(), - donorNodeId).getAll(donorNodeToKeys.get(donorNodeId), + proxyNodeId).getAll(proxyNodeToKeys.get(proxyNodeId), transforms); - recordSuccess(donorNode, startNs); + recordSuccess(proxyNode, startNs); for(Map.Entry>> entry: resultsForNode.entrySet()) { gatherMap.put(entry.getKey(), entry.getValue()); } } catch(UnreachableStoreException e) { - recordException(donorNode, startNs, e); - throw new ProxyUnreachableException("Failed to reach proxy node " + donorNode, e); + recordException(proxyNode, startNs, e); + throw new ProxyUnreachableException("Failed to reach proxy node " + proxyNode, e); } } @@ -275,27 +598,34 @@ private Map>> proxyGetAll(MapREBALANCING_MASTER_SERVER state put should be committed * on stealer node. To follow Voldemort version guarantees, stealer node - * should query donor node and put that value (proxyValue) before committing + * should query proxy node and put that value (proxyValue) before committing * the value from client. *
* Stealer node should ignore {@link ObsoleteVersionException} while * commiting proxyValue to local storage. * * @param key Key - * @param donorId donorId + * @param proxyId proxy node id * @return Returns the proxy value * @throws VoldemortException if {@link #proxyGet(ByteArray, int)} fails */ private List> proxyGetAndLocalPut(ByteArray key, - int donorId, + int proxyId, byte[] transforms) throws VoldemortException { - List> proxyValues = proxyGet(key, donorId, transforms); + List> proxyValues = proxyGet(key, proxyId, transforms); for(Versioned proxyValue: proxyValues) { try { getInnerStore().put(key, proxyValue, null); } catch(ObsoleteVersionException e) { - // ignore these + // TODO this is in TRACE because OVE is expected here, for keys + // that are already moved over or proxy got. This will become + // ERROR later post redesign + if(logger.isTraceEnabled()) + logger.trace("OVE in proxy get local put for key " + + ByteUtils.toHexString(key.get()) + " Stealer:" + + metadata.getNodeId() + " ProxyNode:" + proxyId, + e); } } return proxyValues; @@ -305,16 +635,16 @@ private List> proxyGetAndLocalPut(ByteArray key, * Similar to {@link #proxyGetAndLocalPut(ByteArray, int)} but meant for * {@link #getAll(Iterable)} * - * @param rebalancePartitionsInfoPerKey Map of keys which are being routed - * to their corresponding plan + * @param keyToProxyNodeMap Map of keys which are being routed to their + * corresponding proxy nodes * @param transforms Map of key to their corresponding transforms * @return Returns a map of key to its corresponding list of values * @throws VoldemortException if {@link #proxyGetAll(List, List)} fails */ - private Map>> proxyGetAllAndLocalPut(Map rebalancePartitionsInfoPerKey, + private Map>> proxyGetAllAndLocalPut(Map keyToProxyNodeMap, Map transforms) throws VoldemortException { - Map>> proxyKeyValues = proxyGetAll(rebalancePartitionsInfoPerKey, + Map>> proxyKeyValues = proxyGetAll(keyToProxyNodeMap, transforms); for(Map.Entry>> keyValuePair: proxyKeyValues.entrySet()) { for(Versioned proxyValue: keyValuePair.getValue()) { @@ -330,49 +660,65 @@ private Map>> proxyGetAllAndLocalPut(MapSocketStore object for storeName and - * donorNodeId + * proxyNodeId */ - private Store getRedirectingSocketStore(String storeName, - int donorNodeId) { - if(!storeRepository.hasRedirectingSocketStore(storeName, donorNodeId)) { + protected Store getRedirectingSocketStore(String storeName, + int proxyNodeId) { + if(!storeRepository.hasRedirectingSocketStore(storeName, proxyNodeId)) { synchronized(storeRepository) { - if(!storeRepository.hasRedirectingSocketStore(storeName, donorNodeId)) { - Node donorNode = getNodeIfPresent(donorNodeId); - logger.info("Creating new redirecting store for donor node " - + donorNode.getId() + " and store " + storeName); - storeRepository.addRedirectingSocketStore(donorNode.getId(), + if(!storeRepository.hasRedirectingSocketStore(storeName, proxyNodeId)) { + Node proxyNode = getNodeIfPresent(proxyNodeId); + logger.info("Creating new redirecting store for proxy node " + + proxyNode.getId() + " and store " + storeName); + storeRepository.addRedirectingSocketStore(proxyNode.getId(), storeFactory.create(storeName, - donorNode.getHost(), - donorNode.getSocketPort(), + proxyNode.getHost(), + proxyNode.getSocketPort(), RequestFormatType.PROTOCOL_BUFFERS, RequestRoutingType.IGNORE_CHECKS)); } } } - return storeRepository.getRedirectingSocketStore(storeName, donorNodeId); + return storeRepository.getRedirectingSocketStore(storeName, proxyNodeId); } - private Node 
getNodeIfPresent(int donorId) { + private Node getNodeIfPresent(int proxyNodeId) { try { - return metadata.getCluster().getNodeById(donorId); + return metadata.getCluster().getNodeById(proxyNodeId); } catch(Exception e) { - throw new VoldemortException("Failed to get donorNode " + donorId + throw new VoldemortException("Failed to get proxyNode " + proxyNodeId + " from current cluster " + metadata.getCluster() + " at node " + metadata.getNodeId(), e); } } - private void recordException(Node node, long startNs, UnreachableStoreException e) { + protected void recordException(Node node, long startNs, UnreachableStoreException e) { failureDetector.recordException(node, (System.nanoTime() - startNs) / Time.NS_PER_MS, e); } - private void recordSuccess(Node node, long startNs) { + protected void recordSuccess(Node node, long startNs) { failureDetector.recordSuccess(node, (System.nanoTime() - startNs) / Time.NS_PER_MS); } + + protected MetadataStore getMetadataStore() { + return metadata; + } + + protected void reportProxyPutFailure() { + proxyPutStats.reportProxyPutFailure(); + } + + protected void reportProxyPutSuccess() { + proxyPutStats.reportProxyPutCompletion(); + } + + public ProxyPutStats getProxyPutStats() { + return this.proxyPutStats; + } } diff --git a/src/java/voldemort/store/routed/Pipeline.java b/src/java/voldemort/store/routed/Pipeline.java index 255c682571..7a26891b8d 100644 --- a/src/java/voldemort/store/routed/Pipeline.java +++ b/src/java/voldemort/store/routed/Pipeline.java @@ -128,10 +128,7 @@ public void addEventAction(Event event, Action action) { */ public void abort() { - if(isHintedHandoffEnabled()) - addEvent(Event.ABORTED); - else - addEvent(Event.ERROR); + addEvent(Event.ERROR); } /** diff --git a/src/java/voldemort/store/routed/PipelineRoutedStats.java b/src/java/voldemort/store/routed/PipelineRoutedStats.java index 77918d28c6..d3a41ae2ef 100644 --- a/src/java/voldemort/store/routed/PipelineRoutedStats.java +++ b/src/java/voldemort/store/routed/PipelineRoutedStats.java @@ -21,11 +21,11 @@ */ public class PipelineRoutedStats { - private ConcurrentHashMap, AtomicLong> errCountMap; - private AtomicLong severeExceptionCount; - private AtomicLong benignExceptionCount; + protected ConcurrentHashMap, AtomicLong> errCountMap; + protected AtomicLong severeExceptionCount; + protected AtomicLong benignExceptionCount; - PipelineRoutedStats() { + protected PipelineRoutedStats() { errCountMap = new ConcurrentHashMap, AtomicLong>(); errCountMap.put(InvalidMetadataException.class, new AtomicLong(0)); errCountMap.put(InsufficientOperationalNodesException.class, new AtomicLong(0)); @@ -99,7 +99,7 @@ public void reportException(Exception e) { errCountMap.get(e.getClass()).incrementAndGet(); } - private boolean isSevere(Exception ve) { + public boolean isSevere(Exception ve) { if(ve instanceof InsufficientOperationalNodesException || ve instanceof InsufficientZoneResponsesException || ve instanceof InvalidMetadataException) diff --git a/src/java/voldemort/store/routed/PipelineRoutedStore.java b/src/java/voldemort/store/routed/PipelineRoutedStore.java index d0181c6dc4..b63621653a 100644 --- a/src/java/voldemort/store/routed/PipelineRoutedStore.java +++ b/src/java/voldemort/store/routed/PipelineRoutedStore.java @@ -30,10 +30,12 @@ import voldemort.common.VoldemortOpCode; import voldemort.routing.RoutingStrategyType; import voldemort.store.CompositeVoldemortRequest; +import voldemort.store.PersistenceFailureException; import voldemort.store.Store; import voldemort.store.StoreDefinition; 
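getRedirectingSocketStore above uses the usual check / lock / re-check idiom so that at most one socket store per proxy node is created while keeping the read path lock-free. The same idiom over a bare ConcurrentHashMap, as a rough stand-in for the StoreRepository bookkeeping:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class SocketStoreCacheSketch {

    // stand-in for the per-node redirecting socket store
    static final class SocketStoreStub {
        final int nodeId;
        SocketStoreStub(int nodeId) { this.nodeId = nodeId; }
    }

    private final Map<Integer, SocketStoreStub> stores = new ConcurrentHashMap<>();

    SocketStoreStub getOrCreate(int proxyNodeId) {
        SocketStoreStub store = stores.get(proxyNodeId);      // fast path: no lock
        if(store == null) {
            synchronized(stores) {
                store = stores.get(proxyNodeId);              // re-check under the lock
                if(store == null) {
                    store = new SocketStoreStub(proxyNodeId); // expensive creation happens once
                    stores.put(proxyNodeId, store);
                }
            }
        }
        return store;
    }
}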
import voldemort.store.StoreRequest; import voldemort.store.StoreUtils; +import voldemort.store.UnreachableStoreException; import voldemort.store.nonblockingstore.NonblockingStore; import voldemort.store.routed.Pipeline.Event; import voldemort.store.routed.Pipeline.Operation; @@ -904,4 +906,9 @@ public boolean delete(CompositeVoldemortRequest request) throws VoldemortException { return delete(request.getKey(), request.getVersion(), request.getRoutingTimeoutInMs()); } + + public static boolean isSlopableFailure(Object response) { + return response instanceof UnreachableStoreException + || response instanceof PersistenceFailureException; + } } diff --git a/src/java/voldemort/store/routed/PutPipelineData.java b/src/java/voldemort/store/routed/PutPipelineData.java index 5f119a9e5f..af34ec9b0d 100644 --- a/src/java/voldemort/store/routed/PutPipelineData.java +++ b/src/java/voldemort/store/routed/PutPipelineData.java @@ -17,6 +17,7 @@ package voldemort.store.routed; import voldemort.cluster.Node; +import voldemort.store.routed.action.AsyncPutSynchronizer; import voldemort.store.routed.action.PerformSerialPutRequests; import voldemort.versioning.Versioned; @@ -33,6 +34,8 @@ public class PutPipelineData extends BasicPipelineData { private long startTimeNs; + final private AsyncPutSynchronizer synchronizer = new AsyncPutSynchronizer(); + /** * Returns the previously determined "master" node. This is the first node * in the preference list that succeeded in "putting" the value. @@ -95,4 +98,7 @@ public long getStartTimeNs() { return this.startTimeNs; } + public AsyncPutSynchronizer getSynchronizer() { + return synchronizer; + } } diff --git a/src/java/voldemort/store/routed/ReadRepairer.java b/src/java/voldemort/store/routed/ReadRepairer.java index 32b5156c32..a5be02a2ef 100644 --- a/src/java/voldemort/store/routed/ReadRepairer.java +++ b/src/java/voldemort/store/routed/ReadRepairer.java @@ -18,11 +18,14 @@ import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.log4j.Logger; + import voldemort.annotations.concurrency.Threadsafe; import voldemort.versioning.Occurred; import voldemort.versioning.Version; @@ -45,6 +48,8 @@ @Threadsafe public class ReadRepairer { + private final Logger logger = Logger.getLogger(getClass()); + /** * Compute the repair set from the given values and nodes * @@ -77,85 +82,105 @@ private List> singleKeyGetRepairs(List> nodeValu if(size <= 1) return Collections.emptyList(); - // A list of obsolete nodes that need to be repaired - Set obsolete = new HashSet(3); + // 1. Create a multi-map of nodes to their existing Versions + Multimap> nodeVersionsMap = HashMultimap.create(); + for(NodeValue nodeValue: nodeValues) { + nodeVersionsMap.put(nodeValue.getNodeId(), nodeValue); + } + + // 2. 
Create a map of the final set of versions (for this key) + Map> mostCurrentVersionsMap = new HashMap>(); - // A Map of Version=>NodeValues that contains the current best estimate - // of the set of current versions - // and the nodes containing them - Multimap> concurrents = HashMultimap.create(); - concurrents.put(nodeValues.get(0).getVersion(), nodeValues.get(0)); + // Initialize with the first element from the input + mostCurrentVersionsMap.put(nodeValues.get(0).getVersion(), nodeValues.get(0)); // check each value against the current set of most current versions for(int i = 1; i < nodeValues.size(); i++) { NodeValue curr = nodeValues.get(i); boolean concurrentToAll = true; - Set versions = new HashSet(concurrents.keySet()); - for(Version concurrentVersion: versions) { - // if we already have the version, just add the nodevalue for - // future updating and move on - if(curr.getVersion().equals(concurrentVersion)) { - concurrents.put(curr.getVersion(), curr); + /* + * Make a copy for the traversal. This is because the original map + * can be modified during this traversal + */ + Set knownGoodVersions = new HashSet(mostCurrentVersionsMap.keySet()); + + for(Version currentGoodversion: knownGoodVersions) { + + // If the version already exists, do nothing + if(curr.getVersion().equals(currentGoodversion)) { + concurrentToAll = false; + if(logger.isDebugEnabled()) { + logger.debug("Version already exists in the most current set: " + curr); + } break; } // Check the ordering of the current value - Occurred occurred = curr.getVersion().compare(concurrentVersion); + Occurred occurred = curr.getVersion().compare(currentGoodversion); if(occurred == Occurred.BEFORE) { - // This value is obsolete! Stop checking against other - // values... - obsolete.add(curr.getNodeId()); + // This value is obsolete! Break from the loop + if(logger.isDebugEnabled()) { + logger.debug("Version is obsolete : " + curr); + } concurrentToAll = false; break; } else if(occurred == Occurred.AFTER) { // This concurrent value is obsolete and the current value // should replace it - for(NodeValue v: concurrents.get(concurrentVersion)) - obsolete.add(v.getNodeId()); - concurrents.removeAll(concurrentVersion); + mostCurrentVersionsMap.remove(currentGoodversion); concurrentToAll = false; - concurrents.put(curr.getVersion(), curr); + mostCurrentVersionsMap.put(curr.getVersion(), curr); + if(logger.isDebugEnabled()) { + logger.debug("Updating the current best - adding : " + curr); + } } } // if the value is concurrent to all existing versions then add it // to the concurrent set - if(concurrentToAll) - concurrents.put(curr.getVersion(), curr); + if(concurrentToAll) { + mostCurrentVersionsMap.put(curr.getVersion(), curr); + if(logger.isDebugEnabled()) { + logger.debug("Value is concurrent to all ! : " + curr); + } + } } - // Construct the list of repairs + // 3. 
Compare 1 and 2 and create the repair list List> repairs = new ArrayList>(3); - for(Integer id: obsolete) { - // repair all obsolete nodes - for(Version v: concurrents.keySet()) { - NodeValue concurrent = concurrents.get(v).iterator().next(); - NodeValue repair = new NodeValue(id, - concurrent.getKey(), - concurrent.getVersioned()); - repairs.add(repair); + for(int nodeId: nodeVersionsMap.keySet()) { + Set finalVersions = new HashSet(mostCurrentVersionsMap.keySet()); + if(logger.isDebugEnabled()) { + logger.debug("Set of final versions = " + finalVersions); } - } - if(concurrents.size() > 1) { - // if there are more then one concurrent versions on different - // nodes, - // we should repair so all have the same set of values - Set> existing = new HashSet>(repairs); - for(NodeValue entry1: concurrents.values()) { - for(NodeValue entry2: concurrents.values()) { - if(!entry1.getVersion().equals(entry2.getVersion())) { - NodeValue repair = new NodeValue(entry1.getNodeId(), - entry2.getKey(), - entry2.getVersioned()); - if(!existing.contains(repair)) - repairs.add(repair); - } + // Calculate the set difference between final Versions and + // the versions currently existing for nodeId + Set currentNodeVersions = new HashSet(); + for(NodeValue nodeValue: nodeVersionsMap.get(nodeId)) { + currentNodeVersions.add(nodeValue.getVersion()); + } + finalVersions.removeAll(currentNodeVersions); + + if(logger.isDebugEnabled()) { + logger.debug("Remaining versions to be repaired for this node after the set difference = " + + finalVersions); + } + + // Repair nodeId with the remaining Versioned values + for(Version remainingVersion: finalVersions) { + NodeValue repair = new NodeValue(nodeId, + mostCurrentVersionsMap.get(remainingVersion) + .getKey(), + mostCurrentVersionsMap.get(remainingVersion) + .getVersioned()); + if(logger.isDebugEnabled()) { + logger.debug("Node value marked to be repaired : " + repair); } + repairs.add(repair); } } return repairs; } - } diff --git a/src/java/voldemort/store/routed/action/AbstractAction.java b/src/java/voldemort/store/routed/action/AbstractAction.java index 906db19e6b..042e85efc3 100644 --- a/src/java/voldemort/store/routed/action/AbstractAction.java +++ b/src/java/voldemort/store/routed/action/AbstractAction.java @@ -74,6 +74,9 @@ protected boolean handleResponseError(Exception e, } if(e instanceof UnreachableStoreException) { + if(logger.isTraceEnabled()) { + logger.trace("Adding node [" + node + "] to failed nodes list"); + } pipelineData.addFailedNode(node); pipelineData.recordFailure(e); failureDetector.recordException(node, requestTime, (UnreachableStoreException) e); diff --git a/src/java/voldemort/store/routed/action/AbstractConfigureNodes.java b/src/java/voldemort/store/routed/action/AbstractConfigureNodes.java index 6bb5992059..ff5dffa280 100644 --- a/src/java/voldemort/store/routed/action/AbstractConfigureNodes.java +++ b/src/java/voldemort/store/routed/action/AbstractConfigureNodes.java @@ -25,6 +25,7 @@ import voldemort.store.InsufficientOperationalNodesException; import voldemort.store.routed.Pipeline.Event; import voldemort.store.routed.PipelineData; +import voldemort.store.routed.PutPipelineData; import voldemort.utils.ByteArray; import voldemort.utils.ByteUtils; @@ -61,6 +62,9 @@ protected List getNodes(ByteArray key) { if(failureDetector.isAvailable(node)) nodes.add(node); else { + if(pipelineData instanceof PutPipelineData) { + ((PutPipelineData) pipelineData).getSynchronizer().tryDelegateSlop(node); + } pipelineData.addFailedNode(node); 
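The reworked ReadRepairer reduces to set arithmetic: fix the final (most current) version set for the key once, then repair each node with exactly the final versions minus whatever that node already holds. A self-contained sketch with versions flattened to strings:

import java.util.*;

class ReadRepairSketch {

    /** For each node, the versions it must still receive: final minus already-present. */
    static Map<Integer, Set<String>> repairs(Map<Integer, Set<String>> versionsOnNode,
                                             Set<String> finalVersions) {
        Map<Integer, Set<String>> repairs = new HashMap<>();
        for(Map.Entry<Integer, Set<String>> e: versionsOnNode.entrySet()) {
            Set<String> missing = new HashSet<>(finalVersions);
            missing.removeAll(e.getValue());  // set difference per node
            if(!missing.isEmpty())
                repairs.put(e.getKey(), missing);
        }
        return repairs;
    }

    public static void main(String[] args) {
        Set<String> finalVersions = Set.of("v2", "v3"); // the concurrent winners
        Map<Integer, Set<String>> onNode = Map.of(0, Set.of("v2", "v3"), // up to date
                                                  1, Set.of("v1"),       // obsolete only
                                                  2, Set.of("v2"));      // partially current
        System.out.println(repairs(onNode, finalVersions)); // {1=[v2, v3], 2=[v3]}
    }
}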
if(logger.isDebugEnabled()) { logger.debug("Key " + ByteUtils.toHexString(key.get()) + " Node " diff --git a/src/java/voldemort/store/routed/action/AbstractReadRepair.java b/src/java/voldemort/store/routed/action/AbstractReadRepair.java index ac2dac7829..f41d0a00f5 100644 --- a/src/java/voldemort/store/routed/action/AbstractReadRepair.java +++ b/src/java/voldemort/store/routed/action/AbstractReadRepair.java @@ -78,7 +78,7 @@ public void execute(Pipeline pipeline) { long startTimeNs = -1; - if(logger.isTraceEnabled()) + if(logger.isDebugEnabled()) startTimeNs = System.nanoTime(); if(nodeValues.size() > 1 && preferred > 1) { diff --git a/src/java/voldemort/store/routed/action/AsyncPutSynchronizer.java b/src/java/voldemort/store/routed/action/AsyncPutSynchronizer.java new file mode 100644 index 0000000000..4a915e9cdb --- /dev/null +++ b/src/java/voldemort/store/routed/action/AsyncPutSynchronizer.java @@ -0,0 +1,131 @@ +package voldemort.store.routed.action; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.Queue; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.TimeUnit; + +import org.apache.log4j.Logger; + +import voldemort.cluster.Node; +import voldemort.store.routed.Response; +import voldemort.utils.ByteArray; + +/** + * The AsyncPutSynchronizer Class is used for synchronizing operations inside + * PerformParallelPut action More specifically, it coordinate the exception + * handling and hinted handoff responsibility between master thread and async + * put threads + * + */ +public class AsyncPutSynchronizer { + + private final static Logger logger = Logger.getLogger(AsyncPutSynchronizer.class); + private boolean asyncCallbackShouldSendhint; + private boolean responseHandlingCutoff; + private final ConcurrentMap slopDestinations; // the value in + // the map is + // not used + private final Queue> responseQueue; + + public AsyncPutSynchronizer() { + asyncCallbackShouldSendhint = false; + responseHandlingCutoff = false; + slopDestinations = new ConcurrentHashMap(); + responseQueue = new LinkedList>(); + } + + /** + * Get list of nodes to register slop for + * + * @return list of nodes to register slop for + */ + public synchronized Set getDelegatedSlopDestinations() { + return Collections.unmodifiableSet(slopDestinations.keySet()); + } + + /** + * Stop accepting delegated slop responsibility by master + */ + public synchronized void disallowDelegateSlop() { + asyncCallbackShouldSendhint = true; + } + + /** + * Try to delegate the responsibility of sending slops to master + * + * @param node The node that slop should eventually be pushed to + * @return true if master accept the responsibility; false if master does + * not accept + */ + public synchronized boolean tryDelegateSlop(Node node) { + if(asyncCallbackShouldSendhint) { + return false; + } else { + slopDestinations.put(node, true); + return true; + } + } + + /** + * Master Stop accepting new responses (from async callbacks) + */ + public synchronized void cutoffHandling() { + responseHandlingCutoff = true; + } + + /** + * try to delegate the master to handle the response + * + * @param response + * @return true if the master accepted the response; false if the master + * didn't accept + */ + public synchronized boolean tryDelegateResponseHandling(Response response) { + if(responseHandlingCutoff) { + return false; + } else { + responseQueue.offer(response); + this.notifyAll(); + return true; + } + } + + /** + * 
poll the response queue for response + * + * @param timeout timeout amount + * @param timeUnit timeUnit of timeout + * @return same result of BlockQueue.poll(long, TimeUnit) + * @throws InterruptedException + */ + public synchronized Response responseQueuePoll(long timeout, + TimeUnit timeUnit) + throws InterruptedException { + long timeoutMs = timeUnit.toMillis(timeout); + long timeoutWallClockMs = System.currentTimeMillis() + timeoutMs; + while(responseQueue.isEmpty() && System.currentTimeMillis() < timeoutWallClockMs) { + long remainingMs = Math.max(0, timeoutWallClockMs - System.currentTimeMillis()); + if(logger.isDebugEnabled()) { + logger.debug("Start waiting for response queue with timeoutMs: " + timeoutMs); + } + this.wait(remainingMs); + if(logger.isDebugEnabled()) { + logger.debug("End waiting for response queue with timeoutMs: " + timeoutMs); + } + } + return responseQueue.poll(); + } + + /** + * to see if the response queue is empty + * + * @return true is response queue is empty; false if not empty. + */ + public synchronized boolean responseQueueIsEmpty() { + return responseQueue.isEmpty(); + } +} diff --git a/src/java/voldemort/store/routed/action/ConfigureNodes.java b/src/java/voldemort/store/routed/action/ConfigureNodes.java index 1f7cb59c60..a7a4946ce2 100644 --- a/src/java/voldemort/store/routed/action/ConfigureNodes.java +++ b/src/java/voldemort/store/routed/action/ConfigureNodes.java @@ -132,5 +132,4 @@ public void execute(Pipeline pipeline) { pipelineData.setNodes(nodes); pipeline.addEvent(completeEvent); } - } diff --git a/src/java/voldemort/store/routed/action/ConfigureNodesByZone.java b/src/java/voldemort/store/routed/action/ConfigureNodesByZone.java index 6f1d15a1f0..fbc6888b4b 100644 --- a/src/java/voldemort/store/routed/action/ConfigureNodesByZone.java +++ b/src/java/voldemort/store/routed/action/ConfigureNodesByZone.java @@ -113,6 +113,9 @@ public List getNodes(ByteArray key, Operation op) { if(clientZoneNodes != null && clientZoneNodes.size() > 0) nodes.addAll(clientZoneNodes); // ...followed by other zones sorted by proximity list + // NOTE : its imperative that the proximity list does not contain the + // client zone. 
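AsyncPutSynchronizer is, at its core, a hand-rolled single-consumer queue: async callbacks offer responses under the monitor, the master thread polls with a deadline, and once the master cuts off handling, late callbacks know the responsibility is back with them. A compressed sketch of that monitor protocol (the real class additionally tracks slop destinations):

import java.util.ArrayDeque;
import java.util.Queue;

class HandoffQueueSketch<T> {

    private final Queue<T> queue = new ArrayDeque<>();
    private boolean cutoff = false;

    /** Callback side: returns false once the master has stopped accepting. */
    synchronized boolean tryDelegate(T response) {
        if(cutoff)
            return false;
        queue.offer(response);
        notifyAll();
        return true;
    }

    /** Master side: stop accepting; late callbacks handle their own responses. */
    synchronized void cutoffHandling() {
        cutoff = true;
    }

    /** Master side: wait up to timeoutMs for a response, null on timeout. */
    synchronized T poll(long timeoutMs) throws InterruptedException {
        long deadline = System.currentTimeMillis() + timeoutMs;
        while(queue.isEmpty()) {
            long remaining = deadline - System.currentTimeMillis();
            if(remaining <= 0)
                return null;
            wait(remaining);
        }
        return queue.poll();
    }

    public static void main(String[] args) throws InterruptedException {
        HandoffQueueSketch<String> sync = new HandoffQueueSketch<>();
        new Thread(() -> sync.tryDelegate("response-from-node-1")).start();
        System.out.println(sync.poll(1000));               // response-from-node-1
        sync.cutoffHandling();
        System.out.println(sync.tryDelegate("too-late"));  // false: callback keeps it
    }
}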
If this happens, we will add those nodes twice to the + // list for(int index = 0; index < zoneProximityList.size(); index++) { List zoneNodes = zoneIdToNode.get(zoneProximityList.get(index)); if(zoneNodes != null && zoneNodes.size() > 0) { diff --git a/src/java/voldemort/store/routed/action/PerformParallelPutRequests.java b/src/java/voldemort/store/routed/action/PerformParallelPutRequests.java index 1cf13c308d..cbf5fb29c3 100644 --- a/src/java/voldemort/store/routed/action/PerformParallelPutRequests.java +++ b/src/java/voldemort/store/routed/action/PerformParallelPutRequests.java @@ -20,29 +20,27 @@ import java.util.Date; import java.util.List; import java.util.Map; -import java.util.Map.Entry; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.CountDownLatch; +import java.util.NoSuchElementException; import java.util.concurrent.TimeUnit; import org.apache.log4j.Level; +import voldemort.VoldemortException; import voldemort.cluster.Node; import voldemort.cluster.failuredetector.FailureDetector; import voldemort.store.InsufficientOperationalNodesException; import voldemort.store.InsufficientZoneResponsesException; import voldemort.store.InvalidMetadataException; -import voldemort.store.UnreachableStoreException; import voldemort.store.nonblockingstore.NonblockingStore; import voldemort.store.nonblockingstore.NonblockingStoreCallback; import voldemort.store.routed.Pipeline; import voldemort.store.routed.Pipeline.Event; +import voldemort.store.routed.PipelineRoutedStore; import voldemort.store.routed.PutPipelineData; import voldemort.store.routed.Response; import voldemort.store.slop.HintedHandoff; import voldemort.store.slop.Slop; import voldemort.utils.ByteArray; -import voldemort.utils.ByteUtils; import voldemort.utils.Time; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.Versioned; @@ -66,6 +64,11 @@ public class PerformParallelPutRequests extends public boolean enableHintedHandoff; + private boolean quorumSatisfied = false; + private boolean zonesSatisfied = false; + private Integer numResponsesGot = 0; + private Integer numNodesPendingResponse = 0; + public PerformParallelPutRequests(PutPipelineData pipelineData, Event completeEvent, ByteArray key, @@ -91,223 +94,295 @@ public boolean isHintedHandoffEnabled() { return enableHintedHandoff; } + @Override public void execute(final Pipeline pipeline) { - Node master = pipelineData.getMaster(); + final Node masterNode = pipelineData.getMaster(); + final List nodes = pipelineData.getNodes(); final Versioned versionedCopy = pipelineData.getVersionedCopy(); + final Integer numNodesTouchedInSerialPut = nodes.indexOf(masterNode) + 1; + numNodesPendingResponse = nodes.size() - numNodesTouchedInSerialPut; if(logger.isDebugEnabled()) - logger.debug("Serial put requests determined master node as " + master.getId() - + ", submitting remaining requests in parallel"); - - List nodes = pipelineData.getNodes(); - int firstParallelNodeIndex = nodes.indexOf(master) + 1; - int attempts = nodes.size() - firstParallelNodeIndex; - int blocks = Math.min(preferred - 1, attempts); - - final Map> responses = new ConcurrentHashMap>(); - final CountDownLatch attemptsLatch = new CountDownLatch(attempts); - final CountDownLatch blocksLatch = new CountDownLatch(blocks); + logger.debug("PUT {key:" + key + "} MasterNode={id:" + masterNode.getId() + + "} totalNodesToAsyncPut=" + numNodesPendingResponse); - if(logger.isTraceEnabled()) - logger.trace("Attempting " + attempts + " " + 
pipeline.getOperation().getSimpleName() - + " operations in parallel"); - - for(int i = firstParallelNodeIndex; i < (firstParallelNodeIndex + attempts); i++) { + // initiate parallel puts + for(int i = numNodesTouchedInSerialPut; i < nodes.size(); i++) { final Node node = nodes.get(i); pipelineData.incrementNodeIndex(); NonblockingStoreCallback callback = new NonblockingStoreCallback() { + @Override public void requestComplete(Object result, long requestTime) { - if(logger.isTraceEnabled()) - logger.trace(pipeline.getOperation().getSimpleName() - + " response received (" + requestTime + " ms.) from node " - + node.getId()); + boolean responseHandledByMaster = false; + if(logger.isDebugEnabled()) + logger.debug("PUT {key:" + key + "} response received from node={id:" + + node.getId() + "} in " + requestTime + " ms)"); - Response response = new Response(node, - key, - result, - requestTime); - responses.put(node.getId(), response); + Response response; + response = new Response(node, key, result, requestTime); - if(logger.isDebugEnabled()) - logger.debug("Finished secondary PUT for key " - + ByteUtils.toHexString(key.get()) + " (keyRef: " - + System.identityHashCode(key) + "); took " + requestTime - + " ms on node " + node.getId() + "(" + node.getHost() + ")"); - - if(isHintedHandoffEnabled() && pipeline.isFinished()) { - if(response.getValue() instanceof UnreachableStoreException) { - Slop slop = new Slop(pipelineData.getStoreName(), - Slop.Operation.PUT, - key, - versionedCopy.getValue(), - transforms, - node.getId(), - new Date()); - pipelineData.addFailedNode(node); - hintedHandoff.sendHintSerial(node, versionedCopy.getVersion(), slop); - } + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + + key + + "} Parallel put thread trying to return result to main thread"); + } + + responseHandledByMaster = pipelineData.getSynchronizer() + .tryDelegateResponseHandling(response); + + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + key + "} Master thread accepted the response: " + + responseHandledByMaster); } - attemptsLatch.countDown(); - blocksLatch.countDown(); - - if(logger.isTraceEnabled()) - logger.trace(attemptsLatch.getCount() + " attempts remaining. Will block " - + " for " + blocksLatch.getCount() + " more "); - - // Note errors that come in after the pipeline has finished. - // These will *not* get a chance to be called in the loop of - // responses below. 
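The master loop in the new PerformParallelPutRequests then comes down to a single exit predicate: keep draining responses until the preferred, quorum and zone counts are all satisfied, the routing deadline passes, or no responses remain outstanding. Roughly, with hypothetical counters:

class PutLoopExitSketch {

    /** True when the parallel-put master loop should stop polling for responses. */
    static boolean shouldExit(int successes, int required,          // quorum
                              int zonesSeen, Integer zonesRequired, // zone count (nullable)
                              int responsesGot, int preferred,      // preferred count
                              long remainingNs, int pendingResponses) {
        boolean quorumOk = successes >= required;
        boolean zonesOk = zonesRequired == null || zonesSeen >= zonesRequired + 1;
        boolean preferredOk = responsesGot >= preferred - 1;
        return (quorumOk && zonesOk && preferredOk)
               || remainingNs <= 0       // routing timeout expired
               || pendingResponses <= 0; // every async put already answered
    }

    public static void main(String[] args) {
        // quorum met but short of preferred and time remains: keep polling
        System.out.println(shouldExit(2, 2, 1, null, 1, 3, 5_000_000L, 2)); // false
        // deadline passed: exit regardless of the counts
        System.out.println(shouldExit(1, 2, 1, null, 1, 3, 0L, 2));         // true
    }
}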
- if(pipeline.isFinished() && response.getValue() instanceof Exception - && !(response.getValue() instanceof ObsoleteVersionException)) { - if(response.getValue() instanceof InvalidMetadataException) { - pipelineData.reportException((InvalidMetadataException) response.getValue()); - logger.warn("Received invalid metadata problem after a successful " - + pipeline.getOperation().getSimpleName() - + " call on node " + node.getId() + ", store '" - + pipelineData.getStoreName() + "'"); + if(!responseHandledByMaster) { + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + + key + + "} Master thread did not accept the response: will handle in worker thread"); + } + if(PipelineRoutedStore.isSlopableFailure(response.getValue())) { + if(logger.isDebugEnabled()) + logger.debug("PUT {key:" + key + "} failed on node={id:" + + node.getId() + ",host:" + node.getHost() + "}"); + + if(isHintedHandoffEnabled()) { + boolean triedDelegateSlop = pipelineData.getSynchronizer() + .tryDelegateSlop(node); + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + key + "} triedDelegateSlop: " + + triedDelegateSlop); + } + if(!triedDelegateSlop) { + Slop slop = new Slop(pipelineData.getStoreName(), + Slop.Operation.PUT, + key, + versionedCopy.getValue(), + transforms, + node.getId(), + new Date()); + pipelineData.addFailedNode(node); + if(logger.isDebugEnabled()) + logger.debug("PUT {key:" + key + + "} Start registering Slop(node:" + + node.getId() + ",host:" + node.getHost() + + ")"); + hintedHandoff.sendHintParallel(node, + versionedCopy.getVersion(), + slop); + if(logger.isDebugEnabled()) + logger.debug("PUT {key:" + key + + "} Sent out request to register Slop(node: " + + node.getId() + ",host:" + node.getHost() + + ")"); + } + } } else { - handleResponseError(response, pipeline, failureDetector); + // did not slop because either it's not exception or + // the exception is ignorable + if(logger.isDebugEnabled()) { + if(result instanceof Exception) { + logger.debug("PUT {key:" + + key + + "} will not send hint. Response is ignorable exception: " + + result.getClass().toString()); + } else { + logger.debug("PUT {key:" + key + + "} will not send hint. 
Response is success"); + } + } + } + + if(result instanceof Exception + && !(result instanceof ObsoleteVersionException)) { + if(response.getValue() instanceof InvalidMetadataException) { + pipelineData.reportException((InvalidMetadataException) response.getValue()); + logger.warn("Received invalid metadata problem after a successful " + + pipeline.getOperation().getSimpleName() + + " call on node " + node.getId() + ", store '" + + pipelineData.getStoreName() + "'"); + } else { + handleResponseError(response, pipeline, failureDetector); + } } } } - }; if(logger.isTraceEnabled()) logger.trace("Submitting " + pipeline.getOperation().getSimpleName() - + " request on node " + node.getId()); + + " request on node " + node.getId() + " for key " + key); NonblockingStore store = nonblockingStores.get(node.getId()); store.submitPutRequest(key, versionedCopy, transforms, callback, timeoutMs); } try { - long ellapsedNs = System.nanoTime() - pipelineData.getStartTimeNs(); - long remainingNs = (timeoutMs * Time.NS_PER_MS) - ellapsedNs; - if(remainingNs > 0) - blocksLatch.await(remainingNs, TimeUnit.NANOSECONDS); - } catch(InterruptedException e) { - if(logger.isEnabledFor(Level.WARN)) - logger.warn(e, e); - } + boolean preferredSatisfied = false; + while(true) { + long ellapsedNs = System.nanoTime() - pipelineData.getStartTimeNs(); + long remainingNs = (timeoutMs * Time.NS_PER_MS) - ellapsedNs; + remainingNs = Math.max(0, remainingNs); + // preferred check + if(numResponsesGot >= preferred - 1) { + preferredSatisfied = true; + } - for(Entry> responseEntry: responses.entrySet()) { - Response response = responseEntry.getValue(); - // Treat ObsoleteVersionExceptions as success since such an - // exception means that a higher version was able to write on the - // node. - if(response.getValue() instanceof Exception - && !(response.getValue() instanceof ObsoleteVersionException)) { - if(handleResponseError(response, pipeline, failureDetector)) - return; - } else { - pipelineData.incrementSuccesses(); - failureDetector.recordSuccess(response.getNode(), response.getRequestTime()); - pipelineData.getZoneResponses().add(response.getNode().getZoneId()); - responses.remove(responseEntry.getKey()); - } - } + // quorum check + if(pipelineData.getSuccesses() >= required) { + quorumSatisfied = true; + } - boolean quorumSatisfied = true; - if(pipelineData.getSuccesses() < required) { - long ellapsedNs = System.nanoTime() - pipelineData.getStartTimeNs(); - long remainingNs = (timeoutMs * Time.NS_PER_MS) - ellapsedNs; - if(remainingNs > 0) { - try { - attemptsLatch.await(remainingNs, TimeUnit.NANOSECONDS); - } catch(InterruptedException e) { - if(logger.isEnabledFor(Level.WARN)) - logger.warn(e, e); + // zone check + if(pipelineData.getZonesRequired() == null) { + zonesSatisfied = true; + } else { + int numZonesSatisfied = pipelineData.getZoneResponses().size(); + if(numZonesSatisfied >= (pipelineData.getZonesRequired() + 1)) { + zonesSatisfied = true; + } } - for(Entry> responseEntry: responses.entrySet()) { - Response response = responseEntry.getValue(); - // Treat ObsoleteVersionExceptions as success since such an - // exception means that a higher version was able to write - // on the node. 
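One subtlety the rewrite preserves from the old code: an ObsoleteVersionException on a put is counted as success, since it means some node already holds a higher version of the value. As a one-method sketch, with a stand-in for the real exception type:

class PutResponseClassifierSketch {

    // stand-in for voldemort.versioning.ObsoleteVersionException
    static class ObsoleteVersionException extends RuntimeException {}

    /** A put response counts as success unless it is a non-obsolete exception. */
    static boolean isSuccess(Object result) {
        return !(result instanceof Exception)
               || result instanceof ObsoleteVersionException;
    }

    public static void main(String[] args) {
        System.out.println(isSuccess("ok"));                           // true
        System.out.println(isSuccess(new ObsoleteVersionException())); // true: higher version won
        System.out.println(isSuccess(new RuntimeException("io")));     // false
    }
}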
- if(response.getValue() instanceof Exception - && !(response.getValue() instanceof ObsoleteVersionException)) { - if(handleResponseError(response, pipeline, failureDetector)) - return; - } else { - pipelineData.incrementSuccesses(); - failureDetector.recordSuccess(response.getNode(), response.getRequestTime()); - pipelineData.getZoneResponses().add(response.getNode().getZoneId()); - responses.remove(responseEntry.getKey()); + if(quorumSatisfied && zonesSatisfied && preferredSatisfied || remainingNs <= 0 + || numNodesPendingResponse <= 0) { + pipelineData.getSynchronizer().cutoffHandling(); + break; + } else { + if(logger.isTraceEnabled()) { + logger.trace("PUT {key:" + key + "} trying to poll from queue"); + } + Response response = pipelineData.getSynchronizer() + .responseQueuePoll(remainingNs, + TimeUnit.NANOSECONDS); + processResponse(response, pipeline); + if(logger.isTraceEnabled()) { + logger.trace("PUT {key:" + key + "} tried to poll from queue. Null?: " + + (response == null) + " numResponsesGot:" + numResponsesGot + + " parellelResponseToWait: " + numNodesPendingResponse + + "; preferred-1: " + (preferred - 1) + "; preferredOK: " + + preferredSatisfied + " quromOK: " + quorumSatisfied + + "; zoneOK: " + zonesSatisfied); } + } } - if(pipelineData.getSuccesses() < required) { - pipelineData.setFatalError(new InsufficientOperationalNodesException(required - + " " - + pipeline.getOperation() - .getSimpleName() - + "s required, but only " - + pipelineData.getSuccesses() - + " succeeded", - new ArrayList(pipelineData.getReplicationSet()), - new ArrayList(pipelineData.getNodes()), - new ArrayList(pipelineData.getFailedNodes()), - pipelineData.getFailures())); + // clean leftovers + // a) The main thread did a processResponse, due to which the + // criteria (quorum) was satisfied + // b) After this, the main thread cuts off adding responses to the + // queue by the async callbacks + + // An async callback can be invoked between a and b (this is the + // leftover) + while(!pipelineData.getSynchronizer().responseQueueIsEmpty()) { + Response response = pipelineData.getSynchronizer() + .responseQueuePoll(0, + TimeUnit.NANOSECONDS); + processResponse(response, pipeline); + } + + if(quorumSatisfied && zonesSatisfied) { + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + key + "} succeeded at parellel put stage"); + } + pipelineData.getSynchronizer().disallowDelegateSlop(); + pipeline.addEvent(completeEvent); + } else { + VoldemortException fatalError; + if(!quorumSatisfied) { + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + key + + "} failed due to insufficient nodes. required=" + required + + " success=" + pipelineData.getSuccesses()); + } + fatalError = new InsufficientOperationalNodesException(required + + " " + + pipeline.getOperation() + .getSimpleName() + + "s required, but only " + + pipelineData.getSuccesses() + + " succeeded", + new ArrayList(pipelineData.getReplicationSet()), + new ArrayList(pipelineData.getNodes()), + new ArrayList(pipelineData.getFailedNodes()), + pipelineData.getFailures()); + pipelineData.setFatalError(fatalError); + } else if(!zonesSatisfied) { + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + key + + "} failed due to insufficient zones. 
required=" + + pipelineData.getZonesRequired() + 1 + " success=" + + pipelineData.getZoneResponses().size()); + } + fatalError = new InsufficientZoneResponsesException((pipelineData.getZonesRequired() + 1) + + " " + + pipeline.getOperation() + .getSimpleName() + + "s required zone, but only " + + zonesSatisfied + + " succeeded. Failing nodes : " + + pipelineData.getFailedNodes()); + pipelineData.setFatalError(fatalError); + } pipeline.abort(); - quorumSatisfied = false; + } + } catch(InterruptedException e) { + if(logger.isEnabledFor(Level.WARN)) + logger.warn(e, e); + } catch(NoSuchElementException e) { + if(logger.isEnabledFor(Level.ERROR)) + logger.error("Response Queue is empty. There may be a bug in PerformParallelPutRequest", + e); + } finally { + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + key + "} marking parallel put stage finished"); } } + } - if(quorumSatisfied) { - if(pipelineData.getZonesRequired() != null) { - - int zonesSatisfied = pipelineData.getZoneResponses().size(); - if(zonesSatisfied >= (pipelineData.getZonesRequired() + 1)) { - pipeline.addEvent(completeEvent); - } else { - long timeMs = (System.nanoTime() - pipelineData.getStartTimeNs()) - / Time.NS_PER_MS; - - if((timeoutMs - timeMs) > 0) { - try { - attemptsLatch.await(timeoutMs - timeMs, TimeUnit.MILLISECONDS); - } catch(InterruptedException e) { - if(logger.isEnabledFor(Level.WARN)) - logger.warn(e, e); - } - - for(Entry> responseEntry: responses.entrySet()) { - Response response = responseEntry.getValue(); - if(response.getValue() instanceof Exception) { - if(handleResponseError(response, pipeline, failureDetector)) - return; - } else { - pipelineData.incrementSuccesses(); - failureDetector.recordSuccess(response.getNode(), - response.getRequestTime()); - pipelineData.getZoneResponses().add(response.getNode().getZoneId()); - responses.remove(responseEntry.getKey()); - } - } + /** + * Process the response by reporting proper log and feeding failure + * detectors + * + * @param response + * @param pipeline + */ + private void processResponse(Response response, Pipeline pipeline) { + if(response == null) { + logger.warn("RoutingTimedout on waiting for async ops; parellelResponseToWait: " + + numNodesPendingResponse + "; preferred-1: " + (preferred - 1) + + "; quromOK: " + quorumSatisfied + "; zoneOK: " + zonesSatisfied); + } else { + numNodesPendingResponse = numNodesPendingResponse - 1; + numResponsesGot = numResponsesGot + 1; + if(response.getValue() instanceof Exception + && !(response.getValue() instanceof ObsoleteVersionException)) { + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + key + "} handling async put error"); + } + if(handleResponseError(response, pipeline, failureDetector)) { + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + key + + "} severe async put error, exiting parallel put stage"); } - if(pipelineData.getZoneResponses().size() >= (pipelineData.getZonesRequired() + 1)) { - pipeline.addEvent(completeEvent); - } else { - pipelineData.setFatalError(new InsufficientZoneResponsesException((pipelineData.getZonesRequired() + 1) - + " " - + pipeline.getOperation() - .getSimpleName() - + "s required zone, but only " - + zonesSatisfied - + " succeeded. 
Failing nodes : " - + pipelineData.getFailedNodes())); - pipeline.abort(); - } + return; + } + if(PipelineRoutedStore.isSlopableFailure(response.getValue())) { + pipelineData.getSynchronizer().tryDelegateSlop(response.getNode()); + } + + if(logger.isDebugEnabled()) { + logger.debug("PUT {key:" + key + "} handled async put error"); } } else { - pipeline.addEvent(completeEvent); + pipelineData.incrementSuccesses(); + failureDetector.recordSuccess(response.getNode(), response.getRequestTime()); + pipelineData.getZoneResponses().add(response.getNode().getZoneId()); } } } diff --git a/src/java/voldemort/store/routed/action/PerformParallelRequests.java b/src/java/voldemort/store/routed/action/PerformParallelRequests.java index f0a6c4f1e1..65deeb9226 100644 --- a/src/java/voldemort/store/routed/action/PerformParallelRequests.java +++ b/src/java/voldemort/store/routed/action/PerformParallelRequests.java @@ -90,7 +90,7 @@ public void execute(final Pipeline pipeline) { if(logger.isTraceEnabled()) logger.trace("Attempting " + attempts + " " + pipeline.getOperation().getSimpleName() - + " operations in parallel"); + + " operations in parallel for key " + key); for(int i = 0; i < attempts; i++) { final Node node = nodes.get(i); @@ -104,7 +104,7 @@ public void requestComplete(Object result, long requestTime) { if(logger.isTraceEnabled()) logger.trace(pipeline.getOperation().getSimpleName() + " response received (" + requestTime + " ms.) from node " - + node.getId()); + + node.getId() + "for key " + key); Response response = new Response(node, key, @@ -129,7 +129,7 @@ public void requestComplete(Object result, long requestTime) { logger.warn("Received invalid metadata problem after a successful " + pipeline.getOperation().getSimpleName() + " call on node " + node.getId() + ", store '" - + pipelineData.getStoreName() + "'"); + + pipelineData.getStoreName() + "' for key " + key); } else { handleResponseError(response, pipeline, failureDetector); } @@ -140,7 +140,7 @@ public void requestComplete(Object result, long requestTime) { if(logger.isTraceEnabled()) logger.trace("Submitting " + pipeline.getOperation().getSimpleName() - + " request on node " + node.getId()); + + " request on node " + node.getId() + " for key " + key); NonblockingStore store = nonblockingStores.get(node.getId()); @@ -212,7 +212,8 @@ else if(pipeline.getOperation() == Operation.GET_VERSIONS) logger.debug("Operation " + pipeline.getOperation().getSimpleName() + "failed due to insufficent zone responses, required " + pipelineData.getZonesRequired() + " obtained " - + zonesSatisfied + " " + pipelineData.getZoneResponses()); + + zonesSatisfied + " " + pipelineData.getZoneResponses() + + " for key " + key); } if(this.insufficientZonesEvent != null) { pipeline.addEvent(this.insufficientZonesEvent); diff --git a/src/java/voldemort/store/routed/action/PerformPutHintedHandoff.java b/src/java/voldemort/store/routed/action/PerformPutHintedHandoff.java index a4916f832b..a7f770ab0c 100644 --- a/src/java/voldemort/store/routed/action/PerformPutHintedHandoff.java +++ b/src/java/voldemort/store/routed/action/PerformPutHintedHandoff.java @@ -53,8 +53,8 @@ public PerformPutHintedHandoff(PutPipelineData pipelineData, @Override public void execute(Pipeline pipeline) { Versioned versionedCopy = pipelineData.getVersionedCopy(); - for(Node failedNode: failedNodes) { - int failedNodeId = failedNode.getId(); + for(Node slopFinalDestinationNode: pipelineData.getSynchronizer().getDelegatedSlopDestinations()) { + int failedNodeId = 
slopFinalDestinationNode.getId(); if(versionedCopy == null) { VectorClock clock = (VectorClock) versioned.getVersion(); versionedCopy = new Versioned(versioned.getValue(), @@ -64,8 +64,9 @@ public void execute(Pipeline pipeline) { Version version = versionedCopy.getVersion(); if(logger.isTraceEnabled()) - logger.trace("Performing hinted handoff for node " + failedNode + ", store " - + pipelineData.getStoreName() + " key " + key + ", version " + version); + logger.trace("Performing parallel hinted handoff for node " + slopFinalDestinationNode + + ", store " + pipelineData.getStoreName() + " key " + key + + ", version " + version); Slop slop = new Slop(pipelineData.getStoreName(), Slop.Operation.PUT, @@ -74,7 +75,7 @@ public void execute(Pipeline pipeline) { transforms, failedNodeId, new Date()); - hintedHandoff.sendHintParallel(failedNode, version, slop); + hintedHandoff.sendHintParallel(slopFinalDestinationNode, version, slop); } pipeline.addEvent(completeEvent); } diff --git a/src/java/voldemort/store/routed/action/PerformSerialPutRequests.java b/src/java/voldemort/store/routed/action/PerformSerialPutRequests.java index 08a25c7bc7..f21ff0adb5 100644 --- a/src/java/voldemort/store/routed/action/PerformSerialPutRequests.java +++ b/src/java/voldemort/store/routed/action/PerformSerialPutRequests.java @@ -27,6 +27,7 @@ import voldemort.store.Store; import voldemort.store.routed.Pipeline; import voldemort.store.routed.Pipeline.Event; +import voldemort.store.routed.PipelineRoutedStore; import voldemort.store.routed.PutPipelineData; import voldemort.utils.ByteArray; import voldemort.utils.ByteUtils; @@ -114,6 +115,7 @@ public void execute(Pipeline pipeline) { pipelineData.setMaster(node); pipelineData.setVersionedCopy(versionedCopy); pipelineData.getZoneResponses().add(node.getZoneId()); + currentNode++; break; } catch(Exception e) { long requestTime = (System.nanoTime() - start) / Time.NS_PER_MS; @@ -124,11 +126,19 @@ public void execute(Pipeline pipeline) { + (System.nanoTime() - start) + " ns" + " (keyRef: " + System.identityHashCode(key) + ")"); + if(PipelineRoutedStore.isSlopableFailure(e)) { + pipelineData.getSynchronizer().tryDelegateSlop(node); + } if(handleResponseError(e, node, requestTime, pipeline, failureDetector)) return; } } + if(logger.isTraceEnabled()) { + logger.trace("PUT {key:" + key + "} currentNode=" + currentNode + " nodes.size()=" + + nodes.size()); + } + if(pipelineData.getSuccesses() < 1) { List failures = pipelineData.getFailures(); pipelineData.setFatalError(new InsufficientOperationalNodesException("No master node succeeded!", @@ -138,8 +148,6 @@ public void execute(Pipeline pipeline) { return; } - currentNode++; - // There aren't any more requests to make... 
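A slop, as constructed in the hunks above, is just the failed write bundled with its intended destination, so that any live node can store it and replay it later. A minimal mirror of the fields passed to the Slop constructor; the class below is illustrative, not the real voldemort.store.slop.Slop:

import java.util.Date;

class SlopSketch {

    enum Op { PUT, DELETE }

    // the fields handed to the real Slop constructor in the hunks above
    final String storeName;
    final Op operation;
    final byte[] key;
    final byte[] value;          // null for DELETE
    final byte[] transforms;
    final int destinationNodeId; // the node the write was originally meant for
    final Date arrived;

    SlopSketch(String storeName, Op operation, byte[] key, byte[] value,
               byte[] transforms, int destinationNodeId, Date arrived) {
        this.storeName = storeName;
        this.operation = operation;
        this.key = key;
        this.value = value;
        this.transforms = transforms;
        this.destinationNodeId = destinationNodeId;
        this.arrived = arrived;
    }

    public static void main(String[] args) {
        SlopSketch slop = new SlopSketch("test-store", Op.PUT, new byte[] { 1 },
                                         new byte[] { 42 }, null, 3, new Date());
        System.out.println("replay later against node " + slop.destinationNodeId);
    }
}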
if(currentNode == nodes.size()) { if(pipelineData.getSuccesses() < required) { diff --git a/src/java/voldemort/store/serialized/SerializingStorageEngine.java b/src/java/voldemort/store/serialized/SerializingStorageEngine.java index 8ab9843b4b..f3f7e4b3b8 100644 --- a/src/java/voldemort/store/serialized/SerializingStorageEngine.java +++ b/src/java/voldemort/store/serialized/SerializingStorageEngine.java @@ -16,6 +16,8 @@ package voldemort.store.serialized; +import java.util.List; + import voldemort.serialization.Serializer; import voldemort.store.StorageEngine; import voldemort.utils.ByteArray; @@ -167,4 +169,11 @@ public boolean beginBatchModifications() { public boolean endBatchModifications() { return false; } + + @Override + public List> multiVersionPut(K key, List> values) { + // This is used only for slops as of now. + throw new UnsupportedOperationException("multiVersionPut is not supported for " + + this.getClass().getName()); + } } diff --git a/src/java/voldemort/store/slop/HintedHandoff.java b/src/java/voldemort/store/slop/HintedHandoff.java index 00182e4250..5143a71036 100644 --- a/src/java/voldemort/store/slop/HintedHandoff.java +++ b/src/java/voldemort/store/slop/HintedHandoff.java @@ -16,6 +16,8 @@ package voldemort.store.slop; +import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -98,80 +100,97 @@ public HintedHandoff(FailureDetector failureDetector, * @see #sendHintSerial(voldemort.cluster.Node, * voldemort.versioning.Version, Slop) */ - public void sendHintParallel(final Node failedNode, final Version version, final Slop slop) { - final ByteArray slopKey = slop.makeKey(); - Versioned slopVersioned = new Versioned(slopSerializer.toBytes(slop), - version); + public void sendHintParallel(Node failedNode, Version version, Slop slop) { + List nodes = new LinkedList(); + nodes.addAll(handoffStrategy.routeHint(failedNode)); + if(logger.isDebugEnabled()) { + List nodeIds = new ArrayList(); + for(Node node: nodes) { + nodeIds.add(node.getId()); + } + logger.debug("Hint preference list: " + nodeIds.toString()); + } + sendOneAsyncHint(slop.makeKey(), new Versioned(slopSerializer.toBytes(slop), + version), nodes); + } - for(final Node node: handoffStrategy.routeHint(failedNode)) { - int nodeId = node.getId(); + /** + * Sends one async hint: the callback that handles the requestComplete + * event from the NIO selector manager retries, passing itself as the + * callback, until all nodes are exhausted + * + * @param slopKey + * @param slopVersioned + * @param nodesToTry List of nodes to try to contact. Will become shorter + * after each callback + */ + private void sendOneAsyncHint(final ByteArray slopKey, + final Versioned slopVersioned, + final List nodesToTry) { + Node nodeToHostHint = null; + boolean foundNode = false; + while(nodesToTry.size() > 0) { + nodeToHostHint = nodesToTry.remove(0); + if(!failedNodes.contains(nodeToHostHint) && failureDetector.isAvailable(nodeToHostHint)) { + foundNode = true; + break; + } + } + if(!foundNode) { + Slop slop = slopSerializer.toObject(slopVersioned.getValue()); + logger.error("Trying to send an async hint but used up all nodes. 
key: " + + slop.getKey() + " version: " + slopVersioned.getVersion().toString()); + return; + } + final Node node = nodeToHostHint; + int nodeId = node.getId(); - if(logger.isDebugEnabled()) - logger.debug("Sending an async hint to " + nodeId); + NonblockingStore nonblockingStore = nonblockingSlopStores.get(nodeId); + Utils.notNull(nonblockingStore); - if(!failedNodes.contains(node) && failureDetector.isAvailable(node)) { - NonblockingStore nonblockingStore = nonblockingSlopStores.get(nodeId); - Utils.notNull(nonblockingStore); - final long startNs = System.nanoTime(); + final Long startNs = System.nanoTime(); + NonblockingStoreCallback callback = new NonblockingStoreCallback() { - if(logger.isDebugEnabled()) - logger.debug("Slop attempt to write " + slop.getKey() + " for " + failedNode - + " to node " + node); - - NonblockingStoreCallback callback = new NonblockingStoreCallback() { - - public void requestComplete(Object result, long requestTime) { - Response response = new Response(node, - slopKey, - result, - requestTime); - if(response.getValue() instanceof Exception) { - if(response.getValue() instanceof ObsoleteVersionException) { - // Ignore - - // TODO: Treating ObsoleteVersionException as - // "success", but there is no logger.debug to - // note that the slop was written, nor is there - // a failureDetector.recordSuccess invocation. - } else { - // Use the blocking approach - if(!failedNodes.contains(node)) - failedNodes.add(node); - if(response.getValue() instanceof UnreachableStoreException) { - UnreachableStoreException use = (UnreachableStoreException) response.getValue(); - - if(logger.isDebugEnabled()) { - logger.debug("Write of key " + slop.getKey() + " for " - + failedNode + " to node " + node - + " failed due to unreachable: " - + use.getMessage()); - } - - failureDetector.recordException(node, - (System.nanoTime() - startNs) - / Time.NS_PER_MS, - use); - } - sendHintSerial(failedNode, version, slop); - } - return; + @Override + public void requestComplete(Object result, long requestTime) { + Slop slop = null; + boolean loggerDebugEnabled = logger.isDebugEnabled(); + if(loggerDebugEnabled) { + slop = slopSerializer.toObject(slopVersioned.getValue()); + } + Response response = new Response(node, + slopKey, + result, + requestTime); + if(response.getValue() instanceof Exception + && !(response.getValue() instanceof ObsoleteVersionException)) { + if(!failedNodes.contains(node)) + failedNodes.add(node); + if(response.getValue() instanceof UnreachableStoreException) { + UnreachableStoreException use = (UnreachableStoreException) response.getValue(); + + if(loggerDebugEnabled) { + logger.debug("Write of key " + slop.getKey() + " for " + + slop.getNodeId() + " to node " + node + + " failed due to unreachable: " + use.getMessage()); } - if(logger.isDebugEnabled()) - logger.debug("Slop write of key " + slop.getKey() + " for " - + failedNode + " to node " + node + " succeeded in " - + (System.nanoTime() - startNs) + " ns"); + failureDetector.recordException(node, (System.nanoTime() - startNs) + / Time.NS_PER_MS, use); + } + sendOneAsyncHint(slopKey, slopVersioned, nodesToTry); + } - failureDetector.recordSuccess(node, (System.nanoTime() - startNs) - / Time.NS_PER_MS); + if(loggerDebugEnabled) + logger.debug("Slop write of key " + slop.getKey() + " for node " + + slop.getNodeId() + " to node " + node + " succeeded in " + + (System.nanoTime() - startNs) + " ns"); - } - }; + failureDetector.recordSuccess(node, (System.nanoTime() - startNs) / Time.NS_PER_MS); - 
nonblockingStore.submitPutRequest(slopKey, slopVersioned, null, callback, timeoutMs); - break; } - } + }; + nonblockingStore.submitPutRequest(slopKey, slopVersioned, null, callback, timeoutMs); } /** @@ -184,14 +203,15 @@ public void requestComplete(Object result, long requestTime) { * @param slop The hint * @return True if persisted on another node, false otherwise */ + @Deprecated public boolean sendHintSerial(Node failedNode, Version version, Slop slop) { boolean persisted = false; for(Node node: handoffStrategy.routeHint(failedNode)) { int nodeId = node.getId(); - if(logger.isDebugEnabled()) - logger.debug("Trying to send hint to " + nodeId); if(!failedNodes.contains(node) && failureDetector.isAvailable(node)) { + if(logger.isDebugEnabled()) + logger.debug("Trying to send hint to " + nodeId + " for key " + slop.getKey()); Store slopStore = slopStores.get(nodeId); Utils.notNull(slopStore); long startNs = System.nanoTime(); @@ -215,16 +235,24 @@ public boolean sendHintSerial(Node failedNode, Version version, Slop slop) { } catch(UnreachableStoreException e) { failureDetector.recordException(node, (System.nanoTime() - startNs) / Time.NS_PER_MS, e); - logger.warn("Error during hinted handoff", e); + logger.warn("Error during hinted handoff. Will try another node", e); + } catch(IllegalStateException e) { + logger.warn("Error during hinted handoff. Will try another node", e); } catch(ObsoleteVersionException e) { logger.debug(e, e); + } catch(Exception e) { + logger.error("Unknown exception. Will try another node" + e); } if(logger.isDebugEnabled()) logger.debug("Slop write of key " + slop.getKey() + " (keyRef: " + System.identityHashCode(slop.getKey()) + ") for " + failedNode - + " to node " + node + " succeeded in " - + (System.nanoTime() - startNs) + " ns"); + + " to node " + node + (persisted ? 
" succeeded" : " failed") + + " in " + (System.nanoTime() - startNs) + " ns"); + } else { + if(logger.isDebugEnabled()) { + logger.debug("Skipping node " + nodeId); + } } } diff --git a/src/java/voldemort/store/slop/Slop.java b/src/java/voldemort/store/slop/Slop.java index f3fd513b29..c382cb806e 100644 --- a/src/java/voldemort/store/slop/Slop.java +++ b/src/java/voldemort/store/slop/Slop.java @@ -148,7 +148,8 @@ public int hashCode() { @Override public String toString() { return "Slop(storeName = " + storeName + ", operation = " + operation + ", key = " + key - + ", value = " + Arrays.toString(value) + ", nodeId = " + nodeId + ", arrived = " + arrived + ")"; + + ", value = " + ByteUtils.toHexString(value) + ", nodeId = " + nodeId + + ", arrived = " + arrived + ")"; } } diff --git a/src/java/voldemort/store/socket/SocketStore.java b/src/java/voldemort/store/socket/SocketStore.java index a65673aa05..93e520d1f5 100644 --- a/src/java/voldemort/store/socket/SocketStore.java +++ b/src/java/voldemort/store/socket/SocketStore.java @@ -284,7 +284,12 @@ private T request(ClientRequest delegate, String operationName) { clientRequestExecutor.addClientRequest(blockingClientRequest, timeoutMs, System.nanoTime() - startTimeNs); - blockingClientRequest.await(); + + boolean awaitResult = blockingClientRequest.await(); + + if(awaitResult == false) { + blockingClientRequest.timeOut(); + } if(logger.isDebugEnabled()) debugMsgStr += "success"; diff --git a/src/java/voldemort/store/socket/clientrequest/BlockingClientRequest.java b/src/java/voldemort/store/socket/clientrequest/BlockingClientRequest.java index 59640d9fe7..cb947bd7b3 100644 --- a/src/java/voldemort/store/socket/clientrequest/BlockingClientRequest.java +++ b/src/java/voldemort/store/socket/clientrequest/BlockingClientRequest.java @@ -57,8 +57,8 @@ public boolean isComplete() { return delegate.isComplete() && latch.getCount() == 0; } - public void await() throws InterruptedException { - latch.await(timeoutMs, TimeUnit.MILLISECONDS); + public boolean await() throws InterruptedException { + return latch.await(timeoutMs, TimeUnit.MILLISECONDS); } public T getResult() throws VoldemortException, IOException { diff --git a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutor.java b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutor.java index 1111b57b0c..153e7a8169 100644 --- a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutor.java +++ b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutor.java @@ -108,13 +108,14 @@ public synchronized void addClientRequest(ClientRequest clientRequest, if(timeoutMs == -1) { this.expiration = -1; } else { + long nowNs = System.nanoTime(); if(elapsedNs > (Time.NS_PER_MS * timeoutMs)) { - this.expiration = System.nanoTime(); + this.expiration = nowNs; } else { - this.expiration = System.nanoTime() + (Time.NS_PER_MS * timeoutMs) - elapsedNs; + this.expiration = nowNs + (Time.NS_PER_MS * timeoutMs) - elapsedNs; } - if(this.expiration < System.nanoTime()) + if(this.expiration < nowNs) throw new IllegalArgumentException("timeout " + timeoutMs + " not valid"); } diff --git a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorPool.java b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorPool.java index 7d0e75c251..895af6101e 100644 --- a/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorPool.java +++ b/src/java/voldemort/store/socket/clientrequest/ClientRequestExecutorPool.java @@ -131,6 +131,17 @@ public 
ClientRequestExecutorFactory getFactory() { return factory; } + /*** + * Create a new socket store to talk to a given server for a specific store + * + * Note: IGNORE_CHECKS will only be honored for Protobuf request format + * + * @param storeName + * @param hostName + * @param port + * @param requestFormatType protocol to use + * @param requestRoutingType routed/ignore checks/normal + */ @Override public SocketStore create(String storeName, String hostName, diff --git a/src/java/voldemort/utils/AbstractConsistencyFixer.java b/src/java/voldemort/utils/AbstractConsistencyFixer.java index c13dc1529a..d0cc8a7c3f 100644 --- a/src/java/voldemort/utils/AbstractConsistencyFixer.java +++ b/src/java/voldemort/utils/AbstractConsistencyFixer.java @@ -26,6 +26,7 @@ import voldemort.VoldemortException; import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.QueryKeyResult; +import voldemort.routing.StoreRoutingPlan; import voldemort.store.routed.NodeValue; import voldemort.store.routed.ReadRepairer; import voldemort.utils.ConsistencyFix.BadKey; @@ -51,7 +52,7 @@ abstract class AbstractConsistencyFixer { private static final int fakeNodeID = Integer.MIN_VALUE; protected final BadKey badKey; - protected final StoreInstance storeInstance; + protected final StoreRoutingPlan storeInstance; protected final AdminClient adminClient; protected final QueryKeyResult orphanedValues; @@ -62,7 +63,7 @@ abstract class AbstractConsistencyFixer { * @param consistencyFix * @param badKeyQOut */ - AbstractConsistencyFixer(BadKey badKey, StoreInstance storeInstance, AdminClient adminClient) { + AbstractConsistencyFixer(BadKey badKey, StoreRoutingPlan storeInstance, AdminClient adminClient) { this(badKey, storeInstance, adminClient, null); } @@ -77,7 +78,7 @@ abstract class AbstractConsistencyFixer { * @param orphanedValues Set to null if no orphaned values to be included. 
*/ AbstractConsistencyFixer(BadKey badKey, - StoreInstance storeInstance, + StoreRoutingPlan storeInstance, AdminClient adminClient, QueryKeyResult orphanedValues) { this.badKey = badKey; diff --git a/src/java/voldemort/utils/ByteArray.java b/src/java/voldemort/utils/ByteArray.java index c6ccf86ada..397152f956 100644 --- a/src/java/voldemort/utils/ByteArray.java +++ b/src/java/voldemort/utils/ByteArray.java @@ -41,7 +41,7 @@ public boolean equals(Object obj) { @Override public String toString() { - return Arrays.toString(underlying); + return ByteUtils.toHexString(underlying); } /** diff --git a/src/java/voldemort/utils/ClusterForkLiftTool.java b/src/java/voldemort/utils/ClusterForkLiftTool.java index 5f98e36aae..cce77a1203 100644 --- a/src/java/voldemort/utils/ClusterForkLiftTool.java +++ b/src/java/voldemort/utils/ClusterForkLiftTool.java @@ -28,6 +28,7 @@ import voldemort.client.protocol.admin.StreamingClientConfig; import voldemort.cluster.Cluster; import voldemort.cluster.Node; +import voldemort.routing.StoreRoutingPlan; import voldemort.store.StoreDefinition; import voldemort.store.StoreUtils; import voldemort.versioning.ChainedResolver; @@ -108,6 +109,18 @@ */ public class ClusterForkLiftTool implements Runnable { + /* + * different modes available with the forklift tool + */ + enum ForkLiftTaskMode { + global_resolution, /* Fetch data from all partitions and do resolution */ + primary_resolution, /* + * Fetch data from primary partition and do + * resolution + */ + no_resolution /* fetch data from primary parition and do no resolution */ + } + private static Logger logger = Logger.getLogger(ClusterForkLiftTool.class); private static final int DEFAULT_MAX_PUTS_PER_SEC = 500; private static final int DEFAULT_PROGRESS_PERIOD_OPS = 100000; @@ -121,7 +134,7 @@ public class ClusterForkLiftTool implements Runnable { private final int progressOps; private final HashMap srcStoreDefMap; private final List partitionList; - private final boolean globalResolution; + private final ForkLiftTaskMode mode; public ClusterForkLiftTool(String srcBootstrapUrl, String dstBootstrapUrl, @@ -130,7 +143,7 @@ public ClusterForkLiftTool(String srcBootstrapUrl, int progressOps, List storesList, List partitions, - boolean globalResolution) { + ForkLiftTaskMode mode) { // set up AdminClient on source cluster this.srcAdminClient = new AdminClient(srcBootstrapUrl, new AdminClientConfig(), @@ -142,6 +155,7 @@ public ClusterForkLiftTool(String srcBootstrapUrl, props.put("streaming.platform.throttle.qps", maxPutsPerSecond); StreamingClientConfig config = new StreamingClientConfig(props); this.dstStreamingClient = new StreamingClient(config); + this.mode = mode; // determine and verify final list of stores to be forklifted over if(storesList != null) { @@ -173,7 +187,7 @@ public ClusterForkLiftTool(String srcBootstrapUrl, // set up thread pool to parallely forklift partitions this.workerPool = Executors.newFixedThreadPool(partitionParallelism); this.progressOps = progressOps; - this.globalResolution = globalResolution; + } private HashMap checkStoresOnBothSides() { @@ -210,10 +224,10 @@ abstract class SinglePartitionForkLiftTask { protected int partitionId; protected CountDownLatch latch; - protected StoreInstance storeInstance; + protected StoreRoutingPlan storeInstance; protected String workName; - SinglePartitionForkLiftTask(StoreInstance storeInstance, + SinglePartitionForkLiftTask(StoreRoutingPlan storeInstance, int partitionId, CountDownLatch latch) { this.partitionId = partitionId; @@ -236,7 +250,7 @@ abstract 
class SinglePartitionForkLiftTask { class SinglePartitionGloballyResolvingForkLiftTask extends SinglePartitionForkLiftTask implements Runnable { - SinglePartitionGloballyResolvingForkLiftTask(StoreInstance storeInstance, + SinglePartitionGloballyResolvingForkLiftTask(StoreRoutingPlan storeInstance, int partitionId, CountDownLatch latch) { super(storeInstance, partitionId, latch); @@ -337,7 +351,7 @@ private Map doReads(final List nodeIdList, class SinglePartitionPrimaryResolvingForkLiftTask extends SinglePartitionForkLiftTask implements Runnable { - SinglePartitionPrimaryResolvingForkLiftTask(StoreInstance storeInstance, + SinglePartitionPrimaryResolvingForkLiftTask(StoreRoutingPlan storeInstance, int partitionId, CountDownLatch latch) { super(storeInstance, partitionId, latch); @@ -412,6 +426,58 @@ public void run() { } } + /** + * Simply fetches the data for the partition from the primary replica and + * writes it into the destination cluster without resolving any of the + * conflicting values + * + */ + class SinglePartitionNoResolutionForkLiftTask extends SinglePartitionForkLiftTask implements + Runnable { + + SinglePartitionNoResolutionForkLiftTask(StoreRoutingPlan storeInstance, + int partitionId, + CountDownLatch latch) { + super(storeInstance, partitionId, latch); + } + + @Override + public void run() { + String storeName = this.storeInstance.getStoreDefinition().getName(); + long entriesForkLifted = 0; + try { + logger.info(workName + "Starting processing"); + Iterator>> entryItr = srcAdminClient.bulkFetchOps.fetchEntries(storeInstance.getNodeIdForPartitionId(this.partitionId), + storeName, + Lists.newArrayList(this.partitionId), + null, + true); + + while(entryItr.hasNext()) { + Pair> record = entryItr.next(); + ByteArray key = record.getFirst(); + Versioned versioned = record.getSecond(); + dstStreamingClient.streamingPut(key, versioned); + entriesForkLifted++; + if(entriesForkLifted % progressOps == 0) { + logger.info(workName + " fork lifted " + entriesForkLifted + + " entries successfully"); + } + + } + logger.info(workName + "Completed processing " + entriesForkLifted + " records"); + + } catch(Exception e) { + // if for some reason this partition fails, we will have retry + // again for those partitions alone. 
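SinglePartitionNoResolutionForkLiftTask above is deliberately the simplest of the three modes: drain the primary replica's entry iterator and stream every record to the destination, logging progress every progressOps records. A self-contained sketch of that copy loop; the String[] records and the streamingPut stub are stand-ins for the real AdminClient iterator and StreamingClient call:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class NoResolutionForkLiftSketch {

    public static void main(String[] args) {
        final int progressOps = 3; // log progress every N records (tiny for the demo)

        // Hypothetical source data; the real task obtains an iterator from
        // adminClient.bulkFetchOps.fetchEntries(nodeId, store, partitions, ...).
        List<String[]> sourceEntries = new ArrayList<>();
        for(int i = 0; i < 10; i++)
            sourceEntries.add(new String[] { "key" + i, "value" + i });

        long entriesForkLifted = 0;
        Iterator<String[]> entryItr = sourceEntries.iterator();
        while(entryItr.hasNext()) {
            String[] record = entryItr.next();
            streamingPut(record[0], record[1]); // write straight to the destination
            entriesForkLifted++;
            if(entriesForkLifted % progressOps == 0)
                System.out.println("fork lifted " + entriesForkLifted + " entries");
        }
        System.out.println("completed " + entriesForkLifted + " records");
    }

    // Hypothetical stand-in for dstStreamingClient.streamingPut(key, versioned);
    // no conflict resolution happens anywhere on this path.
    static void streamingPut(String key, String value) {}
}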
+ logger.error(workName + "Error forklifting data ", e); + } finally { + latch.countDown(); + } + + } + } + @Override public void run() { final Cluster srcCluster = srcAdminClient.getAdminClientCluster(); @@ -436,23 +502,29 @@ public Object call() throws Exception { }, true); final CountDownLatch latch = new CountDownLatch(srcCluster.getNumberOfPartitions()); - StoreInstance storeInstance = new StoreInstance(srcCluster, - srcStoreDefMap.get(store)); + StoreRoutingPlan storeInstance = new StoreRoutingPlan(srcCluster, + srcStoreDefMap.get(store)); // submit work on every partition that is to be forklifted for(Integer partitionId: partitionList) { - if(this.globalResolution) { + if(this.mode == ForkLiftTaskMode.global_resolution) { // do thorough global resolution across replicas SinglePartitionGloballyResolvingForkLiftTask work = new SinglePartitionGloballyResolvingForkLiftTask(storeInstance, partitionId, latch); workerPool.submit(work); - } else { + } else if(this.mode == ForkLiftTaskMode.primary_resolution) { // do the less cleaner, but much faster route SinglePartitionPrimaryResolvingForkLiftTask work = new SinglePartitionPrimaryResolvingForkLiftTask(storeInstance, partitionId, latch); workerPool.submit(work); + } else if(this.mode == ForkLiftTaskMode.no_resolution) { + // do the less cleaner, but much faster route + SinglePartitionNoResolutionForkLiftTask work = new SinglePartitionNoResolutionForkLiftTask(storeInstance, + partitionId, + latch); + workerPool.submit(work); } } @@ -522,8 +594,10 @@ private static OptionParser getParser() { .withRequiredArg() .describedAs("partitionParallelism") .ofType(Integer.class); - parser.accepts("global-resolution", - "Determines if a thorough global resolution needs to be done, by comparing all replicas. [Default: Fetch from primary alone ]"); + parser.accepts("mode", + "Determines if a thorough global resolution needs to be done, by comparing all replicas. 
[Default: " + + ForkLiftTaskMode.primary_resolution.toString() + + " Fetch from primary alone ]"); return parser; } @@ -577,6 +651,15 @@ public static void main(String[] args) throws Exception { progressOps = (Integer) options.valueOf("progress-period-ops"); } + ForkLiftTaskMode mode; + mode = ForkLiftTaskMode.primary_resolution; + if(options.has("mode")) { + mode = Utils.getEnumFromString(ForkLiftTaskMode.class, (String) options.valueOf("mode")); + if(mode == null) + mode = ForkLiftTaskMode.primary_resolution; + + } + ClusterForkLiftTool forkLiftTool = new ClusterForkLiftTool(srcBootstrapUrl, dstBootstrapUrl, maxPutsPerSecond, @@ -584,9 +667,10 @@ public static void main(String[] args) throws Exception { progressOps, storesList, partitions, - options.has("global-resolution")); + mode); forkLiftTool.run(); // TODO cleanly shut down the hanging threadpool System.exit(0); } + } diff --git a/src/java/voldemort/utils/ClusterInstance.java b/src/java/voldemort/utils/ClusterInstance.java index c2808250e8..b9500d922c 100644 --- a/src/java/voldemort/utils/ClusterInstance.java +++ b/src/java/voldemort/utils/ClusterInstance.java @@ -23,6 +23,7 @@ import java.util.Set; import voldemort.cluster.Cluster; +import voldemort.routing.StoreRoutingPlan; import voldemort.store.StoreDefinition; import com.google.common.collect.Maps; @@ -146,7 +147,7 @@ public Pair analyzeBalanceVerbose() { } for(StoreDefinition storeDefinition: uniqueStores.keySet()) { - StoreInstance storeInstance = new StoreInstance(cluster, storeDefinition); + StoreRoutingPlan storeInstance = new StoreRoutingPlan(cluster, storeDefinition); builder.append("\n"); builder.append("Store exemplar: " + storeDefinition.getName() + "\n"); diff --git a/src/java/voldemort/utils/ConsistencyFix.java b/src/java/voldemort/utils/ConsistencyFix.java index 9615b0cdbc..ba8bdebcf5 100644 --- a/src/java/voldemort/utils/ConsistencyFix.java +++ b/src/java/voldemort/utils/ConsistencyFix.java @@ -44,6 +44,7 @@ import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.client.protocol.admin.QueryKeyResult; import voldemort.cluster.Cluster; +import voldemort.routing.StoreRoutingPlan; import voldemort.store.StoreDefinition; import voldemort.versioning.ClockEntry; import voldemort.versioning.VectorClock; @@ -58,7 +59,7 @@ public class ConsistencyFix { private final String storeName; private final AdminClient adminClient; - private final StoreInstance storeInstance; + private final StoreRoutingPlan storeInstance; private final Stats stats; private final long perServerQPSLimit; private final ConcurrentMap putThrottlers; @@ -83,7 +84,7 @@ public class ConsistencyFix { storeName); logger.info("Store definition for store " + storeName + " has been determined."); - storeInstance = new StoreInstance(cluster, storeDefinition); + storeInstance = new StoreRoutingPlan(cluster, storeDefinition); stats = new Stats(progressBar); @@ -97,7 +98,7 @@ public String getStoreName() { return storeName; } - public StoreInstance getStoreInstance() { + public StoreRoutingPlan getStoreInstance() { return storeInstance; } diff --git a/src/java/voldemort/utils/KeyLocationValidation.java b/src/java/voldemort/utils/KeyLocationValidation.java deleted file mode 100644 index 274de5cdcc..0000000000 --- a/src/java/voldemort/utils/KeyLocationValidation.java +++ /dev/null @@ -1,64 +0,0 @@ -package voldemort.utils; - -import java.util.List; - -import voldemort.client.protocol.RequestFormatType; -import voldemort.cluster.Cluster; -import voldemort.server.RequestRoutingType; -import 
voldemort.store.Store; -import voldemort.store.StoreDefinition; -import voldemort.store.socket.SocketStoreFactory; -import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; -import voldemort.versioning.Versioned; - -public class KeyLocationValidation { - - private final int nodeId; - private final ByteArray keyList; - private final Cluster cluster; - private final StoreDefinition storeDef; - - public KeyLocationValidation(Cluster cluster, - int nodeId, - StoreDefinition storeDef, - ByteArray keyList) { - this.nodeId = nodeId; - this.keyList = keyList; - this.cluster = cluster; - this.storeDef = storeDef; - } - - /* - * Validate location of the 'keyList' - * - * @param positiveTest Indicates how to validate True: Positive test (the - * keys should be present on nodeId). False : Negative test (the keys should - * not be present on nodeId) - */ - public boolean validate(boolean positiveTest) { - boolean retVal = false; - - SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(2, - 10000, - 100000, - 32 * 1024); - // Cache connections to all nodes for this store and given node - Store socketStore = socketStoreFactory.create(storeDef.getName(), - cluster.getNodeById(nodeId) - .getHost(), - cluster.getNodeById(nodeId) - .getSocketPort(), - RequestFormatType.PROTOCOL_BUFFERS, - RequestRoutingType.IGNORE_CHECKS); - List> value = socketStore.get(keyList, null); - - if(!positiveTest && (value == null || value.size() == 0)) { - retVal = true; - } else if(value != null && value.size() != 0) { - retVal = true; - } - - socketStore.close(); - return retVal; - } -} diff --git a/src/java/voldemort/utils/KeySamplerCLI.java b/src/java/voldemort/utils/KeySamplerCLI.java index c8464a16ad..5e6b862ed5 100644 --- a/src/java/voldemort/utils/KeySamplerCLI.java +++ b/src/java/voldemort/utils/KeySamplerCLI.java @@ -21,10 +21,12 @@ import java.io.Writer; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -65,7 +67,7 @@ public class KeySamplerCLI { private final AdminClient adminClient; private final Cluster cluster; private final List storeDefinitions; - private final Map storeNameToKeyStringsMap; + private final Set storeNameSet; private final String outDir; @@ -90,7 +92,7 @@ public KeySamplerCLI(String url, this.adminClient = new AdminClient(url, new AdminClientConfig(), new ClientConfig()); this.cluster = adminClient.getAdminClientCluster(); this.storeDefinitions = adminClient.metadataMgmtOps.getRemoteStoreDefList(0).getValue(); - this.storeNameToKeyStringsMap = new HashMap(); + this.storeNameSet = new HashSet(); for(StoreDefinition storeDefinition: storeDefinitions) { String storeName = storeDefinition.getName(); if(storeNames != null) { @@ -101,13 +103,13 @@ public KeySamplerCLI(String url, continue; } } - this.storeNameToKeyStringsMap.put(storeName, new StringBuilder()); + this.storeNameSet.add(storeName); } if(storeNames != null) { List badStoreNames = new LinkedList(); for(String storeName: storeNames) { - if(!this.storeNameToKeyStringsMap.keySet().contains(storeName)) { + if(!this.storeNameSet.contains(storeName)) { badStoreNames.add(storeName); } } @@ -129,7 +131,7 @@ public KeySamplerCLI(String url, public boolean sampleStores() { for(StoreDefinition storeDefinition: storeDefinitions) { - 
if(storeNameToKeyStringsMap.keySet().contains(storeDefinition.getName())) { + if(storeNameSet.contains(storeDefinition.getName())) { if(!sampleStore(storeDefinition)) { return false; } @@ -141,30 +143,31 @@ public boolean sampleStores() { public static class NodeSampleResult { public final boolean success; - public final String keysString; + public final Exception exception; - NodeSampleResult(boolean success, String keysString) { + NodeSampleResult(boolean success, Exception exception) { this.success = success; - this.keysString = keysString; + this.exception = exception; } } - public class NodeSampler implements Callable { + public class SampleNodeTask implements Callable { private final Node node; private final StoreDefinition storeDefinition; private final EventThrottler throttler; + private final Writer writer; - NodeSampler(Node node, StoreDefinition storeDefinition) { + public SampleNodeTask(Node node, StoreDefinition storeDefinition, Writer writer) { this.node = node; this.storeDefinition = storeDefinition; this.throttler = new EventThrottler(keysPerSecondLimit); + this.writer = writer; } @Override public NodeSampleResult call() throws Exception { String storeName = storeDefinition.getName(); - StringBuilder hexKeysString = new StringBuilder(); String nodeTag = node.getId() + " [" + node.getHost() + "]"; List nodePartitionIds = new ArrayList(node.getPartitionIds()); @@ -173,7 +176,7 @@ public NodeSampleResult call() throws Exception { if(nodePartitionIds.size() == 0) { logger.info("No partitions to sample for store '" + storeName + "' on node " + nodeTag); - return new NodeSampleResult(true, hexKeysString.toString()); + return new NodeSampleResult(true, null); } } @@ -195,7 +198,10 @@ public NodeSampleResult call() throws Exception { while(fetchIterator.hasNext()) { ByteArray key = fetchIterator.next(); String hexKeyString = ByteUtils.toHexString(key.get()); - hexKeysString.append(hexKeyString + "\n"); + // locking to prevent garbled output from multiple threads + synchronized(this.writer) { + writer.append(hexKeyString + "\n"); + } keyCount++; throttler.maybeThrottle(1); @@ -220,11 +226,13 @@ public NodeSampleResult call() throws Exception { } logger.info("Finished sample --- " + infoTag); - return new NodeSampleResult(true, hexKeysString.toString()); + return new NodeSampleResult(true, null); } catch(VoldemortException ve) { logger.error("Failed to sample --- " + infoTag + " --- VoldemortException caught (" - + ve.getMessage() + ") caused by (" + ve.getCause().getMessage() + ")"); - throw ve; + + ve.getMessage() + ") caused by (" + + ve.getCause().getMessage() + ")", + ve); + return new NodeSampleResult(false, ve); } } } @@ -246,8 +254,9 @@ public boolean sampleStore(StoreDefinition storeDefinition) { Map> results = new HashMap>(); for(Node node: cluster.getNodes()) { - Future future = nodeSamplerService.submit(new NodeSampler(node, - storeDefinition)); + Future future = nodeSamplerService.submit(new SampleNodeTask(node, + storeDefinition, + keyWriter)); results.put(node, future); } @@ -261,34 +270,33 @@ public boolean sampleStore(StoreDefinition storeDefinition) { try { NodeSampleResult nodeSampleResult = future.get(); - if(nodeSampleResult.success) { - keyWriter.write(nodeSampleResult.keysString); - } else { + if(!nodeSampleResult.success) { success = false; logger.error("Sampling on node " + node.getHost() + " of store " - + storeDefinition.getName() + " failed."); + + storeDefinition.getName() + " failed.", + nodeSampleResult.exception); } } catch(ExecutionException ee) { 
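The sampler rewrite above replaces the per-node StringBuilder buffers with one shared Writer that every SampleNodeTask appends to under synchronized(writer), trading memory for a lock per line. A small stand-alone illustration of that discipline, assuming a StringWriter and fixed thread counts purely for the demo:

import java.io.StringWriter;
import java.io.Writer;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class SharedWriterSketch {

    public static void main(String[] args) throws Exception {
        final Writer writer = new StringWriter(); // stands in for the per-store key file
        ExecutorService pool = Executors.newFixedThreadPool(4);

        for(int node = 0; node < 4; node++) {
            final int nodeId = node;
            pool.submit(() -> {
                for(int k = 0; k < 100; k++) {
                    String hexKey = Integer.toHexString(nodeId * 1000 + k);
                    try {
                        // One writer shared by all sampling tasks: lock per append
                        // so lines from different threads never interleave.
                        synchronized(writer) {
                            writer.append(hexKey + "\n");
                        }
                    } catch(Exception e) {
                        e.printStackTrace();
                    }
                }
            });
        }
        pool.shutdown();
        pool.awaitTermination(10, TimeUnit.SECONDS);
        System.out.println("lines written: " + writer.toString().split("\n").length);
    }
}

Locking per append keeps whole lines atomic without holding the monitor across a node's entire scan.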
success = false; logger.error("Encountered an execution exception on node " + node.getHost() - + " while sampling " + storeName + ": " + ee.getMessage()); + + " while sampling " + storeName, ee); ee.printStackTrace(); } catch(InterruptedException ie) { success = false; logger.error("Waiting for node " + node.getHost() + " to be sampled for store " - + storeName + ", but was interrupted: " + ie.getMessage()); + + storeName + ", but was interrupted", ie); } } return success; } catch(IOException e) { - logger.error("IOException encountered for store " + storeName + " : " + e.getMessage()); + logger.error("IOException encountered for store " + storeName, e); return false; } finally { try { keyWriter.close(); } catch(IOException e) { logger.error("IOException caught while trying to close keyWriter for store " - + storeName + " : " + e.getMessage()); + + storeName, e); } } } diff --git a/src/java/voldemort/utils/KeyVersionFetcherCLI.java b/src/java/voldemort/utils/KeyVersionFetcherCLI.java index a9b608a4e7..140e9ae520 100644 --- a/src/java/voldemort/utils/KeyVersionFetcherCLI.java +++ b/src/java/voldemort/utils/KeyVersionFetcherCLI.java @@ -21,11 +21,11 @@ import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; -import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedList; import java.util.List; -import java.util.Map; import java.util.Queue; +import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; @@ -45,6 +45,7 @@ import voldemort.client.protocol.admin.AdminClient; import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; +import voldemort.routing.StoreRoutingPlan; import voldemort.store.StoreDefinition; import voldemort.versioning.Versioned; @@ -61,17 +62,19 @@ public class KeyVersionFetcherCLI { private final static int DEFAULT_KEY_PARALLELISM = 4; private final static int DEFAULT_PROGRESS_PERIOD_OPS = 1000; + private final static int DEFAULT_OUTPUT_BATCH_SIZE = 100; private final AdminClient adminClient; private final Cluster cluster; private final List storeDefinitions; - private final Map storeNameToKeyStringsMap; + private final Set storeNamesSet; private final String inDir; private final String outDir; private final ExecutorService kvFetcherService; private final int progressPeriodOps; + private final int outputBatchSize; private final long startTimeMs; private static AtomicInteger fetches = new AtomicInteger(0); @@ -81,14 +84,15 @@ public KeyVersionFetcherCLI(String url, String outDir, List storeNames, int keyParallelism, - int progressPeriodOps) { + int progressPeriodOps, + int outputBatchSize) { if(logger.isInfoEnabled()) { logger.info("Connecting to bootstrap server: " + url); } this.adminClient = new AdminClient(url, new AdminClientConfig(), new ClientConfig()); this.cluster = adminClient.getAdminClientCluster(); this.storeDefinitions = adminClient.metadataMgmtOps.getRemoteStoreDefList(0).getValue(); - this.storeNameToKeyStringsMap = new HashMap(); + this.storeNamesSet = new HashSet(); for(StoreDefinition storeDefinition: storeDefinitions) { String storeName = storeDefinition.getName(); if(storeNames != null) { @@ -99,13 +103,13 @@ public KeyVersionFetcherCLI(String url, continue; } } - this.storeNameToKeyStringsMap.put(storeName, new StringBuilder()); + this.storeNamesSet.add(storeName); } if(storeNames != null) { List badStoreNames = new LinkedList(); for(String storeName: storeNames) { - 
if(!this.storeNameToKeyStringsMap.keySet().contains(storeName)) { + if(!this.storeNamesSet.contains(storeName)) { badStoreNames.add(storeName); } } @@ -121,13 +125,16 @@ public KeyVersionFetcherCLI(String url, this.kvFetcherService = Executors.newFixedThreadPool(keyParallelism); this.progressPeriodOps = progressPeriodOps; + this.outputBatchSize = outputBatchSize; this.startTimeMs = System.currentTimeMillis(); } public boolean sampleStores() { for(StoreDefinition storeDefinition: storeDefinitions) { - if(storeNameToKeyStringsMap.keySet().contains(storeDefinition.getName())) { + if(storeNamesSet.contains(storeDefinition.getName())) { if(!sampleStore(storeDefinition)) { + logger.info("Problem sampling store " + storeDefinition.getName() + + ".. Bailing.."); return false; } } @@ -135,24 +142,25 @@ public boolean sampleStores() { return true; } - public void updateFetchProgress() { + public void updateFetchProgress(String storeName) { int curFetches = fetches.incrementAndGet(); if(0 == curFetches % progressPeriodOps) { if(logger.isInfoEnabled()) { long durationS = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - startTimeMs); - logger.info("Fetched " + curFetches + " in " + durationS + " seconds."); + logger.info("Fetched " + curFetches + " in " + durationS + " seconds for store " + + storeName); } } } - public class KeyVersionFetcher implements Callable { + public class FetchKeyVersionsTask implements Callable { - private final StoreInstance storeInstance; + private final StoreRoutingPlan storeInstance; private final byte[] key; - KeyVersionFetcher(StoreInstance storeInstance, byte[] key) { + FetchKeyVersionsTask(StoreRoutingPlan storeInstance, byte[] key) { this.storeInstance = storeInstance; this.key = key; } @@ -176,7 +184,7 @@ public String call() throws Exception { sb.append("\n"); replicationOffset++; } - updateFetchProgress(); + updateFetchProgress(storeName); return sb.toString(); } } @@ -187,7 +195,7 @@ public boolean sampleStore(StoreDefinition storeDefinition) { String keysFileName = inDir + System.getProperty("file.separator") + storeName + ".keys"; File keysFile = new File(keysFileName); if(!keysFile.exists()) { - logger.error("Keys file " + keysFileName + "does not exist!"); + logger.error("Keys file " + keysFileName + " does not exist!"); return false; } @@ -199,44 +207,49 @@ public boolean sampleStore(StoreDefinition storeDefinition) { return true; } - StoreInstance storeInstance = new StoreInstance(cluster, storeDefinition); + StoreRoutingPlan storeInstance = new StoreRoutingPlan(cluster, storeDefinition); BufferedReader keyReader = null; BufferedWriter kvWriter = null; try { keyReader = new BufferedReader(new FileReader(keysFileName)); + kvWriter = new BufferedWriter(new FileWriter(kvFileName)); - Queue> futureKVs = new LinkedList>(); - for(String keyLine = keyReader.readLine(); keyLine != null; keyLine = keyReader.readLine()) { - byte[] keyInBytes = ByteUtils.fromHexString(keyLine.trim()); - - KeyVersionFetcher kvFetcher = new KeyVersionFetcher(storeInstance, keyInBytes); - Future future = kvFetcherService.submit(kvFetcher); - futureKVs.add(future); - } + boolean readAllKeys = false; + while(!readAllKeys) { + Queue> futureKVs = new LinkedList>(); + for(int numFetchTasks = 0; numFetchTasks < this.outputBatchSize; numFetchTasks++) { + String keyLine = keyReader.readLine(); + if(keyLine == null) { + readAllKeys = true; + break; + } + byte[] keyInBytes = ByteUtils.fromHexString(keyLine.trim()); + FetchKeyVersionsTask kvFetcher = new 
FetchKeyVersionsTask(storeInstance, + keyInBytes); + Future future = kvFetcherService.submit(kvFetcher); + futureKVs.add(future); + } - kvWriter = new BufferedWriter(new FileWriter(kvFileName)); - while(!futureKVs.isEmpty()) { - Future future = futureKVs.poll(); - String keyVersions = future.get(); - kvWriter.append(keyVersions); + if(futureKVs.size() > 0) { + while(!futureKVs.isEmpty()) { + Future future = futureKVs.poll(); + String keyVersions = future.get(); + kvWriter.append(keyVersions); + } + } } - return true; } catch(DecoderException de) { - logger.error("Could not decode key to sample for store " + storeName + " : " - + de.getMessage()); + logger.error("Could not decode key to sample for store " + storeName, de); return false; } catch(IOException ioe) { - logger.error("IOException caught while sampling store " + storeName + " : " - + ioe.getMessage()); + logger.error("IOException caught while sampling store " + storeName, ioe); return false; } catch(InterruptedException ie) { - logger.error("InterruptedException caught while sampling store " + storeName + " : " - + ie.getMessage()); + logger.error("InterruptedException caught while sampling store " + storeName, ie); return false; } catch(ExecutionException ee) { - logger.error("Encountered an execution exception while sampling " + storeName + ": " - + ee.getMessage()); + logger.error("Encountered an execution exception while sampling " + storeName, ee); ee.printStackTrace(); return false; } finally { @@ -245,7 +258,7 @@ public boolean sampleStore(StoreDefinition storeDefinition) { keyReader.close(); } catch(IOException e) { logger.error("IOException caught while trying to close keyReader for store " - + storeName + " : " + e.getMessage()); + + storeName, e); e.printStackTrace(); } } @@ -254,7 +267,7 @@ public boolean sampleStore(StoreDefinition storeDefinition) { kvWriter.close(); } catch(IOException e) { logger.error("IOException caught while trying to close kvWriter for store " - + storeName + " : " + e.getMessage()); + + storeName, e); e.printStackTrace(); } } @@ -308,6 +321,12 @@ private static OptionParser getParser() { .withRequiredArg() .describedAs("progressPeriodOps") .ofType(Integer.class); + parser.accepts("output-batch-size", + "Number of keys fetched and written out in sorted order at once. 
[Default: " + + DEFAULT_OUTPUT_BATCH_SIZE + " ]") + .withRequiredArg() + .describedAs("outputBatchSize") + .ofType(Integer.class); return parser; } @@ -327,6 +346,7 @@ private static void printUsage() { help.append(" --store-names [,...]\n"); help.append(" --parallelism \n"); help.append(" --progress-period-ops \n"); + help.append(" --output-batch-size \n"); help.append(" --help\n"); System.out.print(help.toString()); } @@ -390,13 +410,19 @@ public static void main(String[] args) throws Exception { progressPeriodOps = (Integer) options.valueOf("progress-period-ops"); } + Integer outputBatchSize = DEFAULT_OUTPUT_BATCH_SIZE; + if(options.hasArgument("output-batch-size")) { + outputBatchSize = (Integer) options.valueOf("output-batch-size"); + } + try { KeyVersionFetcherCLI sampler = new KeyVersionFetcherCLI(url, inDir, outDir, storeNames, keyParallelism, - progressPeriodOps); + progressPeriodOps, + outputBatchSize); try { if(!sampler.sampleStores()) { @@ -407,8 +433,7 @@ public static void main(String[] args) throws Exception { } } catch(Exception e) { - Utils.croak("Exception during key-version sampling: " + e.getMessage()); + logger.error("Exception during key-version sampling: ", e); } - } } diff --git a/src/java/voldemort/utils/RebalanceUtils.java b/src/java/voldemort/utils/RebalanceUtils.java index e66895963c..2013c9d0d8 100644 --- a/src/java/voldemort/utils/RebalanceUtils.java +++ b/src/java/voldemort/utils/RebalanceUtils.java @@ -315,7 +315,7 @@ public static Cluster updateCluster(Cluster currentCluster, List updatedNo * * @param currentCluster Existing cluster metadata. Both stealer and donor * node should already exist in this metadata - * @param stealerNodeId Id of node from which we are stealing the partitions + * @param stealerNodeId Id of node for which we are stealing the partitions * @param donatedPartitions List of partitions we are moving * @param partitionList List of partitions we are moving * @return Updated cluster metadata diff --git a/src/java/voldemort/utils/Utils.java b/src/java/voldemort/utils/Utils.java index 8289cfa49f..e680f47894 100644 --- a/src/java/voldemort/utils/Utils.java +++ b/src/java/voldemort/utils/Utils.java @@ -586,4 +586,38 @@ public static int getDayOfTheWeekFromNow(int nDays) { cal.add(Calendar.DAY_OF_YEAR, nDays); return cal.get(Calendar.DAY_OF_WEEK); } + + /** + * A common method for all enums since they can't have another base class + * + * @param Enum type + * @param c enum type. All enums must be all caps. 
+ * @param string case insensitive + * @return corresponding enum, or null + */ + public static > T getEnumFromString(Class c, String string) { + if(c != null && string != null) { + try { + return Enum.valueOf(c, string.trim().toUpperCase()); + } catch(IllegalArgumentException ex) {} + } + return null; + } + + /** + * Specifically, this utility is to address the fact that System.nanoTime() + * can sometimes go backwards, due to the fact that it relies on the + * performance counters + * + * @param startNs + * @param endNs + * @return 0 if endNs < startNs, delta otherwise + */ + public static long elapsedTimeNs(long startNs, long endNs) { + if(endNs < startNs) { + return 0L; + } else { + return endNs - startNs; + } + } } diff --git a/src/java/voldemort/versioning/ClockEntry.java b/src/java/voldemort/versioning/ClockEntry.java index 27bc0cac19..326cb6d916 100644 --- a/src/java/voldemort/versioning/ClockEntry.java +++ b/src/java/voldemort/versioning/ClockEntry.java @@ -28,6 +28,7 @@ * */ @NotThreadsafe +@Deprecated public final class ClockEntry implements Cloneable, Serializable { private static final long serialVersionUID = 1; @@ -108,11 +109,27 @@ public String toString() { } public void setNodeId(short nodeId) { + if(nodeId < 0) + throw new IllegalArgumentException("Node id " + nodeId + " is not in the range (0, " + + Short.MAX_VALUE + ")."); this.nodeId = nodeId; } public void setVersion(long version) { + if(version < 1) + throw new IllegalArgumentException("Version " + version + " is not in the range (1, " + + Short.MAX_VALUE + ")."); this.version = version; } + public void validate() { + if(nodeId < 0) + throw new InvalidClockEntryException("Node id " + nodeId + " is not in the range (0, " + + Short.MAX_VALUE + ")."); + if(version < 1) + throw new InvalidClockEntryException("Version " + version + " is not in the range (1, " + + Short.MAX_VALUE + ")."); + + } + } diff --git a/src/java/voldemort/versioning/InvalidClockEntryException.java b/src/java/voldemort/versioning/InvalidClockEntryException.java new file mode 100644 index 0000000000..dcbbd29ebf --- /dev/null +++ b/src/java/voldemort/versioning/InvalidClockEntryException.java @@ -0,0 +1,24 @@ +package voldemort.versioning; + +import voldemort.VoldemortException; + +public class InvalidClockEntryException extends VoldemortException { + + private static final long serialVersionUID = 1L; + + public InvalidClockEntryException() { + super(); + } + + public InvalidClockEntryException(String s, Throwable t) { + super(s, t); + } + + public InvalidClockEntryException(String s) { + super(s); + } + + public InvalidClockEntryException(Throwable t) { + super(t); + } +} diff --git a/src/java/voldemort/versioning/VectorClock.java b/src/java/voldemort/versioning/VectorClock.java index 50ab8699ab..be2f8e3d88 100644 --- a/src/java/voldemort/versioning/VectorClock.java +++ b/src/java/voldemort/versioning/VectorClock.java @@ -18,12 +18,18 @@ import java.io.Serializable; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeMap; import voldemort.annotations.concurrency.NotThreadsafe; import voldemort.utils.ByteUtils; +import voldemort.utils.Utils; -import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; /** * A vector of the number of writes mastered by each node. 
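The two Utils additions above are easy to misuse: getEnumFromString upper-cases its input, so it only resolves enums whose constants are declared in upper case, and elapsedTimeNs clamps to zero in case System.nanoTime() runs backwards between reads. A self-contained usage sketch; the local enum and helper copies mirror the shapes above rather than importing Voldemort's Utils, and the enum is deliberately upper-cased to satisfy the helper's contract:

public class UtilsSketch {

    enum ForkLiftTaskMode { GLOBAL_RESOLUTION, PRIMARY_RESOLUTION, NO_RESOLUTION }

    // Same shape as the getEnumFromString helper above: case-insensitive,
    // whitespace-tolerant, and returns null instead of throwing.
    static <T extends Enum<T>> T getEnumFromString(Class<T> c, String s) {
        if(c != null && s != null) {
            try {
                return Enum.valueOf(c, s.trim().toUpperCase());
            } catch(IllegalArgumentException ignored) {}
        }
        return null;
    }

    // Clamp to zero so callers never see a negative duration when nanoTime
    // moves backwards between reads.
    static long elapsedTimeNs(long startNs, long endNs) {
        return endNs < startNs ? 0L : endNs - startNs;
    }

    public static void main(String[] args) {
        ForkLiftTaskMode mode = getEnumFromString(ForkLiftTaskMode.class, " no_resolution ");
        if(mode == null)
            mode = ForkLiftTaskMode.PRIMARY_RESOLUTION; // fall back to the default
        System.out.println("mode = " + mode);

        long start = System.nanoTime();
        long end = System.nanoTime();
        System.out.println("elapsed ns = " + elapsedTimeNs(start, end));
    }
}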
The vector is stored @@ -40,8 +46,8 @@ public class VectorClock implements Version, Serializable { private static final int MAX_NUMBER_OF_VERSIONS = Short.MAX_VALUE; - /* A sorted list of live versions ordered from least to greatest */ - private final List versions; + /* A map of versions keyed by nodeId */ + private final TreeMap versionMap; /* * The time of the last update on the server on which the update was @@ -53,21 +59,36 @@ public class VectorClock implements Version, Serializable { * Construct an empty VectorClock */ public VectorClock() { - this(new ArrayList(0), System.currentTimeMillis()); + this(System.currentTimeMillis()); } public VectorClock(long timestamp) { - this(new ArrayList(0), timestamp); + this.versionMap = new TreeMap(); + this.timestamp = timestamp; } /** - * Create a VectorClock with the given version and timestamp + * This function is not safe because it may break the pre-condition that + * clock entries should be sorted by nodeId * - * @param versions The version to prepopulate - * @param timestamp The timestamp to prepopulate */ + @Deprecated public VectorClock(List versions, long timestamp) { - this.versions = versions; + this.versionMap = new TreeMap(); + this.timestamp = timestamp; + for(ClockEntry clockEntry: versions) { + this.versionMap.put(clockEntry.getNodeId(), clockEntry.getVersion()); + } + } + + /** + * Only used for cloning + * + * @param versionMap + * @param timestamp + */ + private VectorClock(TreeMap versionMap, long timestamp) { + this.versionMap = Utils.notNull(versionMap); this.timestamp = timestamp; } @@ -101,12 +122,12 @@ public VectorClock(byte[] bytes, int offset) { throw new IllegalArgumentException("Too few bytes: expected at least " + minimumBytes + " but found only " + bytes.length + "."); - this.versions = new ArrayList(numEntries); + this.versionMap = new TreeMap(); int index = 3 + offset; for(int i = 0; i < numEntries; i++) { short nodeId = ByteUtils.readShort(bytes, index); long version = ByteUtils.readBytes(bytes, index + ByteUtils.SIZE_OF_SHORT, versionSize); - this.versions.add(new ClockEntry(nodeId, version)); + this.versionMap.put(nodeId, version); index += entrySize; } this.timestamp = ByteUtils.readLong(bytes, index); @@ -120,7 +141,7 @@ public byte[] toBytes() { public int toBytes(byte[] buf, int offset) { // write the number of versions - ByteUtils.writeShort(buf, (short) versions.size(), offset); + ByteUtils.writeShort(buf, (short) versionMap.size(), offset); offset += ByteUtils.SIZE_OF_SHORT; // write the size of each version in bytes byte versionSize = ByteUtils.numberOfBytesRequired(getMaxVersion()); @@ -128,9 +149,11 @@ public int toBytes(byte[] buf, int offset) { offset++; int clockEntrySize = ByteUtils.SIZE_OF_SHORT + versionSize; - for(ClockEntry v: versions) { - ByteUtils.writeShort(buf, v.getNodeId(), offset); - ByteUtils.writeBytes(buf, v.getVersion(), offset + ByteUtils.SIZE_OF_SHORT, versionSize); + SortedSet nodeIds = versionMap.navigableKeySet(); + for(Short nodeId: nodeIds) { + Long version = versionMap.get(nodeId); + ByteUtils.writeShort(buf, nodeId, offset); + ByteUtils.writeBytes(buf, version, offset + ByteUtils.SIZE_OF_SHORT, versionSize); offset += clockEntrySize; } ByteUtils.writeLong(buf, this.timestamp, offset); @@ -139,7 +162,7 @@ public int toBytes(byte[] buf, int offset) { public int sizeInBytes() { byte versionSize = ByteUtils.numberOfBytesRequired(getMaxVersion()); - return ByteUtils.SIZE_OF_SHORT + 1 + this.versions.size() + return ByteUtils.SIZE_OF_SHORT + 1 + this.versionMap.size() * 
(ByteUtils.SIZE_OF_SHORT + versionSize) + ByteUtils.SIZE_OF_LONG; } @@ -155,28 +178,16 @@ public void incrementVersion(int node, long time) { this.timestamp = time; - // stop on the index greater or equal to the node - boolean found = false; - int index = 0; - for(; index < versions.size(); index++) { - if(versions.get(index).getNodeId() == node) { - found = true; - break; - } else if(versions.get(index).getNodeId() > node) { - found = false; - break; - } + Long version = versionMap.get((short) node); + if(version == null) { + version = 1L; + } else { + version = version + 1L; } - if(found) { - versions.set(index, versions.get(index).incremented()); - } else if(index < versions.size() - 1) { - versions.add(index, new ClockEntry((short) node, 1)); - } else { - // we don't already have a version for this, so add it - if(versions.size() > MAX_NUMBER_OF_VERSIONS) - throw new IllegalStateException("Vector clock is full!"); - versions.add(index, new ClockEntry((short) node, 1)); + versionMap.put((short) node, version); + if(versionMap.size() >= MAX_NUMBER_OF_VERSIONS) { + throw new IllegalStateException("Vector clock is full!"); } } @@ -196,7 +207,7 @@ public VectorClock incremented(int nodeId, long time) { @Override public VectorClock clone() { - return new VectorClock(Lists.newArrayList(versions), this.timestamp); + return new VectorClock(Maps.newTreeMap(versionMap), this.timestamp); } @Override @@ -208,24 +219,27 @@ public boolean equals(Object object) { if(!object.getClass().equals(VectorClock.class)) return false; VectorClock clock = (VectorClock) object; - return versions.equals(clock.versions); + return versionMap.equals(clock.versionMap); } @Override public int hashCode() { - return versions.hashCode(); + return versionMap.hashCode(); } @Override public String toString() { StringBuilder builder = new StringBuilder(); builder.append("version("); - if(this.versions.size() > 0) { - for(int i = 0; i < this.versions.size() - 1; i++) { - builder.append(this.versions.get(i)); + int versionsLeft = versionMap.size(); + for(Map.Entry entry: versionMap.entrySet()) { + versionsLeft--; + Short node = entry.getKey(); + Long version = entry.getValue(); + builder.append(node + ":" + version); + if(versionsLeft > 0) { builder.append(", "); } - builder.append(this.versions.get(this.versions.size() - 1)); } builder.append(")"); builder.append(" ts:" + timestamp); @@ -234,41 +248,29 @@ public String toString() { public long getMaxVersion() { long max = -1; - for(ClockEntry entry: versions) - max = Math.max(entry.getVersion(), max); + for(Long version: versionMap.values()) + max = Math.max(version, max); return max; } public VectorClock merge(VectorClock clock) { VectorClock newClock = new VectorClock(); - int i = 0; - int j = 0; - while(i < this.versions.size() && j < clock.versions.size()) { - ClockEntry v1 = this.versions.get(i); - ClockEntry v2 = clock.versions.get(j); - if(v1.getNodeId() == v2.getNodeId()) { - newClock.versions.add(new ClockEntry(v1.getNodeId(), Math.max(v1.getVersion(), - v2.getVersion()))); - i++; - j++; - } else if(v1.getNodeId() < v2.getNodeId()) { - newClock.versions.add(v1.clone()); - i++; + for(Map.Entry entry: this.versionMap.entrySet()) { + newClock.versionMap.put(entry.getKey(), entry.getValue()); + } + for(Map.Entry entry: clock.versionMap.entrySet()) { + Long version = newClock.versionMap.get(entry.getKey()); + if(version == null) { + newClock.versionMap.put(entry.getKey(), entry.getValue()); } else { - newClock.versions.add(v2.clone()); - j++; + 
newClock.versionMap.put(entry.getKey(), Math.max(version, entry.getValue())); } } - // Okay now there may be leftovers on one or the other list remaining - for(int k = i; k < this.versions.size(); k++) - newClock.versions.add(this.versions.get(k).clone()); - for(int k = j; k < clock.versions.size(); k++) - newClock.versions.add(clock.versions.get(k).clone()); - return newClock; } + @Override public Occurred compare(Version v) { if(!(v instanceof VectorClock)) throw new IllegalArgumentException("Cannot compare Versions of different types."); @@ -277,12 +279,13 @@ public Occurred compare(Version v) { } /** - * Is this Reflexive, AntiSymetic, and Transitive? Compare two VectorClocks, - * the outcomes will be one of the following: -- Clock 1 is BEFORE clock 2 - * if there exists an i such that c1(i) <= c(2) and there does not exist a j - * such that c1(j) > c2(j). -- Clock 1 is CONCURRENT to clock 2 if there - * exists an i, j such that c1(i) < c2(i) and c1(j) > c2(j) -- Clock 1 is - * AFTER clock 2 otherwise + * Compare two VectorClocks, the outcomes will be one of the following:
+ * -- Clock 1 is BEFORE clock 2, if there exists a nodeId such that + * c1(nodeId) <= c2(nodeId) and there does not exist another nodeId such + * that c1(nodeId) > c2(nodeId).
+ * -- Clock 1 is CONCURRENT to clock 2 if there exist two nodeIds, nodeId and + * nodeId2, such that c1(nodeId) < c2(nodeId) and c1(nodeId2) > c2(nodeId2)
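Together with the remaining rule (clock 1 is AFTER clock 2 otherwise), the map-based representation reduces comparison to two dominance scans, and merge to an entry-wise max. A stand-alone sketch over plain TreeMap<Short, Long> clocks; the Occurred enum and method shapes are local stand-ins, not Voldemort's actual classes:

import java.util.Map;
import java.util.TreeMap;

public class VectorClockMapSketch {

    enum Occurred { BEFORE, AFTER, CONCURRENT }

    // Entry-wise max over the union of node ids, as in the TreeMap-based merge.
    static TreeMap<Short, Long> merge(Map<Short, Long> a, Map<Short, Long> b) {
        TreeMap<Short, Long> merged = new TreeMap<>(a);
        for(Map.Entry<Short, Long> e: b.entrySet())
            merged.merge(e.getKey(), e.getValue(), Math::max);
        return merged;
    }

    // A clock is "bigger" if it holds a node id the other lacks, or a strictly
    // larger counter on a shared id; both bigger means the writes were concurrent.
    static Occurred compare(Map<Short, Long> v1, Map<Short, Long> v2) {
        boolean v1Bigger = false, v2Bigger = false;
        for(Map.Entry<Short, Long> e: v1.entrySet()) {
            Long other = v2.get(e.getKey());
            if(other == null || e.getValue() > other)
                v1Bigger = true;
        }
        for(Map.Entry<Short, Long> e: v2.entrySet()) {
            Long other = v1.get(e.getKey());
            if(other == null || e.getValue() > other)
                v2Bigger = true;
        }
        if(v1Bigger && v2Bigger)
            return Occurred.CONCURRENT;
        if(v1Bigger)
            return Occurred.AFTER;
        return Occurred.BEFORE; // equal clocks also report BEFORE, as noted below
    }

    public static void main(String[] args) {
        TreeMap<Short, Long> a = new TreeMap<>(Map.of((short) 0, 2L, (short) 1, 1L));
        TreeMap<Short, Long> b = new TreeMap<>(Map.of((short) 0, 1L, (short) 2, 1L));
        System.out.println(compare(a, b));           // CONCURRENT
        System.out.println(compare(a, merge(a, b))); // BEFORE
    }
}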
+ * -- Clock 1 is AFTER clock 2 otherwise * * @param v1 The first VectorClock * @param v2 The second VectorClock @@ -293,37 +296,36 @@ public static Occurred compare(VectorClock v1, VectorClock v2) { // We do two checks: v1 <= v2 and v2 <= v1 if both are true then boolean v1Bigger = false; boolean v2Bigger = false; - int p1 = 0; - int p2 = 0; - - while(p1 < v1.versions.size() && p2 < v2.versions.size()) { - ClockEntry ver1 = v1.versions.get(p1); - ClockEntry ver2 = v2.versions.get(p2); - if(ver1.getNodeId() == ver2.getNodeId()) { - if(ver1.getVersion() > ver2.getVersion()) - v1Bigger = true; - else if(ver2.getVersion() > ver1.getVersion()) - v2Bigger = true; - p1++; - p2++; - } else if(ver1.getNodeId() > ver2.getNodeId()) { - // since ver1 is bigger that means it is missing a version that - // ver2 has - v2Bigger = true; - p2++; - } else { - // this means ver2 is bigger which means it is missing a version - // ver1 has - v1Bigger = true; - p1++; - } - } - /* Okay, now check for left overs */ - if(p1 < v1.versions.size()) + SortedSet v1Nodes = v1.versionMap.navigableKeySet(); + SortedSet v2Nodes = v2.versionMap.navigableKeySet(); + // get clocks(nodeIds) that both v1 and v2 has + SortedSet commonNodes = Sets.newTreeSet(v1Nodes); + commonNodes.retainAll(v2Nodes); + // if v1 has more nodes than common nodes + // v1 has clocks that v2 does not + if(v1Nodes.size() > commonNodes.size()) { v1Bigger = true; - else if(p2 < v2.versions.size()) + } + // if v2 has more nodes than common nodes + // v2 has clocks that v1 does not + if(v2Nodes.size() > commonNodes.size()) { v2Bigger = true; + } + // compare the common parts + for(Short nodeId: commonNodes) { + // no need to compare more + if(v1Bigger && v2Bigger) { + break; + } + long v1Version = v1.versionMap.get(nodeId); + long v2Version = v2.versionMap.get(nodeId); + if(v1Version > v2Version) { + v1Bigger = true; + } else if(v1Version < v2Version) { + v2Bigger = true; + } + } /* * This is the case where they are equal. 
Consciously return BEFORE, so
@@ -347,8 +349,12 @@ public long getTimestamp() {
         return this.timestamp;
     }
 
+    @Deprecated
     public List<ClockEntry> getEntries() {
-        return this.versions;
+        List<ClockEntry> clocks = new ArrayList<ClockEntry>(versionMap.size());
+        for(Map.Entry<Short, Long> entry: versionMap.entrySet()) {
+            clocks.add(new ClockEntry(entry.getKey(), entry.getValue()));
+        }
+        return Collections.unmodifiableList(clocks);
     }
-
 }
diff --git a/src/proto/voldemort-admin.proto b/src/proto/voldemort-admin.proto
index 4d8710ff0a..59613c6fb1 100644
--- a/src/proto/voldemort-admin.proto
+++ b/src/proto/voldemort-admin.proto
@@ -21,8 +21,7 @@ message GetMetadataResponse {
 }
 
 message UpdateMetadataRequest {
-  required bytes key = 1;
-  required Versioned versioned = 2;
+  repeated KeyedVersions metadataEntry = 1;
 }
 
 message UpdateMetadataResponse {
@@ -266,10 +265,11 @@ message FailedFetchStoreResponse {
 message RebalanceStateChangeRequest {
   repeated RebalancePartitionInfoMap rebalance_partition_info_list = 1;
   required string cluster_string = 2;
-  required bool swap_ro = 3;
-  required bool change_cluster_metadata = 4;
-  required bool change_rebalance_state = 5;
-  required bool rollback = 6;
+  required string stores_string = 3;
+  required bool swap_ro = 4;
+  required bool change_cluster_metadata = 5;
+  required bool change_rebalance_state = 6;
+  required bool rollback = 7;
 }
 
 message RebalanceStateChangeResponse {
diff --git a/test/common/voldemort/ServerTestUtils.java b/test/common/voldemort/ServerTestUtils.java
index e8aa77b6b7..8e47660da9 100644
--- a/test/common/voldemort/ServerTestUtils.java
+++ b/test/common/voldemort/ServerTestUtils.java
@@ -60,6 +60,7 @@ import voldemort.server.protocol.RequestHandler;
 import voldemort.server.protocol.RequestHandlerFactory;
 import voldemort.server.protocol.SocketRequestHandlerFactory;
+import voldemort.server.protocol.admin.AsyncOperationService;
 import voldemort.server.socket.SocketService;
 import voldemort.store.Store;
 import voldemort.store.StoreDefinition;
@@ -301,67 +302,68 @@ public static Cluster getLocalCluster(int numberOfNodes, int[] ports, int[][] pa
         return new Cluster("test-cluster", nodes);
     }
 
+    public static Cluster getLocalZonedCluster(int numberOfNodes,
+                                               int numberOfZones,
+                                               int[] nodeToZoneMapping,
+                                               int[][] partitionMapping) {
+        return getLocalZonedCluster(numberOfNodes,
+                                    numberOfZones,
+                                    nodeToZoneMapping,
+                                    partitionMapping,
+                                    findFreePorts(3 * numberOfNodes));
+    }
+
     /**
-     * Update a cluster by replacing the specified server with a new host, i.e.
-     * new ports since they are all localhost
+     * Returns a cluster with numberOfNodes nodes in numberOfZones zones. It
+     * is important that numberOfNodes be divisible by numberOfZones.
      *
-     * @param original The original cluster to be updated
-     * @param serverIds The ids of the server to be replaced with new hosts
-     * @return updated cluster
+     * @param numberOfNodes Number of nodes in the cluster
+     * @param numberOfZones Number of zones
+     * @param nodeToZoneMapping Zone id of each node
+     * @param partitionMapping Partition ids hosted by each node
+     * @param ports Ports to use, three per node
+     * @return Cluster
      */
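
A usage sketch for the new zoned-cluster helper (illustrative values, not part of the patch; any mapping works provided numberOfNodes is divisible by numberOfZones):

    // Four nodes split across two zones, two partitions per node.
    int[] nodeToZoneMapping = { 0, 0, 1, 1 };
    int[][] partitionMapping = { { 0, 1 }, { 2, 3 }, { 4, 5 }, { 6, 7 } };
    Cluster cluster = ServerTestUtils.getLocalZonedCluster(4, 2, nodeToZoneMapping, partitionMapping);

-    public static Cluster updateClusterWithNewHost(Cluster original, int...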
serverIds) { - int highestPortInuse = 0; + public static Cluster getLocalZonedCluster(int numberOfNodes, + int numberOfZones, + int[] nodeToZoneMapping, + int[][] partitionMapping, + int[] ports) { - for(Node node: original.getNodes()) { - int nodeMaxPort = 0; - nodeMaxPort = Math.max(nodeMaxPort, node.getAdminPort()); - nodeMaxPort = Math.max(nodeMaxPort, node.getHttpPort()); - nodeMaxPort = Math.max(nodeMaxPort, node.getSocketPort()); - highestPortInuse = Math.max(highestPortInuse, nodeMaxPort); + if(numberOfZones > 0 && numberOfNodes > 0 && numberOfNodes % numberOfZones != 0) { + throw new VoldemortException("The number of nodes (" + numberOfNodes + + ") is not divisible by number of zones (" + + numberOfZones + ")"); } - Set newNodesSet = new HashSet(serverIds.length); - for(int id: serverIds) { - newNodesSet.add(id); - } + List nodes = new ArrayList(); + for(int i = 0; i < numberOfNodes; i++) { - List newNodeList = new ArrayList(serverIds.length); - for(Node node: original.getNodes()) { - if(newNodesSet.contains(node.getId())) { - node = new Node(node.getId(), - "localhost", - ++highestPortInuse, - ++highestPortInuse, - ++highestPortInuse, - node.getPartitionIds()); + List partitions = new ArrayList(partitionMapping[i].length); + for(int p: partitionMapping[i]) { + partitions.add(p); } - newNodeList.add(node); - } - return new Cluster(original.getName(), newNodeList); - } + nodes.add(new Node(i, + "localhost", + ports[3 * i], + ports[3 * i + 1], + ports[3 * i + 2], + nodeToZoneMapping[i], + partitions)); + } - /** - * Returns a list of zones with their proximity list being in increasing - * order - * - * @param numberOfZones The number of zones to return - * @return List of zones - */ - public static List getZones(int numberOfZones) { + // Generate zones List zones = Lists.newArrayList(); for(int i = 0; i < numberOfZones; i++) { LinkedList proximityList = Lists.newLinkedList(); int zoneId = i + 1; - for(int j = 0; j < numberOfZones; j++) { - if(zoneId % numberOfZones == i) - break; + for(int j = 0; j < numberOfZones - 1; j++) { proximityList.add(zoneId % numberOfZones); zoneId++; } zones.add(new Zone(i, proximityList)); } - return zones; + return new Cluster("cluster", nodes, zones); } /** @@ -413,7 +415,7 @@ public static Cluster getLocalCluster(int numberOfNodes, for(int i = 0; i < numberOfZones; i++) { LinkedList proximityList = Lists.newLinkedList(); int zoneId = i + 1; - for(int j = 0; j < numberOfZones; j++) { + for(int j = 0; j < numberOfZones - 1; j++) { proximityList.add(zoneId % numberOfZones); zoneId++; } @@ -425,6 +427,69 @@ public static Cluster getLocalCluster(int numberOfNodes, } } + /** + * Update a cluster by replacing the specified server with a new host, i.e. + * new ports since they are all localhost + * + * @param original The original cluster to be updated + * @param serverIds The ids of the server to be replaced with new hosts + * @return updated cluster + */ + public static Cluster updateClusterWithNewHost(Cluster original, int... 
serverIds) { + int highestPortInuse = 0; + + for(Node node: original.getNodes()) { + int nodeMaxPort = 0; + nodeMaxPort = Math.max(nodeMaxPort, node.getAdminPort()); + nodeMaxPort = Math.max(nodeMaxPort, node.getHttpPort()); + nodeMaxPort = Math.max(nodeMaxPort, node.getSocketPort()); + highestPortInuse = Math.max(highestPortInuse, nodeMaxPort); + } + + Set newNodesSet = new HashSet(serverIds.length); + for(int id: serverIds) { + newNodesSet.add(id); + } + + List newNodeList = new ArrayList(serverIds.length); + for(Node node: original.getNodes()) { + if(newNodesSet.contains(node.getId())) { + node = new Node(node.getId(), + "localhost", + ++highestPortInuse, + ++highestPortInuse, + ++highestPortInuse, + node.getPartitionIds()); + } + newNodeList.add(node); + } + + return new Cluster(original.getName(), newNodeList); + } + + /** + * Returns a list of zones with their proximity list being in increasing + * order + * + * @param numberOfZones The number of zones to return + * @return List of zones + */ + public static List getZones(int numberOfZones) { + List zones = Lists.newArrayList(); + for(int i = 0; i < numberOfZones; i++) { + LinkedList proximityList = Lists.newLinkedList(); + int zoneId = i + 1; + for(int j = 0; j < numberOfZones; j++) { + if(zoneId % numberOfZones == i) + break; + proximityList.add(zoneId % numberOfZones); + zoneId++; + } + zones.add(new Zone(i, proximityList)); + } + return zones; + } + public static Node getLocalNode(int nodeId, List partitions) { int[] ports = findFreePorts(3); return new Node(nodeId, "localhost", ports[0], ports[1], ports[2], partitions); @@ -450,7 +515,7 @@ public static List getStoreDefs(int numStores) { .setType(InMemoryStorageConfiguration.TYPE_NAME) .setKeySerializer(serDef) .setValueSerializer(serDef) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setReplicationFactor(2) .setPreferredReads(1) @@ -473,7 +538,7 @@ public static StoreDefinition getStoreDef(String storeName, .setType(InMemoryStorageConfiguration.TYPE_NAME) .setKeySerializer(serDef) .setValueSerializer(serDef) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(strategyType) .setReplicationFactor(replicationFactor) .setPreferredReads(preads) @@ -502,7 +567,7 @@ public static StoreDefinition getStoreDef(String storeName, .setType(InMemoryStorageConfiguration.TYPE_NAME) .setKeySerializer(serDef) .setValueSerializer(serDef) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(strategyType) .setPreferredReads(preads) .setRequiredReads(rreads) @@ -614,7 +679,7 @@ public static VoldemortConfig createServerConfig(boolean useNio, Props props = new Props(); props.put("node.id", nodeId); props.put("voldemort.home", baseDir + "/node-" + nodeId); - props.put("bdb.cache.size", 1 * 1024 * 1024); + props.put("bdb.cache.size", 10 * 1024 * 1024); props.put("bdb.write.transactions", "true"); props.put("bdb.flush.transactions", "true"); props.put("jmx.enable", "false"); @@ -736,7 +801,7 @@ public static void waitForServerStart(SocketStoreFactory socketStoreFactory, Nod boolean success = false; int retries = 10; Store store = null; - while(retries-- > 0) { + while(retries-- > 0 && !success) { store = ServerTestUtils.getSocketStore(socketStoreFactory, MetadataStore.METADATA_STORE_NAME, node.getSocketPort()); @@ -761,6 +826,49 @@ public static void waitForServerStart(SocketStoreFactory 
socketStoreFactory, Nod
         throw new RuntimeException("Failed to connect with server:" + node);
     }
 
+    /**
+     * NOTE: This relies on the current behavior of the AsyncOperationService to
+     * remove an operation if an explicit isComplete() is invoked. If/when that
+     * is changed, this method will always block up to timeoutMs and return.
+     *
+     * @param server The server whose async operations are monitored
+     * @param asyncOperationPattern substring to match with the operation
+     *        description
+     * @param timeoutMs Maximum time to wait, in milliseconds
+     * @return true if all matching operations completed within the timeout
+     */
+    public static boolean waitForAsyncOperationOnServer(VoldemortServer server,
+                                                        String asyncOperationPattern,
+                                                        long timeoutMs) {
+        long endTimeMs = System.currentTimeMillis() + timeoutMs;
+        AsyncOperationService service = server.getAsyncRunner();
+        List<Integer> matchingOperationIds = null;
+        // wait till at least one matching operation shows up
+        while(System.currentTimeMillis() < endTimeMs) {
+            matchingOperationIds = service.getMatchingAsyncOperationList(asyncOperationPattern,
+                                                                         true);
+            if(matchingOperationIds.size() > 0) {
+                break;
+            }
+        }
+        // now wait for those operations to complete
+        while(System.currentTimeMillis() < endTimeMs) {
+            List<Integer> completedOps = new ArrayList<Integer>(matchingOperationIds.size());
+            for(Integer op: matchingOperationIds) {
+                if(service.isComplete(op)) {
+                    completedOps.add(op);
+                }
+            }
+            matchingOperationIds.removeAll(completedOps);
+            if(matchingOperationIds.size() == 0) {
+                // every matching operation has completed
+                return true;
+            }
+        }
+        return false;
+    }
+
     protected static Cluster internalStartVoldemortCluster(int numServers,
                                                            VoldemortServer[] voldemortServers,
                                                            int[][] partitionMap,
diff --git a/test/common/voldemort/TestUtils.java b/test/common/voldemort/TestUtils.java
index 997d9611ac..2a47e30c81 100644
--- a/test/common/voldemort/TestUtils.java
+++ b/test/common/voldemort/TestUtils.java
@@ -25,8 +25,11 @@ import java.util.Calendar;
 import java.util.Collections;
 import java.util.GregorianCalendar;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Random;
+import java.util.Set;
 import java.util.SortedSet;
 import java.util.TreeSet;
@@ -35,6 +38,7 @@ import voldemort.cluster.Node;
 import voldemort.routing.RoutingStrategy;
 import voldemort.routing.RoutingStrategyFactory;
+import voldemort.routing.StoreRoutingPlan;
 import voldemort.store.Store;
 import voldemort.store.StoreDefinition;
 import voldemort.utils.ByteArray;
@@ -80,6 +84,47 @@ public static Versioned<byte[]> getVersioned(byte[] value, int... nodes) {
         return new Versioned<byte[]>(value, getClock(nodes));
     }
 
+    /**
+     * Returns true if both the versioned lists are equal, in terms of values
+     * and vector clocks, taking into consideration that they might be in
+     * different orders as well.
+     * Ignores the timestamps as a part of the vector clock.
+     *
+     * @param first The first list of versioned values
+     * @param second The second list of versioned values
+     * @return true if both lists contain the same versioned values
+     */
+    public static boolean areVersionedListsEqual(List<Versioned<byte[]>> first,
+                                                 List<Versioned<byte[]>> second) {
+        if(first.size() != second.size())
+            return false;
+        // find a match for every first element in second list
+        for(Versioned<byte[]> firstElement: first) {
+            boolean found = false;
+            for(Versioned<byte[]> secondElement: second) {
+                if(firstElement.equals(secondElement)) {
+                    found = true;
+                    break;
+                }
+            }
+            if(!found)
+                return false;
+        }
+
+        // find a match for every second element in first list
+        for(Versioned<byte[]> secondElement: second) {
+            boolean found = false;
+            for(Versioned<byte[]> firstElement: first) {
+                if(firstElement.equals(secondElement)) {
+                    found = true;
+                    break;
+                }
+            }
+            if(!found)
+                return false;
+        }
+        return true;
+    }
+
     /**
      * Record events for the given sequence of nodes
      *
@@ -310,6 +355,39 @@ public static int getMissingPartitionsSize(Cluster orig, Cluster updated) {
         return diffPartition;
     }
 
+    /**
+     * Given a StoreRoutingPlan, generates up to numKeysPerPartition keys per
+     * partition
+     *
+     * @param routingPlan The routing plan for the store
+     * @param numKeysPerPartition Number of keys to generate per partition
+     * @return a hashmap of partition to list of keys generated
+     */
+    public static HashMap<Integer, List<byte[]>> createPartitionsKeys(StoreRoutingPlan routingPlan,
+                                                                      int numKeysPerPartition) {
+        HashMap<Integer, List<byte[]>> partitionToKeyList = new HashMap<Integer, List<byte[]>>();
+        Set<Integer> partitionsPending = new HashSet<Integer>(routingPlan.getCluster()
+                                                                         .getNumberOfPartitions());
+        for(int partition = 0; partition < routingPlan.getCluster().getNumberOfPartitions(); partition++) {
+            partitionsPending.add(partition);
+            partitionToKeyList.put(partition, new ArrayList<byte[]>(numKeysPerPartition));
+        }
+
+        for(int key = 0;; key++) {
+            byte[] keyBytes = ("key" + key).getBytes();
+            int partition = routingPlan.getMasterPartitionId(keyBytes);
+            if(partitionToKeyList.get(partition).size() < numKeysPerPartition) {
+                partitionToKeyList.get(partition).add(keyBytes);
+                if(partitionToKeyList.get(partition).size() == numKeysPerPartition) {
+                    partitionsPending.remove(partition);
+                }
+            }
+            if(partitionsPending.size() == 0) {
+                break;
+            }
+        }
+        return partitionToKeyList;
+    }
+
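A sketch of how a test might drive this helper (illustrative, not part of the patch; it assumes the test already holds a cluster and a storeDef, and that StoreRoutingPlan exposes the (Cluster, StoreDefinition) constructor used elsewhere in the tests):

    // Build a routing plan and ask for five keys per partition.
    StoreRoutingPlan routingPlan = new StoreRoutingPlan(cluster, storeDef);
    HashMap<Integer, List<byte[]>> keys = TestUtils.createPartitionsKeys(routingPlan, 5);
    // keys.get(p) now holds byte[] keys whose master partition is p

     /**
      * Always uses UTF-8.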
*/ diff --git a/test/common/voldemort/config/fat-client-config.avro b/test/common/voldemort/config/fat-client-config.avro new file mode 100644 index 0000000000..097bf03d1c --- /dev/null +++ b/test/common/voldemort/config/fat-client-config.avro @@ -0,0 +1,11 @@ +[ +{ + "store_name": "test", + "socket_timeout_ms": "1500", + "routing_timeout_ms": "1500" +}, +{ + "store_name": "slow-store-test", + "connection_timeout_ms": "500" +} +] diff --git a/test/common/voldemort/config/single-slow-store.xml b/test/common/voldemort/config/single-slow-store.xml new file mode 100644 index 0000000000..48a344cba4 --- /dev/null +++ b/test/common/voldemort/config/single-slow-store.xml @@ -0,0 +1,19 @@ + + + + slow-store-test + slow + Test slow store + consistent-routing + client + 1 + 1 + 1 + + string + + + string + + + diff --git a/test/common/voldemort/config/two-stores-replicated.xml b/test/common/voldemort/config/two-stores-replicated.xml index 3e15423fc0..8a769d3391 100644 --- a/test/common/voldemort/config/two-stores-replicated.xml +++ b/test/common/voldemort/config/two-stores-replicated.xml @@ -36,4 +36,23 @@ UTF-8 + + no-res + bdb + client + 3 + 2 + 2 + 2 + 2 + + string + UTF-8 + + + string + UTF-8 + + + diff --git a/test/common/voldemort/store/ForceFailStore.java b/test/common/voldemort/store/ForceFailStore.java index 6b0d9ba15a..c3ef76aadf 100644 --- a/test/common/voldemort/store/ForceFailStore.java +++ b/test/common/voldemort/store/ForceFailStore.java @@ -19,12 +19,15 @@ import java.util.List; import java.util.Map; +import org.apache.log4j.Logger; + import voldemort.VoldemortException; import voldemort.versioning.Version; import voldemort.versioning.Versioned; public class ForceFailStore extends DelegatingStore { + private final static Logger logger = Logger.getLogger(ForceFailStore.class); private final VoldemortException e; private volatile boolean fail = false; @@ -40,16 +43,24 @@ public void setFail(boolean fail) { @Override public void put(K key, Versioned value, T transform) throws VoldemortException { - if(fail) + if(fail) { + if(logger.isDebugEnabled()) { + logger.debug("PUT key " + key + " was forced to fail"); + } throw e; + } getInnerStore().put(key, value, transform); } @Override public boolean delete(K key, Version version) throws VoldemortException { - if(fail) + if(fail) { + if(logger.isDebugEnabled()) { + logger.debug("DELETE key " + key + " was forced to fail"); + } throw e; + } return getInnerStore().delete(key, version); } @@ -57,16 +68,24 @@ public boolean delete(K key, Version version) throws VoldemortException { @Override public Map>> getAll(Iterable keys, Map transforms) throws VoldemortException { - if(fail) + if(fail) { + if(logger.isDebugEnabled()) { + logger.debug("GETALL was forced to fail"); + } throw e; + } return getInnerStore().getAll(keys, transforms); } @Override public List> get(K key, T transform) throws VoldemortException { - if(fail) + if(fail) { + if(logger.isDebugEnabled()) { + logger.debug("GET key " + key + " was forced to fail"); + } throw e; + } return getInnerStore().get(key, transform); } diff --git a/test/common/voldemort/store/RandomlyFailingDelegatingStore.java b/test/common/voldemort/store/RandomlyFailingDelegatingStore.java index bb77a4ca90..3535df7a62 100644 --- a/test/common/voldemort/store/RandomlyFailingDelegatingStore.java +++ b/test/common/voldemort/store/RandomlyFailingDelegatingStore.java @@ -1,5 +1,7 @@ package voldemort.store; +import java.util.List; + import voldemort.VoldemortException; import voldemort.utils.ClosableIterator; import 
voldemort.utils.Pair; @@ -160,4 +162,9 @@ public boolean beginBatchModifications() { public boolean endBatchModifications() { return false; } + + @Override + public List> multiVersionPut(K key, List> values) { + return innerStorageEngine.multiVersionPut(key, values); + } } \ No newline at end of file diff --git a/test/common/voldemort/store/SleepyStore.java b/test/common/voldemort/store/SleepyStore.java index 3575ee0b17..372c74186d 100644 --- a/test/common/voldemort/store/SleepyStore.java +++ b/test/common/voldemort/store/SleepyStore.java @@ -25,13 +25,17 @@ public class SleepyStore extends DelegatingStore { - private final long sleepTimeMs; + private long sleepTimeMs; public SleepyStore(long sleepTimeMs, Store innerStore) { super(innerStore); this.sleepTimeMs = sleepTimeMs; } + public void setSleepTimeMs(long sleepTimeMs) { + this.sleepTimeMs = sleepTimeMs; + } + @Override public boolean delete(K key, Version version) throws VoldemortException { try { diff --git a/test/integration/voldemort/nonblocking/E2ENonblockingCheckoutTest.java b/test/integration/voldemort/nonblocking/E2ENonblockingCheckoutTest.java index 10f1dea45d..66585401e6 100644 --- a/test/integration/voldemort/nonblocking/E2ENonblockingCheckoutTest.java +++ b/test/integration/voldemort/nonblocking/E2ENonblockingCheckoutTest.java @@ -108,7 +108,7 @@ public static List getStoreDef(int nodeId) { .setType(storageConfiguration) .setKeySerializer(serDef) .setValueSerializer(serDef) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setReplicationFactor(3) .setPreferredReads(1) diff --git a/test/integration/voldemort/store/memory/InMemoryPutAssertionStorageEngine.java b/test/integration/voldemort/store/memory/InMemoryPutAssertionStorageEngine.java new file mode 100644 index 0000000000..8c63ef1dd5 --- /dev/null +++ b/test/integration/voldemort/store/memory/InMemoryPutAssertionStorageEngine.java @@ -0,0 +1,80 @@ +package voldemort.store.memory; + +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +import org.apache.log4j.Logger; + +import voldemort.VoldemortException; +import voldemort.store.StoreUtils; +import voldemort.versioning.Versioned; + +/** + * . 
This class is used to assert puts on keys and to examine which put
+ * assertions have been fulfilled and which have not. This is particularly
+ * useful for cases where there are a large number of puts, their values do
+ * not matter, and they are never read.
+ *
+ * @param <K> Key Type
+ * @param <V> Value Type
+ * @param <T> Transformation Type
+ */
+public class InMemoryPutAssertionStorageEngine<K, V, T> extends InMemoryStorageEngine<K, V, T> {
+
+    private static final Logger logger = Logger.getLogger(InMemoryPutAssertionStorageEngine.class);
+
+    private final ConcurrentMap<K, Boolean> assertionMap;
+
+    public InMemoryPutAssertionStorageEngine(String name) {
+        super(name);
+        this.assertionMap = new ConcurrentHashMap<K, Boolean>();
+    }
+
+    public synchronized void assertPut(K key) throws VoldemortException {
+        StoreUtils.assertValidKey(key);
+
+        // delete if the key already exists
+        List<Versioned<V>> result = map.remove(key);
+        if(result == null || result.size() == 0) {
+            // if absent, record the assertion
+            assertionMap.put(key, true); // use synchronized to avoid race
+                                         // condition here
+            if(logger.isDebugEnabled()) {
+                logger.debug("PUT Assertion added (not yet fulfilled) for key: " + key
+                             + " assertionMap size: " + assertionMap.size());
+            }
+        } else {
+            if(logger.isTraceEnabled()) {
+                logger.trace("PUT Assertion added (immediately fulfilled) for key: " + key
+                             + " assertionMap size: " + assertionMap.size());
+            }
+        }
+    }
+
+    @Override
+    public synchronized void put(K key, Versioned<V> value, T transforms) throws VoldemortException {
+        // try to delete from the assertion map;
+        // do the real put only if the key has not been asserted
+        Boolean result = assertionMap.remove(key);
+        if(result == null) {
+            super.put(key, value, transforms);
+            if(logger.isTraceEnabled()) {
+                logger.trace("PUT key: " + key + " (never asserted) assertionMap size: "
+                             + assertionMap.size());
+            }
+        } else {
+            if(logger.isDebugEnabled()) {
+                logger.debug("PUT key: " + key
+                             + " (found and fulfills put assertion) assertionMap size: "
+                             + assertionMap.size());
+            }
+        }
+    }
+
+    public Set<K> getFailedAssertions() {
+        return Collections.unmodifiableSet(assertionMap.keySet());
+    }
+}
diff --git a/test/integration/voldemort/store/pausable/PausableStorageEngine.java b/test/integration/voldemort/store/pausable/PausableStorageEngine.java
index 32fba9b4b4..1958408032 100644
--- a/test/integration/voldemort/store/pausable/PausableStorageEngine.java
+++ b/test/integration/voldemort/store/pausable/PausableStorageEngine.java
@@ -79,6 +79,12 @@ public void put(K key, Versioned<V> value, T transforms) {
         inner.put(key, value, transforms);
     }
 
+    @Override
+    public List<Versioned<V>> multiVersionPut(K key, final List<Versioned<V>> values) {
+        blockIfNecessary();
+        return inner.multiVersionPut(key, values);
+    }
+
     @Override
     public ClosableIterator<Pair<K, Versioned<V>>> entries() {
         blockIfNecessary();
diff --git a/test/integration/voldemort/store/slop/HintedHandoffTestEnvironment.java b/test/integration/voldemort/store/slop/HintedHandoffTestEnvironment.java
new file mode 100644
index 0000000000..e073c09c63
--- /dev/null
+++ b/test/integration/voldemort/store/slop/HintedHandoffTestEnvironment.java
@@ -0,0 +1,499 @@
+package voldemort.store.slop;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Random;
+import java.util.Set;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.log4j.Logger;
+
+import voldemort.ServerTestUtils;
+import voldemort.TestUtils;
+import
voldemort.VoldemortTestConstants; +import voldemort.client.ClientConfig; +import voldemort.client.RoutingTier; +import voldemort.client.SocketStoreClientFactory; +import voldemort.client.StoreClient; +import voldemort.client.StoreClientFactory; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.common.service.ServiceType; +import voldemort.common.service.VoldemortService; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.routing.RoutingStrategyType; +import voldemort.serialization.SerializerDefinition; +import voldemort.server.StoreRepository; +import voldemort.server.VoldemortConfig; +import voldemort.server.VoldemortServer; +import voldemort.server.storage.StorageService; +import voldemort.store.ForceFailStore; +import voldemort.store.PersistenceFailureException; +import voldemort.store.SleepyStore; +import voldemort.store.StorageEngine; +import voldemort.store.Store; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.memory.InMemoryPutAssertionStorageEngine; +import voldemort.store.memory.InMemoryStorageConfiguration; +import voldemort.store.slop.strategy.HintedHandoffStrategyType; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.utils.ByteArray; +import voldemort.utils.ClosableIterator; + +public class HintedHandoffTestEnvironment implements Runnable { + + private final Logger logger = Logger.getLogger(HintedHandoffTestEnvironment.class); + // basic configurations + private final static String STORE_NAME = "test-store"; + private final static SerializerDefinition SEL_DEF = new SerializerDefinition("identity"); + private final static Integer NUM_NODES_TOTAL = 8; + private final static Integer DEFAULT_REPLICATION_FACTOR = 3; + private final static Integer DEFAULT_P_WRITES = 1; + private final static Integer DEFAULT_R_WRITES = 1; + private final static HintedHandoffStrategyType DEFAULT_HINT_ROUTING_STRATEGY = HintedHandoffStrategyType.PROXIMITY_STRATEGY; + private int minNodesAvailable = 1; + + // cluster and servers + private Cluster cluster = null; + private final Map voldemortServers = new HashMap(); + private final CountDownLatch startFinishLatch = new CountDownLatch(1); + private final CountDownLatch wrapUpRequestLatch = new CountDownLatch(1); + private final CountDownLatch wrapUpFinishLatch = new CountDownLatch(1); + + // basic store + private StoreDefinitionBuilder storeDefBuilder = new StoreDefinitionBuilder(); + private StoreDefinition storeDef = null; + + // stores + private final Map> realStores = new HashMap>(); + private final Map> forceFailStores = new HashMap>(); + private final Map> sleepyStores = new HashMap>(); + private final Map slopStorageEngines = new HashMap(); + + // slop push + private static Integer DEFAULT_SLOP_PUSH_INTERVAL_S = 10; + + // failures + private final static Integer DEFAULT_REFRESH_INTERVAL_S = 8; + private final static Integer DEFAULT_ASYNC_RECOVERY_INTERVAL_S = 5; + private Integer statusRefreshIntervalSecond = DEFAULT_REFRESH_INTERVAL_S; + private Map nodesStatus = new HashMap(); + + // running thread + private final Thread thread; + + // client and routing + private StoreClientFactory factory; + private RoutingStrategy routingStrategy = null; + + public static enum NodeStatus { + NORMAL, + DOWN, + SLOW, + BDB_ERROR + } + + /** + * A test environment used for hinted handoff test This environment + * 
simulates multiple failures every several seconds. The failure modes are
+     * BDB exception, node down, and slow response.
+     */
+    public HintedHandoffTestEnvironment() {
+        storeDefBuilder.setName(STORE_NAME)
+                       .setType(InMemoryStorageConfiguration.TYPE_NAME)
+                       .setKeySerializer(SEL_DEF)
+                       .setValueSerializer(SEL_DEF)
+                       .setRoutingPolicy(RoutingTier.CLIENT)
+                       .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY)
+                       .setReplicationFactor(DEFAULT_REPLICATION_FACTOR)
+                       .setPreferredReads(1)
+                       .setRequiredReads(1)
+                       .setPreferredWrites(DEFAULT_P_WRITES)
+                       .setRequiredWrites(DEFAULT_R_WRITES)
+                       .setHintedHandoffStrategy(DEFAULT_HINT_ROUTING_STRATEGY);
+        thread = new Thread(this);
+    }
+
+    public HintedHandoffTestEnvironment setPreferredWrite(int number) {
+        storeDefBuilder.setPreferredWrites(number);
+        return this;
+    }
+
+    public HintedHandoffTestEnvironment setRequiredWrite(int number) {
+        storeDefBuilder.setRequiredWrites(number);
+        return this;
+    }
+
+    public HintedHandoffTestEnvironment setReplicationFactor(int number) {
+        storeDefBuilder.setReplicationFactor(number);
+        return this;
+    }
+
+    public HintedHandoffTestEnvironment setZonedReplicationFactor(int number) {
+        HashMap<Integer, Integer> zoneReplicationFactor = new HashMap<Integer, Integer>();
+        zoneReplicationFactor.put(0, number);
+        zoneReplicationFactor.put(1, number);
+        storeDefBuilder.setReplicationFactor(number * 2);
+        // register the per-zone factors with the builder so they take effect
+        storeDefBuilder.setZoneReplicationFactor(zoneReplicationFactor);
+        storeDefBuilder.setZoneCountReads(1).setZoneCountWrites(1);
+        return this;
+    }
+
+    /**
+     * Create inner store and storage engines before server starts
+     *
+     * @param nodeId The node for which to create the stores
+     */
+    public void createInnerStore(int nodeId) {
+        Store<ByteArray, byte[], byte[]> realStore = new InMemoryPutAssertionStorageEngine<ByteArray, byte[], byte[]>(STORE_NAME);
+        ForceFailStore<ByteArray, byte[], byte[]> forceFailStore = new ForceFailStore<ByteArray, byte[], byte[]>(realStore,
+                                                                                                                 new PersistenceFailureException("Force failed"));
+        SleepyStore<ByteArray, byte[], byte[]> sleepyStore = new SleepyStore<ByteArray, byte[], byte[]>(0,
+                                                                                                        forceFailStore);
+        realStores.put(nodeId, realStore);
+        forceFailStores.put(nodeId, forceFailStore);
+        sleepyStores.put(nodeId, sleepyStore);
+    }
+
+    /**
+     * Start a server. How it works:
+     *
+     * 1. Create a server using test utilities
+     *
+     * 2. Inject prepared test store and storage engine
+     *
+     * 3.
Inject prepared slop store and storage engine + * + * @param nodeId The node of server to start + * @throws IOException + */ + public void startServer(int nodeId) throws IOException { + if(logger.isInfoEnabled()) + logger.info("Starting server of node [" + nodeId + "]"); + SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(2, + 10000, + 100000, + 1024); + List stores = new ArrayList(); + stores.add(storeDef); + // start a voldemort server + VoldemortConfig config = ServerTestUtils.createServerConfigWithDefs(true, + nodeId, + TestUtils.createTempDir() + .getAbsolutePath(), + cluster, + stores, + new Properties()); + config.setNioAdminConnectorSelectors(1); + config.setNioConnectorSelectors(5); + config.setSlopFrequencyMs(DEFAULT_SLOP_PUSH_INTERVAL_S * 1000); + config.setSlopStoreType("memory"); + config.setFailureDetectorAsyncRecoveryInterval(DEFAULT_ASYNC_RECOVERY_INTERVAL_S * 1000); + + VoldemortServer vs = ServerTestUtils.startVoldemortServer(socketStoreFactory, config); + socketStoreFactory.close(); + voldemortServers.put(nodeId, vs); + + VoldemortService vsrv = vs.getService(ServiceType.STORAGE); + StoreRepository sr = ((StorageService) vsrv).getStoreRepository(); + + // storage engine injection + sr.removeLocalStore(STORE_NAME); + sr.addLocalStore(sleepyStores.get(nodeId)); + sr.removeStorageEngine(STORE_NAME); + sr.addStorageEngine((StorageEngine) realStores.get(nodeId)); + + // slop stores caching and injection + if(!slopStorageEngines.containsKey(nodeId)) { + SlopStorageEngine slopStorageEngine = sr.getSlopStore(); + slopStorageEngines.put(nodeId, slopStorageEngine); + } else { + sr.removeStorageEngine("slop"); + sr.removeLocalStore("slop"); + sr.addStorageEngine(slopStorageEngines.get(nodeId)); + sr.addLocalStore(slopStorageEngines.get(nodeId)); + sr.setSlopStore(slopStorageEngines.get(nodeId)); + } + } + + /** + * Stop a server + * + * @param nodeId The node of server to stop + */ + public void stopServer(int nodeId) { + if(logger.isInfoEnabled()) + logger.info("Stopping server of node [" + nodeId + "]"); + VoldemortServer server = voldemortServers.get(nodeId); + server.stop(); + } + + public Set getUniqueRandomNumbers(int max, int count) { + Set result = new HashSet(); + Random r = new Random(System.currentTimeMillis()); + while(result.size() <= max && result.size() < count) { + result.add(r.nextInt(max)); + } + return result; + } + + @Override + public void run() { + Random random = new Random(System.currentTimeMillis()); + cluster = VoldemortTestConstants.getEightNodeClusterWithZones(); + storeDef = storeDefBuilder.build(); + // setup store engines + for(Integer nodeId = 0; nodeId < NUM_NODES_TOTAL; nodeId++) { + createInnerStore(nodeId); // do only once + } + + for(Integer nodeId = 0; nodeId < NUM_NODES_TOTAL; nodeId++) { + try { + startServer(nodeId); + } catch(IOException e) { + logger.error("Server " + nodeId + "failed to start", e); + } + } + + // setup client factory + String bootstrapUrl = cluster.getNodeById(0).getSocketUrl().toString(); + factory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(bootstrapUrl)); + + // wait for start of servers + startFinishLatch.countDown(); + + try { + boolean wrapUpSignal = false; + while(!wrapUpSignal) { + if(logger.isInfoEnabled()) { + logger.info("Will sleep for a while or until seeing wrapUpSignal. 
sleep time: " + + statusRefreshIntervalSecond + " Seconds"); + } + wrapUpSignal = wrapUpRequestLatch.await(statusRefreshIntervalSecond, + TimeUnit.SECONDS); + + if(logger.isInfoEnabled()) { + if(wrapUpSignal) { + logger.info("Wake Up and wrap up. Make all servers NORMAL"); + minNodesAvailable = NUM_NODES_TOTAL; + } else { + logger.info("Wake Up and decide new failure statuses"); + } + for(Map.Entry> entry: realStores.entrySet()) { + InMemoryPutAssertionStorageEngine engine = (InMemoryPutAssertionStorageEngine) entry.getValue(); + logger.info("Outstanding Put Assertions of node [" + entry.getKey() + "]: " + + engine.getFailedAssertions().size()); + } + } + // decide random number of cluster nodes(at least 1 alive) with + // random ids to fail + Integer numNodesToFail = random.nextInt(NUM_NODES_TOTAL - minNodesAvailable + 1); + Set nodesToFail = getUniqueRandomNumbers(NUM_NODES_TOTAL, numNodesToFail); + if(logger.isInfoEnabled()) { + logger.info("Setting nodes to Fail: " + nodesToFail.toString()); + } + + for(Integer nodeId = 0; nodeId < NUM_NODES_TOTAL; nodeId++) { + if(nodesToFail.contains(nodeId)) { + // fail a node if it's normal + if(nodesStatus.get(nodeId) == NodeStatus.NORMAL) { + // random pick one failure node + Integer failureMode = random.nextInt(3); + switch(failureMode) { + case 0: + makeNodeDown(nodeId); + break; + case 1: + makeNodeSlow(nodeId); + break; + case 2: + makeNodeBdbError(nodeId); + break; + } + } + // otherwise, leave unchanged + } else { + // make node normal if not normal + if(nodesStatus.get(nodeId) != NodeStatus.NORMAL) { + makeNodeNormal(nodeId); + } + // otherwise, leave unchanged + } + } + } + } catch(InterruptedException e) {} finally { + wrapUpFinishLatch.countDown(); + } + } + + /** + * Make a node to shutdown + * + * @param nodeId + */ + public void makeNodeDown(int nodeId) { + if(nodesStatus.get(nodeId) != NodeStatus.DOWN) { + if(logger.isInfoEnabled()) { + logger.info("Setting Node[" + nodeId + "] to status [DOWN]"); + } + makeNodeNormal(nodeId); + stopServer(nodeId); + nodesStatus.put(nodeId, NodeStatus.DOWN); + } + } + + /** + * Make the node slow to respond to requests + * + * @param nodeId + */ + public void makeNodeSlow(int nodeId) { + if(nodesStatus.get(nodeId) != NodeStatus.SLOW) { + if(logger.isInfoEnabled()) { + logger.info("Setting Node[" + nodeId + "] to status [SLOW]"); + } + makeNodeNormal(nodeId); + sleepyStores.get(nodeId).setSleepTimeMs(100000); + nodesStatus.put(nodeId, NodeStatus.SLOW); + } + } + + /** + * Make a node throwing out PersistenceFailureException + * + * @param nodeId + */ + public void makeNodeBdbError(int nodeId) { + if(nodesStatus.get(nodeId) != NodeStatus.BDB_ERROR) { + if(logger.isInfoEnabled()) { + logger.info("Setting Node[" + nodeId + "] to status [BDB_ERROR]"); + } + makeNodeNormal(nodeId); + forceFailStores.get(nodeId).setFail(true); + nodesStatus.put(nodeId, NodeStatus.BDB_ERROR); + } + } + + /** + * Making a node to NORMAL state + * + * @param nodeId + */ + public void makeNodeNormal(int nodeId) { + NodeStatus status = nodesStatus.get(nodeId); + if(status == null) { + nodesStatus.put(nodeId, NodeStatus.NORMAL); + status = NodeStatus.NORMAL; + } + + if(status != NodeStatus.NORMAL) { + if(logger.isInfoEnabled()) { + logger.info("Setting Node[" + nodeId + "] to status [NORMAL]"); + } + } + + if(status == NodeStatus.DOWN) { + try { + startServer(nodeId); + } catch(IOException e) { + logger.error("Server " + nodeId + "failed to start", e); + } + } else if(status == NodeStatus.SLOW) { + 
sleepyStores.get(nodeId).setSleepTimeMs(0); + } else if(status == NodeStatus.BDB_ERROR) { + forceFailStores.get(nodeId).setFail(false); + } + nodesStatus.put(nodeId, NodeStatus.NORMAL); + } + + public Store getRealStore(int nodeId) { + return realStores.get(nodeId); + } + + public List routeRequest(byte[] key) { + if(routingStrategy == null) { + routingStrategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, cluster); + } + return routingStrategy.routeRequest(key); + } + + public StoreClient makeClient() { + return factory.getStoreClient(STORE_NAME); + } + + /** + * Wrap up the testing environment by making all servers normal and wait for + * all slops to be pushed + * + * @throws InterruptedException + */ + public void warpUp() throws InterruptedException { + if(logger.isInfoEnabled()) { + logger.info("Waiting for wrap up"); + } + // signal make all servers up + wrapUpRequestLatch.countDown(); + // wait for all servers to come up + wrapUpFinishLatch.await(); + if(logger.isInfoEnabled()) { + logger.info("Finished waiting for wrap up"); + logger.info("Wait for slopPusherJob"); + } + + // wait until all slops are empty + List nonEmptySlopStorageEngines = new ArrayList(); + nonEmptySlopStorageEngines.addAll(slopStorageEngines.values()); + while(nonEmptySlopStorageEngines.size() != 0) { + SlopStorageEngine slopEngine = nonEmptySlopStorageEngines.get(0); + ClosableIterator it = slopEngine.keys(); + if(it.hasNext()) { + Thread.sleep(100); + } else { + nonEmptySlopStorageEngines.remove(0); + if(logger.isDebugEnabled()) { + logger.debug("One slop has been emptied. Waiting for " + + nonEmptySlopStorageEngines.size() + " slopStores"); + } + } + } + + if(logger.isInfoEnabled()) { + logger.info("Finished waiting for slopPusherJob"); + } + } + + /** + * Starting the testing environment and wait until all Voldemort server + * instances are online + * + * @throws InterruptedException + */ + public void start() throws InterruptedException { + if(logger.isInfoEnabled()) { + logger.info("Starting up and wait"); + } + thread.start(); + startFinishLatch.await(); + if(logger.isInfoEnabled()) { + logger.info("Finished Waiting for start up"); + } + } + + /** + * Stop the testing environment + */ + public void stop() { + factory.close(); + for(Integer nodeId: voldemortServers.keySet()) { + stopServer(nodeId); + } + } +} diff --git a/test/integration/voldemort/store/slop/LongHintedHandoffTest.java b/test/integration/voldemort/store/slop/LongHintedHandoffTest.java new file mode 100644 index 0000000000..aa18bb207c --- /dev/null +++ b/test/integration/voldemort/store/slop/LongHintedHandoffTest.java @@ -0,0 +1,147 @@ +package voldemort.store.slop; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.log4j.Logger; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import voldemort.TestUtils; +import voldemort.client.StoreClient; +import voldemort.cluster.Node; +import voldemort.store.InsufficientOperationalNodesException; +import voldemort.store.Store; +import voldemort.store.memory.InMemoryPutAssertionStorageEngine; +import voldemort.utils.ByteArray; + +@RunWith(Parameterized.class) +public class LongHintedHandoffTest { + + private final Logger logger = Logger.getLogger(LongHintedHandoffTest.class); + private static 
final Long MAX_TOTAL_TIME_MS = 1000L * 60 * 5;
+    private static final Integer KEY_LENGTH = 16;
+    private static final Integer VALUE_LENGTH = 32;
+    private HintedHandoffTestEnvironment testEnv;
+    private final Integer replicationFactor;
+    private final Integer requiredWrites;
+    private final Integer preferredWrites;
+    private final Boolean zoned;
+
+    public LongHintedHandoffTest(boolean zoned,
+                                 int replicationFactor,
+                                 int requiredWrites,
+                                 int preferredWrites) {
+        this.zoned = zoned;
+        this.replicationFactor = replicationFactor;
+        this.requiredWrites = requiredWrites;
+        this.preferredWrites = preferredWrites;
+    }
+
+    @Parameterized.Parameters
+    public static Collection<Object[]> configs() {
+        return Arrays.asList(new Object[][] { { true, 3, 1, 1 }, { true, 3, 1, 2 },
+                { true, 3, 1, 3 }, { true, 3, 2, 2 }, { true, 3, 2, 3 }, { true, 3, 3, 3 },
+                { true, 2, 1, 1 }, { true, 2, 1, 2 }, { true, 2, 2, 2 }, { false, 3, 1, 1 },
+                { false, 3, 1, 2 }, { false, 3, 1, 3 }, { false, 3, 2, 2 }, { false, 3, 2, 3 },
+                { false, 3, 3, 3 }, { false, 2, 1, 1 }, { false, 2, 1, 2 }, { false, 2, 2, 2 } });
+    }
+
+    @Before
+    public void setUp() throws InterruptedException {
+        testEnv = new HintedHandoffTestEnvironment();
+        testEnv.setPreferredWrite(preferredWrites).setRequiredWrite(requiredWrites);
+        // zoned runs get the zone-aware store definition
+        if(this.zoned) {
+            testEnv.setZonedReplicationFactor(replicationFactor);
+        } else {
+            testEnv.setReplicationFactor(replicationFactor);
+        }
+        testEnv.start();
+    }
+
+    @After
+    public void tearDown() {
+        testEnv.stop();
+        logger.info("Stopped all servers");
+    }
+
+    @Test
+    public void testHintedHandoff() throws InterruptedException {
+        Set<Integer> nodeIds = new HashSet<Integer>();
+        long startMs = System.currentTimeMillis();
+        long endMs = startMs + MAX_TOTAL_TIME_MS;
+        long totalPuts = 0;
+        long numRejectedPuts = 0;
+        long numAssertPuts = 0;
+
+        StoreClient<byte[], byte[]> client = testEnv.makeClient();
+        while(true) {
+            if(System.currentTimeMillis() > endMs) {
+                break;
+            }
+            // generate key
+            ByteArray key = new ByteArray(TestUtils.randomBytes(KEY_LENGTH));
+            byte[] value = TestUtils.randomBytes(VALUE_LENGTH);
+            // put to nodes
+            try {
+                client.put(key.get(), value);
+                // if put does not throw exception
+                List<Node> routes = testEnv.routeRequest(key.get());
+                for(Node node: routes) {
+                    numAssertPuts++;
+                    nodeIds.add(node.getId());
+                    Store<ByteArray, byte[], byte[]> realStore = testEnv.getRealStore(node.getId());
+                    if(realStore instanceof InMemoryPutAssertionStorageEngine) {
+                        ((InMemoryPutAssertionStorageEngine<ByteArray, byte[], byte[]>) realStore).assertPut(key);
+                    } else {
+                        fail("realStore is not InMemoryPutAssertionStorageEngine.
Test setup is wrong"); + } + } + } catch(InsufficientOperationalNodesException e) { + numRejectedPuts++; + if(logger.isDebugEnabled()) { + logger.debug("Key " + key + " is rejected for InsufficientOperationalNodes"); + } + } finally { + totalPuts++; + } + } + + // bring all servers up + testEnv.warpUp(); + + // check + long numFailedAssertions = 0; + for(Integer nodeId: nodeIds) { + Store realStore = testEnv.getRealStore(nodeId); + if(realStore instanceof InMemoryPutAssertionStorageEngine) { + Set keys = ((InMemoryPutAssertionStorageEngine) realStore).getFailedAssertions(); + for(ByteArray key: keys) { + logger.error("key [" + key + "] is asserted but not recorded on node [" + + nodeId + "]"); + } + numFailedAssertions += keys.size(); + } else { + fail("realStore is not InMemoryPutAssertionStorageEngine"); + } + } + + logger.info("Total Client Puts Rejected (InsufficientOperationalNodes): " + numRejectedPuts); + logger.info("Total Client Put Operations: " + totalPuts); + logger.info("Total Server Put Assertions: " + numAssertPuts); + logger.info("Total Server Put Lost: " + numFailedAssertions); + + assertEquals(numFailedAssertions + " server puts are lost: " + numFailedAssertions, + 0L, + numFailedAssertions); + } +} diff --git a/test/integration/voldemort/store/slow/SlowStorageEngine.java b/test/integration/voldemort/store/slow/SlowStorageEngine.java index bc10e6001f..856c33674e 100644 --- a/test/integration/voldemort/store/slow/SlowStorageEngine.java +++ b/test/integration/voldemort/store/slow/SlowStorageEngine.java @@ -125,6 +125,12 @@ public void put(K key, Versioned value, T transforms) throws VoldemortExcepti innerStorageEngine.put(key, value, transforms); } + @Override + public List> multiVersionPut(K key, final List> values) { + delayByOp(VoldemortOpCode.PUT_OP_CODE); + return innerStorageEngine.multiVersionPut(key, values); + } + @Override public ClosableIterator>> entries() { return innerStorageEngine.entries(); diff --git a/test/long/voldemort/socketpool/E2EClientRequestExecutorPoolAndFailureDetectorTest.java b/test/long/voldemort/socketpool/E2EClientRequestExecutorPoolAndFailureDetectorTest.java index a9d5667653..828810ff54 100644 --- a/test/long/voldemort/socketpool/E2EClientRequestExecutorPoolAndFailureDetectorTest.java +++ b/test/long/voldemort/socketpool/E2EClientRequestExecutorPoolAndFailureDetectorTest.java @@ -88,7 +88,7 @@ public static List getStoreDef() { .setType(storageConfiguration) .setKeySerializer(serDef) .setValueSerializer(serDef) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setHintedHandoffStrategy(HintedHandoffStrategyType.PROXIMITY_STRATEGY) .setReplicationFactor(3) diff --git a/test/unit/voldemort/client/AdminServiceBasicTest.java b/test/unit/voldemort/client/AdminServiceBasicTest.java index 1da3a7db89..0e111a950d 100644 --- a/test/unit/voldemort/client/AdminServiceBasicTest.java +++ b/test/unit/voldemort/client/AdminServiceBasicTest.java @@ -883,7 +883,7 @@ public void testStateTransitions() { System.currentTimeMillis())); MetadataStore.VoldemortState state = getVoldemortServer(0).getMetadataStore() - .getServerState(); + .getServerStateUnlocked(); assertEquals("State should be changed correctly to rebalancing state", MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER, state); @@ -896,7 +896,7 @@ public void testStateTransitions() { .getVersion()).incremented(0, System.currentTimeMillis())); - state = 
getVoldemortServer(0).getMetadataStore().getServerState(); + state = getVoldemortServer(0).getMetadataStore().getServerStateUnlocked(); assertEquals("State should be changed correctly to rebalancing state", MetadataStore.VoldemortState.NORMAL_SERVER, state); @@ -909,7 +909,7 @@ public void testStateTransitions() { .getVersion()).incremented(0, System.currentTimeMillis())); - state = getVoldemortServer(0).getMetadataStore().getServerState(); + state = getVoldemortServer(0).getMetadataStore().getServerStateUnlocked(); assertEquals("State should be changed correctly to rebalancing state", MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER, @@ -922,7 +922,7 @@ public void testStateTransitions() { .getVersion()).incremented(0, System.currentTimeMillis())); - state = getVoldemortServer(0).getMetadataStore().getServerState(); + state = getVoldemortServer(0).getMetadataStore().getServerStateUnlocked(); assertEquals("State should be changed correctly to rebalancing state", MetadataStore.VoldemortState.NORMAL_SERVER, state); diff --git a/test/unit/voldemort/client/protocol/admin/StreamingClientTest.java b/test/unit/voldemort/client/protocol/admin/StreamingClientTest.java index 6226cc1f1e..21002972b1 100644 --- a/test/unit/voldemort/client/protocol/admin/StreamingClientTest.java +++ b/test/unit/voldemort/client/protocol/admin/StreamingClientTest.java @@ -32,9 +32,9 @@ import voldemort.server.VoldemortServer; import voldemort.store.StoreDefinition; import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.bdb.BdbStorageConfiguration; import voldemort.store.compress.CompressionStrategy; import voldemort.store.compress.CompressionStrategyFactory; -import voldemort.store.memory.InMemoryStorageConfiguration; import voldemort.store.socket.SocketStoreFactory; import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; import voldemort.utils.ByteArray; @@ -80,10 +80,10 @@ public static void testSetup() { serverPorts = new int[TOTAL_SERVERS]; storeDef = new StoreDefinitionBuilder().setName(TEST_STORE_NAME) - .setType(InMemoryStorageConfiguration.TYPE_NAME) + .setType(BdbStorageConfiguration.TYPE_NAME) .setKeySerializer(new SerializerDefinition("string")) .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setReplicationFactor(2) .setPreferredReads(1) diff --git a/test/unit/voldemort/client/rebalance/AbstractNonZonedRebalanceTest.java b/test/unit/voldemort/client/rebalance/AbstractNonZonedRebalanceTest.java new file mode 100644 index 0000000000..1626ea4b14 --- /dev/null +++ b/test/unit/voldemort/client/rebalance/AbstractNonZonedRebalanceTest.java @@ -0,0 +1,1349 @@ +/* + * Copyright 2008-2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.client.rebalance; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotSame; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.TestUtils; +import voldemort.client.ClientConfig; +import voldemort.client.DefaultStoreClient; +import voldemort.client.RoutingTier; +import voldemort.client.SocketStoreClientFactory; +import voldemort.client.StoreClient; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.routing.RoutingStrategyType; +import voldemort.routing.StoreRoutingPlan; +import voldemort.serialization.SerializerDefinition; +import voldemort.serialization.json.JsonReader; +import voldemort.server.VoldemortServer; +import voldemort.store.InvalidMetadataException; +import voldemort.store.Store; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.UnreachableStoreException; +import voldemort.store.bdb.BdbStorageConfiguration; +import voldemort.store.metadata.MetadataStore; +import voldemort.store.metadata.MetadataStore.VoldemortState; +import voldemort.store.readonly.JsonStoreBuilder; +import voldemort.store.readonly.ReadOnlyStorageConfiguration; +import voldemort.store.readonly.ReadOnlyStorageEngineTestInstance; +import voldemort.store.readonly.ReadOnlyStorageFormat; +import voldemort.store.readonly.swapper.AdminStoreSwapper; +import voldemort.utils.ByteArray; +import voldemort.utils.ByteUtils; +import voldemort.utils.RebalanceUtils; +import voldemort.versioning.ClockEntry; +import voldemort.versioning.ObsoleteVersionException; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; +import voldemort.xml.StoreDefinitionsMapper; + +import com.google.common.collect.Lists; + +public abstract class AbstractNonZonedRebalanceTest extends AbstractRebalanceTest { + + private static final Logger logger = Logger.getLogger(AbstractNonZonedRebalanceTest.class.getName()); + + protected static int NUM_RO_CHUNKS_PER_BUCKET = 10; + protected static String testStoreNameRW = "test"; + protected static String testStoreNameRW2 = "test2"; + protected static String testStoreNameRO = "test-ro"; + + protected static String storeDefFileWithoutReplication; + protected static String storeDefFileWithReplication; + protected static String roStoreDefFileWithReplication; + protected static String rwStoreDefFileWithReplication; + protected static String rwTwoStoreDefFileWithReplication; + + private List storeDefWithoutReplication; + private List storeDefWithReplication; + private StoreDefinition roStoreDefWithoutReplication; + private StoreDefinition rwStoreDefWithoutReplication; + private 
StoreDefinition roStoreDefWithReplication; + private StoreDefinition rwStoreDefWithReplication; + private StoreDefinition rwStoreDefWithReplication2; + + public AbstractNonZonedRebalanceTest(boolean useNio, boolean useDonorBased) { + super(useNio, useDonorBased); + } + + @Before + public void setUp() throws IOException { + // First without replication + roStoreDefWithoutReplication = new StoreDefinitionBuilder().setName(testStoreNameRO) + .setType(ReadOnlyStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(1) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + rwStoreDefWithoutReplication = new StoreDefinitionBuilder().setName(testStoreNameRW) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(1) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + + storeDefWithoutReplication = Lists.newArrayList(roStoreDefWithoutReplication, + rwStoreDefWithoutReplication); + String storeDefWithoutReplicationString = new StoreDefinitionsMapper().writeStoreList(storeDefWithoutReplication); + File file = File.createTempFile("two-stores-", ".xml"); + FileUtils.writeStringToFile(file, storeDefWithoutReplicationString); + storeDefFileWithoutReplication = file.getAbsolutePath(); + + // Now with replication + + roStoreDefWithReplication = new StoreDefinitionBuilder().setName(testStoreNameRO) + .setType(ReadOnlyStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + file = File.createTempFile("ro-stores-", ".xml"); + FileUtils.writeStringToFile(file, + new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(roStoreDefWithReplication))); + roStoreDefFileWithReplication = file.getAbsolutePath(); + + rwStoreDefWithReplication = new StoreDefinitionBuilder().setName(testStoreNameRW) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + rwStoreDefWithReplication2 = new StoreDefinitionBuilder().setName(testStoreNameRW2) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + + file = File.createTempFile("rw-stores-", ".xml"); + 
FileUtils.writeStringToFile(file, + new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(rwStoreDefWithReplication))); + rwStoreDefFileWithReplication = file.getAbsolutePath(); + + file = File.createTempFile("rw-two-stores-", ".xml"); + FileUtils.writeStringToFile(file, + new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(rwStoreDefWithReplication, + rwStoreDefWithReplication2))); + rwTwoStoreDefFileWithReplication = file.getAbsolutePath(); + + storeDefWithReplication = Lists.newArrayList(roStoreDefWithReplication, + rwStoreDefWithReplication); + String storeDefWithReplicationString = new StoreDefinitionsMapper().writeStoreList(storeDefWithReplication); + file = File.createTempFile("two-stores-", ".xml"); + FileUtils.writeStringToFile(file, storeDefWithReplicationString); + storeDefFileWithReplication = file.getAbsolutePath(); + } + + @After + public void tearDown() { + testEntries.clear(); + testEntries = null; + socketStoreFactory.close(); + socketStoreFactory = null; + } + + @Test(timeout = 600000) + public void testRORWRebalance() throws Exception { + logger.info("Starting testRORWRebalance"); + try { + Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] { + { 0, 1, 2, 3, 4, 5, 6, 7, 8 }, {} }); + + Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 1, + Lists.newArrayList(2, 3)); + + // start servers 0 , 1 only + List serverList = Arrays.asList(0, 1); + Map configProps = new HashMap(); + configProps.put("admin.max.threads", "50"); + currentCluster = startServers(currentCluster, + storeDefFileWithoutReplication, + serverList, + configProps); + // Update the cluster information based on the node information + targetCluster = updateCluster(targetCluster); + + RebalanceClientConfig config = new RebalanceClientConfig(); + config.setDeleteAfterRebalancingEnabled(true); + RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, + 0), + config); + try { + + // Populate the two stores + populateData(currentCluster, + roStoreDefWithoutReplication, + rebalanceClient.getAdminClient(), + true); + + populateData(currentCluster, + rwStoreDefWithoutReplication, + rebalanceClient.getAdminClient(), + false); + + rebalanceAndCheck(currentCluster, + targetCluster, + storeDefWithoutReplication, + rebalanceClient, + Arrays.asList(1)); + + checkConsistentMetadata(targetCluster, serverList); + } finally { + // stop servers + stopServer(serverList); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testRORWRebalance ", ae); + throw ae; + } + } + + @Test(timeout = 600000) + public void testRORWRebalanceWithReplication() throws Exception { + logger.info("Starting testRORWRebalanceWithReplication"); + try { + Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] { + { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } }); + + Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 1, + Lists.newArrayList(2, 3)); + + // start servers 0 , 1 only + List serverList = Arrays.asList(0, 1); + Map configProps = new HashMap(); + configProps.put("admin.max.threads", "5"); + + currentCluster = startServers(currentCluster, + storeDefFileWithReplication, + serverList, + configProps); + // Update the cluster information based on the node information + targetCluster = updateCluster(targetCluster); + + RebalanceClientConfig config = new RebalanceClientConfig(); + config.setDeleteAfterRebalancingEnabled(true); + RebalanceController rebalanceClient = new 
RebalanceController(getBootstrapUrl(currentCluster,
+                                                                                          0),
+                                                                          config);
+            try {
+                // Populate the two stores
+                populateData(currentCluster,
+                             roStoreDefWithReplication,
+                             rebalanceClient.getAdminClient(),
+                             true);
+
+                populateData(currentCluster,
+                             rwStoreDefWithReplication,
+                             rebalanceClient.getAdminClient(),
+                             false);
+
+                rebalanceAndCheck(currentCluster,
+                                  targetCluster,
+                                  storeDefWithReplication,
+                                  rebalanceClient,
+                                  Arrays.asList(0, 1));
+                checkConsistentMetadata(targetCluster, serverList);
+            } finally {
+                // stop servers
+                stopServer(serverList);
+            }
+        } catch(AssertionError ae) {
+            logger.error("Assertion broken in testRORWRebalanceWithReplication ", ae);
+            throw ae;
+        }
+    }
+
+    @Test(timeout = 600000)
+    public void testRORebalanceWithReplication() throws Exception {
+        logger.info("Starting testRORebalanceWithReplication");
+        try {
+            Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] {
+                    { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } });
+
+            Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster,
+                                                                        1,
+                                                                        Lists.newArrayList(2, 3));
+
+            // start servers 0, 1 only
+            List serverList = Arrays.asList(0, 1);
+
+            // If this test fails, consider increasing the number of admin
+            // threads. In particular, this test can fail when the
+            // RejectedExecutionHandler in SocketServer.java fires with an
+            // error message like the following:
+            // "[18:46:32,994 voldemort.server.socket.SocketServer[admin-server]]
+            // ERROR Too many open connections, 20 of 20 threads in use, denying
+            // connection from /127.0.0.1:43756 [Thread-552]". Note: this issue
+            // seems to affect only ThreadPoolBasedNonblockingStoreImpl tests
+            // rather than Nio-based tests.
+            Map configProps = new HashMap();
+            configProps.put("admin.max.threads", "5");
+            currentCluster = startServers(currentCluster,
+                                          roStoreDefFileWithReplication,
+                                          serverList,
+                                          configProps);
+            // Update the cluster information based on the node information
+            targetCluster = updateCluster(targetCluster);
+
+            RebalanceClientConfig config = new RebalanceClientConfig();
+            config.setDeleteAfterRebalancingEnabled(true);
+            RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster,
+                                                                                          0),
+                                                                          config);
+            try {
+                populateData(currentCluster,
+                             roStoreDefWithReplication,
+                             rebalanceClient.getAdminClient(),
+                             true);
+
+                rebalanceAndCheck(currentCluster,
+                                  targetCluster,
+                                  Lists.newArrayList(roStoreDefWithReplication),
+                                  rebalanceClient,
+                                  Arrays.asList(0, 1));
+                checkConsistentMetadata(targetCluster, serverList);
+            } finally {
+                // stop servers
+                stopServer(serverList);
+            }
+        } catch(AssertionError ae) {
+            logger.error("Assertion broken in testRORebalanceWithReplication ", ae);
+            throw ae;
+        }
+    }
+
+    @Test(timeout = 600000)
+    public void testRWRebalanceWithReplication() throws Exception {
+        logger.info("Starting testRWRebalanceWithReplication");
+        try {
+            Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] {
+                    { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } });
+
+            Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster,
+                                                                        1,
+                                                                        Lists.newArrayList(2, 3));
+
+            // start servers 0, 1 only
+            List serverList = Arrays.asList(0, 1);
+            currentCluster = startServers(currentCluster,
+                                          rwStoreDefFileWithReplication,
+                                          serverList,
+                                          null);
+            // Update the cluster information based on the node information
+            targetCluster = updateCluster(targetCluster);
+
+            RebalanceClientConfig config = new RebalanceClientConfig();
+            config.setDeleteAfterRebalancingEnabled(true);
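+            // The useDonorBased flag passed into the constructor drives the
+            // rebalancing mode for every RW test below: donor-based runs turn
+            // stealer-based rebalancing off, and vice versa.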
config.setStealerBasedRebalancing(!useDonorBased); + RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, + 0), + config); + try { + populateData(currentCluster, + rwStoreDefWithReplication, + rebalanceClient.getAdminClient(), + false); + + rebalanceAndCheck(currentCluster, + targetCluster, + Lists.newArrayList(rwStoreDefWithReplication), + rebalanceClient, + Arrays.asList(0, 1)); + checkConsistentMetadata(targetCluster, serverList); + } finally { + // stop servers + stopServer(serverList); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testRWRebalanceWithReplication ", ae); + throw ae; + } + } + + @Test(timeout = 600000) + public void testRebalanceCleanPrimary() throws Exception { + logger.info("Starting testRebalanceCleanPrimary"); + try { + Cluster currentCluster = ServerTestUtils.getLocalCluster(3, new int[][] { { 0 }, + { 1, 3 }, { 2 } }); + + Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 2, + Lists.newArrayList(3)); + + // start servers 0 , 1, 2 + Map configProps = new HashMap(); + configProps.put("enable.repair", "true"); + List serverList = Arrays.asList(0, 1, 2); + currentCluster = startServers(currentCluster, + rwStoreDefFileWithReplication, + serverList, + configProps); + // Update the cluster information based on the node information + targetCluster = updateCluster(targetCluster); + + RebalanceClientConfig config = new RebalanceClientConfig(); + config.setDeleteAfterRebalancingEnabled(false); + config.setStealerBasedRebalancing(!useDonorBased); + RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, + 0), + config); + try { + populateData(currentCluster, + rwStoreDefWithReplication, + rebalanceClient.getAdminClient(), + false); + + AdminClient admin = rebalanceClient.getAdminClient(); + // Figure out the positive and negative keys to check + List negativeTestKeyList = sampleKeysFromPartition(admin, + 1, + rwStoreDefWithReplication.getName(), + Arrays.asList(3), + 20); + List positiveTestKeyList = sampleKeysFromPartition(admin, + 1, + rwStoreDefWithReplication.getName(), + Arrays.asList(1), + 20); + + rebalanceAndCheck(currentCluster, + targetCluster, + Lists.newArrayList(rwStoreDefWithReplication), + rebalanceClient, + Arrays.asList(0, 1, 2)); + checkConsistentMetadata(targetCluster, serverList); + + // Do the cleanup operation + for(int i = 0; i < 3; i++) { + admin.storeMntOps.repairJob(i); + } + // wait for the repairs to complete + for(int i = 0; i < 3; i++) { + ServerTestUtils.waitForAsyncOperationOnServer(serverMap.get(i), "Repair", 5000); + } + + // do the positive tests + checkForKeyExistence(admin, + 1, + rwStoreDefWithReplication.getName(), + positiveTestKeyList); + // do the negative tests + checkForKeyNonExistence(admin, + 1, + rwStoreDefWithReplication.getName(), + negativeTestKeyList); + + logger.info("[Primary] Successful clean after Rebalancing"); + } finally { + // stop servers + stopServer(serverList); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testRebalanceCleanPrimary ", ae); + throw ae; + } + } + + @Test(timeout = 600000) + public void testRebalanceCleanSecondary() throws Exception { + logger.info("Starting testRebalanceCleanSecondary"); + try { + Cluster currentCluster = ServerTestUtils.getLocalCluster(3, new int[][] { { 0, 3 }, + { 1 }, { 2 } }); + + Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 2, + Lists.newArrayList(3)); + + // start servers 0 , 1, 2 + Map 
configProps = new HashMap();
+            configProps.put("enable.repair", "true");
+            List serverList = Arrays.asList(0, 1, 2);
+            currentCluster = startServers(currentCluster,
+                                          rwStoreDefFileWithReplication,
+                                          serverList,
+                                          configProps);
+            // Update the cluster information based on the node information
+            targetCluster = updateCluster(targetCluster);
+
+            RebalanceClientConfig config = new RebalanceClientConfig();
+            config.setDeleteAfterRebalancingEnabled(false);
+            config.setStealerBasedRebalancing(!useDonorBased);
+            RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster,
+                                                                                          0),
+                                                                          config);
+            try {
+                populateData(currentCluster,
+                             rwStoreDefWithReplication,
+                             rebalanceClient.getAdminClient(),
+                             false);
+
+                AdminClient admin = rebalanceClient.getAdminClient();
+                // Figure out the positive and negative keys to check
+                List negativeTestKeyList = sampleKeysFromPartition(admin,
+                                                                   1,
+                                                                   rwStoreDefWithReplication.getName(),
+                                                                   Arrays.asList(3),
+                                                                   20);
+                List positiveTestKeyList = sampleKeysFromPartition(admin,
+                                                                   0,
+                                                                   rwStoreDefWithReplication.getName(),
+                                                                   Arrays.asList(3),
+                                                                   20);
+
+                rebalanceAndCheck(currentCluster,
+                                  targetCluster,
+                                  Lists.newArrayList(rwStoreDefWithReplication),
+                                  rebalanceClient,
+                                  Arrays.asList(0, 1, 2));
+                checkConsistentMetadata(targetCluster, serverList);
+
+                // Do the cleanup operation
+                for(int i = 0; i < 3; i++) {
+                    admin.storeMntOps.repairJob(i);
+                }
+                // wait for the repairs to complete
+                for(int i = 0; i < 3; i++) {
+                    ServerTestUtils.waitForAsyncOperationOnServer(serverMap.get(i), "Repair", 5000);
+                }
+
+                // do the positive tests
+                checkForKeyExistence(admin,
+                                     0,
+                                     rwStoreDefWithReplication.getName(),
+                                     positiveTestKeyList);
+                // do the negative tests
+                checkForKeyNonExistence(admin,
+                                        1,
+                                        rwStoreDefWithReplication.getName(),
+                                        negativeTestKeyList);
+
+                logger.info("[Secondary] Successful clean after Rebalancing");
+            } finally {
+                // stop servers
+                stopServer(serverList);
+            }
+        } catch(AssertionError ae) {
+            logger.error("Assertion broken in testRebalanceCleanSecondary ", ae);
+            throw ae;
+        }
+    }
+
+    @Test(timeout = 600000)
+    public void testRWRebalanceFourNodes() throws Exception {
+        logger.info("Starting testRWRebalanceFourNodes");
+        try {
+            Cluster currentCluster = ServerTestUtils.getLocalCluster(4, new int[][] {
+                    { 0, 1, 4, 7, 9 }, { 2, 3, 5, 6, 8 }, {}, {} });
+
+            ArrayList nodes = Lists.newArrayList(currentCluster.getNodes());
+            int totalPortNum = nodes.size() * 3;
+            int[] ports = new int[totalPortNum];
+            for(int i = 0; i < nodes.size(); i++) {
+                ports[i * 3] = nodes.get(i).getHttpPort();
+                ports[i * 3 + 1] = nodes.get(i).getSocketPort();
+                ports[i * 3 + 2] = nodes.get(i).getAdminPort();
+            }
+
+            Cluster targetCluster = ServerTestUtils.getLocalCluster(4, ports, new int[][] {
+                    { 0, 4, 7 }, { 2, 8 }, { 1, 6 }, { 3, 5, 9 } });
+
+            // start servers
+            List serverList = Arrays.asList(0, 1, 2, 3);
+            currentCluster = startServers(currentCluster,
+                                          rwTwoStoreDefFileWithReplication,
+                                          serverList,
+                                          null);
+            // Update the cluster information based on the node information
+            targetCluster = updateCluster(targetCluster);
+
+            RebalanceClientConfig config = new RebalanceClientConfig();
+            config.setDeleteAfterRebalancingEnabled(true);
+            config.setStealerBasedRebalancing(!useDonorBased);
+            config.setPrimaryPartitionBatchSize(100);
+            config.setMaxParallelRebalancing(5);
+            RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster,
+                                                                                          0),
+                                                                          config);
+            try {
+                populateData(currentCluster,
+                             rwStoreDefWithReplication,
rebalanceClient.getAdminClient(), + false); + + populateData(currentCluster, + rwStoreDefWithReplication2, + rebalanceClient.getAdminClient(), + false); + + rebalanceAndCheck(currentCluster, + targetCluster, + Lists.newArrayList(rwStoreDefWithReplication, + rwStoreDefWithReplication2), + rebalanceClient, + serverList); + checkConsistentMetadata(targetCluster, serverList); + } catch(Exception e) { + fail(e.getMessage()); + } finally { + // stop servers + stopServer(serverList); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testRWRebalanceFourNodes ", ae); + throw ae; + } + } + + @Test(timeout = 600000) + public void testRWRebalanceSerial() throws Exception { + logger.info("Starting testRWRebalanceSerial"); + try { + Cluster currentCluster = ServerTestUtils.getLocalCluster(4, new int[][] { + { 0, 1, 4, 7, 9 }, { 2, 3, 5, 6, 8 }, {}, {} }); + + ArrayList nodes = Lists.newArrayList(currentCluster.getNodes()); + int totalPortNum = nodes.size() * 3; + int[] ports = new int[totalPortNum]; + for(int i = 0; i < nodes.size(); i++) { + ports[i * 3] = nodes.get(i).getHttpPort(); + ports[i * 3 + 1] = nodes.get(i).getSocketPort(); + ports[i * 3 + 2] = nodes.get(i).getAdminPort(); + } + + Cluster targetCluster = ServerTestUtils.getLocalCluster(4, ports, new int[][] { + { 0, 4, 7 }, { 2, 8 }, { 1, 6 }, { 3, 5, 9 } }); + + // start servers + Map serverProps = new HashMap(); + serverProps.put("max.parallel.stores.rebalancing", String.valueOf(1)); + List serverList = Arrays.asList(0, 1, 2, 3); + currentCluster = startServers(currentCluster, + rwTwoStoreDefFileWithReplication, + serverList, + serverProps); + // Update the cluster information based on the node information + targetCluster = updateCluster(targetCluster); + + RebalanceClientConfig config = new RebalanceClientConfig(); + config.setDeleteAfterRebalancingEnabled(true); + config.setStealerBasedRebalancing(!useDonorBased); + config.setPrimaryPartitionBatchSize(100); + config.setMaxParallelRebalancing(5); + RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, + 0), + config); + try { + populateData(currentCluster, + rwStoreDefWithReplication, + rebalanceClient.getAdminClient(), + false); + + populateData(currentCluster, + rwStoreDefWithReplication2, + rebalanceClient.getAdminClient(), + false); + + rebalanceAndCheck(currentCluster, + targetCluster, + Lists.newArrayList(rwStoreDefWithReplication, + rwStoreDefWithReplication2), + rebalanceClient, + serverList); + checkConsistentMetadata(targetCluster, serverList); + } catch(Exception e) { + fail(e.getMessage()); + } finally { + // stop servers + stopServer(serverList); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testRWRebalanceSerial ", ae); + throw ae; + } + } + + @Test(timeout = 600000) + public void testProxyGetDuringRebalancing() throws Exception { + logger.info("Starting testProxyGetDuringRebalancing"); + try { + final Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] { + { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } }); + + final Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 1, + Lists.newArrayList(2, + 3)); + // start servers 0 , 1 only + final List serverList = Arrays.asList(0, 1); + Map configProps = new HashMap(); + configProps.put("admin.max.threads", "5"); + final Cluster updatedCurrentCluster = startServers(currentCluster, + storeDefFileWithReplication, + serverList, + configProps); + final Cluster updatedTargetCluster = updateCluster(targetCluster); + + 
ExecutorService executors = Executors.newFixedThreadPool(2); + final AtomicBoolean rebalancingComplete = new AtomicBoolean(false); + final List exceptions = Collections.synchronizedList(new ArrayList()); + + RebalanceClientConfig rebalanceClientConfig = new RebalanceClientConfig(); + rebalanceClientConfig.setMaxParallelRebalancing(2); + // We are forced to use stealer based since RO does not support + // donor + // based rebalancing yet + rebalanceClientConfig.setStealerBasedRebalancing(true); + + final RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(updatedCurrentCluster, + 0), + rebalanceClientConfig); + + // Populate the two stores + populateData(updatedCurrentCluster, + roStoreDefWithReplication, + rebalanceClient.getAdminClient(), + true); + + populateData(updatedCurrentCluster, + rwStoreDefWithReplication, + rebalanceClient.getAdminClient(), + false); + + final SocketStoreClientFactory factory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(getBootstrapUrl(updatedCurrentCluster, + 0)) + .setEnableLazy(false) + .setSocketTimeout(120, + TimeUnit.SECONDS)); + + final StoreClient storeClientRW = new DefaultStoreClient(testStoreNameRW, + null, + factory, + 3); + + final StoreClient storeClientRO = new DefaultStoreClient(testStoreNameRO, + null, + factory, + 3); + + final CountDownLatch latch = new CountDownLatch(2); + // start get operation. + executors.execute(new Runnable() { + + @Override + public void run() { + try { + List keys = new ArrayList(testEntries.keySet()); + + while(!rebalancingComplete.get()) { + // should always able to get values. + int index = (int) (Math.random() * keys.size()); + + // should get a valid value + try { + Versioned value = storeClientRW.get(keys.get(index)); + assertNotSame("StoreClient get() should not return null.", + null, + value); + assertEquals("Value returned should be good", + new Versioned(testEntries.get(keys.get(index))), + value); + + value = storeClientRO.get(keys.get(index)); + assertNotSame("StoreClient get() should not return null.", + null, + value); + assertEquals("Value returned should be good", + new Versioned(testEntries.get(keys.get(index))), + value); + + } catch(Exception e) { + logger.error("Exception in online thread", e); + exceptions.add(e); + } finally { + latch.countDown(); + } + } + } catch(Exception e) { + logger.error("Exception in proxy get thread", e); + exceptions.add(e); + } finally { + factory.close(); + } + } + + }); + + executors.execute(new Runnable() { + + @Override + public void run() { + try { + + Thread.sleep(500); + rebalanceAndCheck(updatedCurrentCluster, + updatedTargetCluster, + storeDefWithReplication, + rebalanceClient, + Arrays.asList(0, 1)); + Thread.sleep(500); + rebalancingComplete.set(true); + checkConsistentMetadata(updatedTargetCluster, serverList); + } catch(Exception e) { + exceptions.add(e); + logger.error("Exception in rebalancing thread", e); + } finally { + // stop servers + try { + stopServer(serverList); + } catch(Exception e) { + throw new RuntimeException(e); + } + latch.countDown(); + } + } + }); + + latch.await(); + executors.shutdown(); + executors.awaitTermination(300, TimeUnit.SECONDS); + + // check No Exception + if(exceptions.size() > 0) { + for(Exception e: exceptions) { + e.printStackTrace(); + } + fail("Should not see any exceptions."); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testProxyGetDuringRebalancing ", ae); + throw ae; + } + } + + @Test(timeout = 600000) + public void 
testProxyPutDuringRebalancing() throws Exception {
+        logger.info("Starting testProxyPutDuringRebalancing");
+        try {
+            Cluster currentCluster = ServerTestUtils.getLocalCluster(3, new int[][] { { 0 },
+                    { 1, 3 }, { 2 } });
+
+            Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster,
+                                                                        2,
+                                                                        Lists.newArrayList(3));
+
+            // start servers 0, 1, 2 only
+            final List serverList = Arrays.asList(0, 1, 2);
+            Map configProps = new HashMap();
+            configProps.put("admin.max.threads", "5");
+            final Cluster updatedCurrentCluster = startServers(currentCluster,
+                                                               rwStoreDefFileWithReplication,
+                                                               serverList,
+                                                               configProps);
+            final Cluster updatedTargetCluster = updateCluster(targetCluster);
+
+            ExecutorService executors = Executors.newFixedThreadPool(2);
+            final AtomicBoolean rebalancingComplete = new AtomicBoolean(false);
+            final List exceptions = Collections.synchronizedList(new ArrayList());
+
+            RebalanceClientConfig rebalanceClientConfig = new RebalanceClientConfig();
+            rebalanceClientConfig.setMaxParallelRebalancing(2);
+            // It is imperative that we test in a single shot, since multiple
+            // batches would mean the proxy bridges being torn down and
+            // re-established multiple times, in which case we could no longer
+            // test against the source cluster topology.
+            rebalanceClientConfig.setPrimaryPartitionBatchSize(Integer.MAX_VALUE);
+            rebalanceClientConfig.setStealerBasedRebalancing(!useDonorBased);
+
+            final RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(updatedCurrentCluster,
+                                                                                                0),
+                                                                                rebalanceClientConfig);
+
+            populateData(updatedCurrentCluster,
+                         rwStoreDefWithReplication,
+                         rebalanceClient.getAdminClient(),
+                         false);
+
+            final AdminClient adminClient = rebalanceClient.getAdminClient();
+            // the plan would cause these partitions to move:
+            // Partition : Donor -> Stealer
+            // p2 (SEC)  : 1 -> 0
+            // p3 (PRI)  : 1 -> 2
+            final List movingKeysList = sampleKeysFromPartition(adminClient,
+                                                                1,
+                                                                rwStoreDefWithReplication.getName(),
+                                                                Arrays.asList(2, 3),
+                                                                20);
+            assertTrue("Empty list of moving keys...", movingKeysList.size() > 0);
+            final AtomicBoolean rebalancingStarted = new AtomicBoolean(false);
+            final AtomicBoolean proxyWritesDone = new AtomicBoolean(false);
+            final HashMap baselineTuples = new HashMap(testEntries);
+            final HashMap baselineVersions = new HashMap();
+
+            for(String key: baselineTuples.keySet()) {
+                baselineVersions.put(key, new VectorClock());
+            }
+
+            final CountDownLatch latch = new CountDownLatch(2);
+            // start proxy put operation.
+            executors.execute(new Runnable() {
+
+                @Override
+                public void run() {
+                    SocketStoreClientFactory factory = null;
+                    try {
+                        // wait for the rebalancing to begin.
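+                        // The loop below polls each server's metadata store
+                        // (SERVER_STATE_KEY) and drops a server from the list
+                        // once it reports REBALANCING_MASTER_SERVER; when the
+                        // list drains, rebalancing has started everywhere.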
+                        List serverList = Lists.newArrayList(serverMap.get(0),
+                                                             serverMap.get(2));
+                        while(!rebalancingComplete.get()) {
+                            Iterator serverIterator = serverList.iterator();
+                            while(serverIterator.hasNext()) {
+                                VoldemortServer server = serverIterator.next();
+                                if(ByteUtils.getString(server.getMetadataStore()
+                                                             .get(MetadataStore.SERVER_STATE_KEY,
+                                                                  null)
+                                                             .get(0)
+                                                             .getValue(),
+                                                       "UTF-8")
+                                            .compareTo(VoldemortState.REBALANCING_MASTER_SERVER.toString()) == 0) {
+                                    logger.info("Server " + server.getIdentityNode().getId()
+                                                + " transitioned into REBALANCING MODE");
+                                    serverIterator.remove();
+                                }
+                            }
+                            if(serverList.size() == 0) {
+                                rebalancingStarted.set(true);
+                                break;
+                            }
+                        }
+
+                        if(!rebalancingComplete.get()) {
+                            factory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(getBootstrapUrl(updatedCurrentCluster,
+                                                                                                                       0))
+                                                                                     .setEnableLazy(false)
+                                                                                     .setSocketTimeout(120,
+                                                                                                       TimeUnit.SECONDS));
+
+                            final StoreClient storeClientRW = new DefaultStoreClient(testStoreNameRW,
+                                                                                     null,
+                                                                                     factory,
+                                                                                     3);
+                            // Now perform some writes and determine the end
+                            // state of the changed keys. Initially, all data
+                            // starts out with a zero vector clock.
+                            for(ByteArray movingKey: movingKeysList) {
+                                try {
+                                    if(rebalancingComplete.get()) {
+                                        break;
+                                    }
+                                    String keyStr = ByteUtils.getString(movingKey.get(), "UTF-8");
+                                    String valStr = "proxy_write";
+                                    storeClientRW.put(keyStr, valStr);
+                                    baselineTuples.put(keyStr, valStr);
+                                    // all these keys will end up with a [2:1]
+                                    // vector clock, since node 2 is the pseudo
+                                    // master in both moves
+                                    baselineVersions.get(keyStr)
+                                                    .incrementVersion(2, System.currentTimeMillis());
+                                    proxyWritesDone.set(true);
+                                } catch(InvalidMetadataException e) {
+                                    // let this go
+                                    logger.error("Encountered an invalid metadata exception.. ", e);
+                                }
+                            }
+                        }
+                    } catch(Exception e) {
+                        logger.error("Exception in proxy put thread", e);
+                        exceptions.add(e);
+                    } finally {
+                        if(factory != null)
+                            factory.close();
+                        latch.countDown();
+                    }
+                }
+
+            });
+
+            executors.execute(new Runnable() {
+
+                @Override
+                public void run() {
+                    try {
+                        rebalanceClient.rebalance(updatedTargetCluster);
+                    } catch(Exception e) {
+                        logger.error("Error in rebalancing... ", e);
+                        exceptions.add(e);
+                    } finally {
+                        rebalancingComplete.set(true);
+                        latch.countDown();
+                    }
+                }
+            });
+
+            latch.await();
+            executors.shutdown();
+            executors.awaitTermination(300, TimeUnit.SECONDS);
+
+            assertEquals("Client did not see all servers transition into rebalancing state",
+                         true,
+                         rebalancingStarted.get());
+            assertEquals("Not enough time to begin proxy writing", true, proxyWritesDone.get());
+            checkEntriesPostRebalance(updatedCurrentCluster,
+                                      updatedTargetCluster,
+                                      Lists.newArrayList(rwStoreDefWithReplication),
+                                      Arrays.asList(0, 1, 2),
+                                      baselineTuples,
+                                      baselineVersions);
+            checkConsistentMetadata(updatedTargetCluster, serverList);
+            // check No Exception
+            if(exceptions.size() > 0) {
+
+                for(Exception e: exceptions) {
+                    e.printStackTrace();
+                }
+                fail("Should not see any exceptions.");
+            }
+            // check that the proxy writes were made to the original donor,
+            // node 1: roll the cluster metadata on every node back to the
+            // source topology so the moved keys can be verified against it
+            List clockEntries = new ArrayList(serverList.size());
+            for(Integer nodeid: serverList)
+                clockEntries.add(new ClockEntry(nodeid.shortValue(), System.currentTimeMillis()));
+            VectorClock clusterXmlClock = new VectorClock(clockEntries, System.currentTimeMillis());
+            for(Integer nodeid: serverList)
+                adminClient.metadataMgmtOps.updateRemoteCluster(nodeid,
+                                                                currentCluster,
+                                                                clusterXmlClock);
+
+            adminClient.setAdminClientCluster(currentCluster);
+            checkForTupleEquivalence(adminClient,
+                                     1,
+                                     testStoreNameRW,
+                                     movingKeysList,
+                                     baselineTuples,
+                                     baselineVersions);
+
+            // stop servers
+            try {
+                stopServer(serverList);
+            } catch(Exception e) {
+                throw new RuntimeException(e);
+            }
+        } catch(AssertionError ae) {
+            logger.error("Assertion broken in testProxyPutDuringRebalancing ", ae);
+            throw ae;
+        }
+    }
+
+    @Test(timeout = 600000)
+    public void testServerSideRouting() throws Exception {
+        logger.info("Starting testServerSideRouting");
+        try {
+            final Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] {
+                    { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } });
+
+            final Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster,
+                                                                              1,
+                                                                              Lists.newArrayList(2,
+                                                                                                 3));
+
+            final List serverList = Arrays.asList(0, 1);
+            Map configProps = new HashMap();
+            configProps.put("admin.max.threads", "50");
+            final Cluster updatedCurrentCluster = startServers(currentCluster,
+                                                               storeDefFileWithReplication,
+                                                               serverList,
+                                                               configProps);
+            final Cluster updatedTargetCluster = updateCluster(targetCluster);
+
+            ExecutorService executors = Executors.newFixedThreadPool(2);
+            final AtomicBoolean rebalancingToken = new AtomicBoolean(false);
+            final List exceptions = Collections.synchronizedList(new ArrayList());
+
+            // populate data now.
+ RebalanceClientConfig rebalanceClientConfig = new RebalanceClientConfig(); + rebalanceClientConfig.setMaxParallelRebalancing(2); + + final RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(updatedCurrentCluster, + 0), + rebalanceClientConfig); + + // Populate the two stores + populateData(updatedCurrentCluster, + roStoreDefWithReplication, + rebalanceClient.getAdminClient(), + true); + + populateData(updatedCurrentCluster, + rwStoreDefWithReplication, + rebalanceClient.getAdminClient(), + false); + + Node node = updatedCurrentCluster.getNodeById(1); + final Store serverSideRoutingStoreRW = getSocketStore(testStoreNameRW, + node.getHost(), + node.getSocketPort(), + true); + final Store serverSideRoutingStoreRO = getSocketStore(testStoreNameRO, + node.getHost(), + node.getSocketPort(), + true); + + final CountDownLatch latch = new CountDownLatch(1); + + // start get operation. + executors.execute(new Runnable() { + + public void run() { + try { + List keys = new ArrayList(testEntries.keySet()); + + while(!rebalancingToken.get()) { + // should always able to get values. + int index = (int) (Math.random() * keys.size()); + + // should get a valid value + try { + List> values = serverSideRoutingStoreRW.get(new ByteArray(ByteUtils.getBytes(keys.get(index), + "UTF-8")), + null); + + assertEquals("serverSideRoutingStore should return value.", + 1, + values.size()); + assertEquals("Value returned should be good", + new Versioned(testEntries.get(keys.get(index))), + new Versioned(ByteUtils.getString(values.get(0) + .getValue(), + "UTF-8"), + values.get(0).getVersion())); + values = serverSideRoutingStoreRO.get(new ByteArray(ByteUtils.getBytes(keys.get(index), + "UTF-8")), + null); + + assertEquals("serverSideRoutingStore should return value.", + 1, + values.size()); + assertEquals("Value returned should be good", + new Versioned(testEntries.get(keys.get(index))), + new Versioned(ByteUtils.getString(values.get(0) + .getValue(), + "UTF-8"), + values.get(0).getVersion())); + + } catch(UnreachableStoreException e) { + // ignore + } catch(Exception e) { + exceptions.add(e); + } + } + + latch.countDown(); + } catch(Exception e) { + exceptions.add(e); + } + } + + }); + + executors.execute(new Runnable() { + + public void run() { + try { + Thread.sleep(500); + rebalanceAndCheck(updatedCurrentCluster, + updatedTargetCluster, + storeDefWithReplication, + rebalanceClient, + Arrays.asList(0, 1)); + + Thread.sleep(500); + rebalancingToken.set(true); + checkConsistentMetadata(targetCluster, serverList); + } catch(Exception e) { + exceptions.add(e); + } finally { + // stop servers as soon as the client thread has exited + // its + // loop. 
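+                        // latch.await(...) below is the handshake: the client
+                        // thread counts the latch down once rebalancingToken
+                        // flips, so the servers are only stopped after the
+                        // last get() has completed.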
+ try { + latch.await(300, TimeUnit.SECONDS); + stopServer(serverList); + } catch(Exception e) { + throw new RuntimeException(e); + } + } + } + }); + + executors.shutdown(); + executors.awaitTermination(300, TimeUnit.SECONDS); + + // check No Exception + if(exceptions.size() > 0) { + for(Exception e: exceptions) { + e.printStackTrace(); + } + fail("Should not see any exceptions !!"); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testServerSideRouting ", ae); + throw ae; + } + } + + protected void populateData(Cluster cluster, + StoreDefinition storeDef, + AdminClient adminClient, + boolean isReadOnly) throws Exception { + + // Populate Read write stores + if(!isReadOnly) { + // Create SocketStores for each Node first + Map> storeMap = new HashMap>(); + for(Node node: cluster.getNodes()) { + storeMap.put(node.getId(), + getSocketStore(storeDef.getName(), + node.getHost(), + node.getSocketPort())); + + } + + StoreRoutingPlan storeInstance = new StoreRoutingPlan(cluster, storeDef); + for(Entry entry: testEntries.entrySet()) { + ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(entry.getKey(), "UTF-8")); + List preferenceNodes = storeInstance.getReplicationNodeList(keyBytes.get()); + + // Go over every node + for(int nodeId: preferenceNodes) { + try { + storeMap.get(nodeId) + .put(keyBytes, + new Versioned(ByteUtils.getBytes(entry.getValue(), + "UTF-8")), + null); + } catch(ObsoleteVersionException e) { + logger.info("Why are we seeing this at all here ?? "); + e.printStackTrace(); + } + } + } + + // close all socket stores + for(Store store: storeMap.values()) { + store.close(); + } + + } else { + // Populate Read only stores + + File baseDir = TestUtils.createTempDir(); + JsonReader reader = ReadOnlyStorageEngineTestInstance.makeTestDataReader(testEntries, + baseDir); + + RoutingStrategy router = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, + cluster); + + File outputDir = TestUtils.createTempDir(baseDir); + JsonStoreBuilder storeBuilder = new JsonStoreBuilder(reader, + cluster, + storeDef, + router, + outputDir, + null, + testEntries.size() / 5, + 1, + NUM_RO_CHUNKS_PER_BUCKET, + 10000, + false); + storeBuilder.build(ReadOnlyStorageFormat.READONLY_V2); + + AdminStoreSwapper swapper = new AdminStoreSwapper(cluster, + Executors.newFixedThreadPool(cluster.getNumberOfNodes()), + adminClient, + 100000); + swapper.swapStoreData(testStoreNameRO, outputDir.getAbsolutePath(), 1L); + } + } +} \ No newline at end of file diff --git a/test/unit/voldemort/client/rebalance/AbstractRebalanceTest.java b/test/unit/voldemort/client/rebalance/AbstractRebalanceTest.java index acf18a6800..073a7ea973 100644 --- a/test/unit/voldemort/client/rebalance/AbstractRebalanceTest.java +++ b/test/unit/voldemort/client/rebalance/AbstractRebalanceTest.java @@ -13,222 +13,99 @@ * License for the specific language governing permissions and limitations under * the License. 
*/ - package voldemort.client.rebalance; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotSame; import static org.junit.Assert.fail; -import java.io.File; -import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Properties; import java.util.Set; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; - -import org.apache.commons.io.FileUtils; -import org.apache.log4j.Logger; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; import voldemort.ServerTestUtils; import voldemort.TestUtils; -import voldemort.client.ClientConfig; -import voldemort.client.DefaultStoreClient; -import voldemort.client.RoutingTier; -import voldemort.client.SocketStoreClientFactory; -import voldemort.client.StoreClient; +import voldemort.VoldemortException; import voldemort.client.protocol.RequestFormatType; import voldemort.client.protocol.admin.AdminClient; +import voldemort.client.protocol.admin.QueryKeyResult; import voldemort.cluster.Cluster; import voldemort.cluster.Node; import voldemort.routing.RoutingStrategy; import voldemort.routing.RoutingStrategyFactory; -import voldemort.routing.RoutingStrategyType; -import voldemort.serialization.SerializerDefinition; -import voldemort.serialization.json.JsonReader; +import voldemort.routing.StoreRoutingPlan; +import voldemort.server.VoldemortConfig; +import voldemort.server.VoldemortServer; import voldemort.store.Store; import voldemort.store.StoreDefinition; -import voldemort.store.StoreDefinitionBuilder; -import voldemort.store.UnreachableStoreException; -import voldemort.store.bdb.BdbStorageConfiguration; import voldemort.store.metadata.MetadataStore; -import voldemort.store.readonly.JsonStoreBuilder; -import voldemort.store.readonly.ReadOnlyStorageConfiguration; -import voldemort.store.readonly.ReadOnlyStorageEngineTestInstance; -import voldemort.store.readonly.ReadOnlyStorageFormat; -import voldemort.store.readonly.swapper.AdminStoreSwapper; +import voldemort.store.metadata.MetadataStore.VoldemortState; import voldemort.store.socket.SocketStoreFactory; import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; import voldemort.utils.ByteArray; import voldemort.utils.ByteUtils; -import voldemort.utils.KeyLocationValidation; -import voldemort.utils.NodeUtils; import voldemort.utils.Pair; import voldemort.utils.RebalanceUtils; -import voldemort.utils.StoreInstance; import voldemort.utils.Utils; -import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.VectorClock; import voldemort.versioning.Versioned; -import voldemort.xml.StoreDefinitionsMapper; - -import com.google.common.collect.Lists; public abstract class AbstractRebalanceTest { - private static final Logger logger = Logger.getLogger(AbstractRebalanceTest.class.getName()); - - protected static int NUM_RO_CHUNKS_PER_BUCKET = 10; - protected static String testStoreNameRW = "test"; - protected static String testStoreNameRW2 = "test2"; - protected static String testStoreNameRO = "test-ro"; + Map serverMap; - protected static String storeDefFileWithoutReplication; - protected static String storeDefFileWithReplication; - protected static String 
roStoreDefFileWithReplication; - protected static String rwStoreDefFileWithReplication; - protected static String rwTwoStoreDefFileWithReplication; - - private List storeDefWithoutReplication; - private List storeDefWithReplication; - private StoreDefinition roStoreDefWithoutReplication; - private StoreDefinition rwStoreDefWithoutReplication; - private StoreDefinition roStoreDefWithReplication; - private StoreDefinition rwStoreDefWithReplication; - private StoreDefinition rwStoreDefWithReplication2; - - protected SocketStoreFactory socketStoreFactory; + protected final boolean useNio; + protected final boolean useDonorBased; HashMap testEntries; + protected SocketStoreFactory socketStoreFactory; - @Before - public void setUp() throws IOException { + public AbstractRebalanceTest(boolean useNio, boolean useDonorBased) { + this.useNio = useNio; + this.useDonorBased = useDonorBased; + this.serverMap = new HashMap(); testEntries = ServerTestUtils.createRandomKeyValueString(getNumKeys()); socketStoreFactory = new ClientRequestExecutorPool(2, 10000, 100000, 32 * 1024); - - // First without replication - roStoreDefWithoutReplication = new StoreDefinitionBuilder().setName(testStoreNameRO) - .setType(ReadOnlyStorageConfiguration.TYPE_NAME) - .setKeySerializer(new SerializerDefinition("string")) - .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) - .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) - .setReplicationFactor(1) - .setPreferredReads(1) - .setRequiredReads(1) - .setPreferredWrites(1) - .setRequiredWrites(1) - .build(); - rwStoreDefWithoutReplication = new StoreDefinitionBuilder().setName(testStoreNameRW) - .setType(BdbStorageConfiguration.TYPE_NAME) - .setKeySerializer(new SerializerDefinition("string")) - .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) - .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) - .setReplicationFactor(1) - .setPreferredReads(1) - .setRequiredReads(1) - .setPreferredWrites(1) - .setRequiredWrites(1) - .build(); - - storeDefWithoutReplication = Lists.newArrayList(roStoreDefWithoutReplication, - rwStoreDefWithoutReplication); - String storeDefWithoutReplicationString = new StoreDefinitionsMapper().writeStoreList(storeDefWithoutReplication); - File file = File.createTempFile("two-stores-", ".xml"); - FileUtils.writeStringToFile(file, storeDefWithoutReplicationString); - storeDefFileWithoutReplication = file.getAbsolutePath(); - - // Now with replication - - roStoreDefWithReplication = new StoreDefinitionBuilder().setName(testStoreNameRO) - .setType(ReadOnlyStorageConfiguration.TYPE_NAME) - .setKeySerializer(new SerializerDefinition("string")) - .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) - .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) - .setReplicationFactor(2) - .setPreferredReads(1) - .setRequiredReads(1) - .setPreferredWrites(1) - .setRequiredWrites(1) - .build(); - file = File.createTempFile("ro-stores-", ".xml"); - FileUtils.writeStringToFile(file, - new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(roStoreDefWithReplication))); - roStoreDefFileWithReplication = file.getAbsolutePath(); - - rwStoreDefWithReplication = new StoreDefinitionBuilder().setName(testStoreNameRW) - .setType(BdbStorageConfiguration.TYPE_NAME) - .setKeySerializer(new SerializerDefinition("string")) - .setValueSerializer(new SerializerDefinition("string")) - 
.setRoutingPolicy(RoutingTier.SERVER) - .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) - .setReplicationFactor(2) - .setPreferredReads(1) - .setRequiredReads(1) - .setPreferredWrites(1) - .setRequiredWrites(1) - .build(); - rwStoreDefWithReplication2 = new StoreDefinitionBuilder().setName(testStoreNameRW2) - .setType(BdbStorageConfiguration.TYPE_NAME) - .setKeySerializer(new SerializerDefinition("string")) - .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) - .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) - .setReplicationFactor(2) - .setPreferredReads(1) - .setRequiredReads(1) - .setPreferredWrites(1) - .setRequiredWrites(1) - .build(); - - file = File.createTempFile("rw-stores-", ".xml"); - FileUtils.writeStringToFile(file, - new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(rwStoreDefWithReplication))); - rwStoreDefFileWithReplication = file.getAbsolutePath(); - - file = File.createTempFile("rw-two-stores-", ".xml"); - FileUtils.writeStringToFile(file, - new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(rwStoreDefWithReplication, - rwStoreDefWithReplication2))); - rwTwoStoreDefFileWithReplication = file.getAbsolutePath(); - - storeDefWithReplication = Lists.newArrayList(roStoreDefWithReplication, - rwStoreDefWithReplication); - String storeDefWithReplicationString = new StoreDefinitionsMapper().writeStoreList(storeDefWithReplication); - file = File.createTempFile("two-stores-", ".xml"); - FileUtils.writeStringToFile(file, storeDefWithReplicationString); - storeDefFileWithReplication = file.getAbsolutePath(); - } - - @After - public void tearDown() { - testEntries.clear(); - testEntries = null; - socketStoreFactory.close(); - socketStoreFactory = null; } - // TODO: Any way to not throw exception from here? - protected abstract Cluster startServers(Cluster cluster, - String StoreDefXmlFile, - List nodeToStart, - Map configProps) throws Exception; + // This method is susceptible to BindException issues due to TOCTOU + // problem with getLocalCluster (which is used to construct cluster that is + // passed in). + // TODO: Refactor AbstractRebalanceTest to take advantage of + // ServerTestUtils.startVoldemortCluster. 
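+    // In brief, the default implementation below starts each requested node
+    // with the caller's configProps plus proxy puts force-enabled, and
+    // registers every server in serverMap so that getCurrentState(...) and
+    // getCurrentCluster(...) can read its metadata store later. Typical use
+    // from the tests looks like:
+    //
+    //     Map configProps = new HashMap();
+    //     configProps.put("admin.max.threads", "50");
+    //     currentCluster = startServers(currentCluster, storeDefFile,
+    //                                   Arrays.asList(0, 1), configProps);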
+ protected Cluster startServers(Cluster cluster, + String storeXmlFile, + List nodeToStart, + Map configProps) throws Exception { + for(int node: nodeToStart) { + Properties properties = new Properties(); + if(null != configProps) { + for(Entry property: configProps.entrySet()) { + properties.put(property.getKey(), property.getValue()); + } + } + // turn proxy puts on + properties.put("proxy.puts.during.rebalance", "true"); + + VoldemortConfig config = ServerTestUtils.createServerConfig(useNio, + node, + TestUtils.createTempDir() + .getAbsolutePath(), + null, + storeXmlFile, + properties); + + VoldemortServer server = ServerTestUtils.startVoldemortServer(socketStoreFactory, + config, + cluster); + serverMap.put(node, server); + } - protected abstract void stopServer(List nodesToStop) throws Exception; + return cluster; + } protected Cluster updateCluster(Cluster template) { return template; @@ -252,11 +129,23 @@ protected Store getSocketStore(String storeName, isRouted); } - protected abstract Cluster getCurrentCluster(int nodeId); - - protected abstract MetadataStore.VoldemortState getCurrentState(int nodeId); + protected VoldemortState getCurrentState(int nodeId) { + VoldemortServer server = serverMap.get(nodeId); + if(server == null) { + throw new VoldemortException("Node id " + nodeId + " does not exist"); + } else { + return server.getMetadataStore().getServerStateUnlocked(); + } + } - protected abstract boolean useDonorBased(); + protected Cluster getCurrentCluster(int nodeId) { + VoldemortServer server = serverMap.get(nodeId); + if(server == null) { + throw new VoldemortException("Node id " + nodeId + " does not exist"); + } else { + return server.getMetadataStore().getCluster(); + } + } public void checkConsistentMetadata(Cluster targetCluster, List serverList) { for(int nodeId: serverList) { @@ -265,6 +154,16 @@ public void checkConsistentMetadata(Cluster targetCluster, List serverL } } + protected void stopServer(List nodesToStop) throws Exception { + for(int node: nodesToStop) { + try { + ServerTestUtils.stopVoldemortServer(serverMap.get(node)); + } catch(VoldemortException e) { + // ignore these at stop time + } + } + } + /** * This method determines the "size" of the test to run... 
* @@ -272,908 +171,31 @@ public void checkConsistentMetadata(Cluster targetCluster, List serverL */ protected abstract int getNumKeys(); - @Test(timeout = 600000) - public void testRORWRebalance() throws Exception { - logger.info("Starting testRORWRebalance"); - Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] { - { 0, 1, 2, 3, 4, 5, 6, 7, 8 }, {} }); - - Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, - 1, - Lists.newArrayList(2, 3)); - - // start servers 0 , 1 only - List serverList = Arrays.asList(0, 1); - Map configProps = new HashMap(); - configProps.put("admin.max.threads", "50"); - currentCluster = startServers(currentCluster, - storeDefFileWithoutReplication, - serverList, - configProps); - // Update the cluster information based on the node information - targetCluster = updateCluster(targetCluster); - - RebalanceClientConfig config = new RebalanceClientConfig(); - config.setDeleteAfterRebalancingEnabled(true); - RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, - 0), - config); - try { - - // Populate the two stores - populateData(currentCluster, - roStoreDefWithoutReplication, - rebalanceClient.getAdminClient(), - true); - - populateData(currentCluster, - rwStoreDefWithoutReplication, - rebalanceClient.getAdminClient(), - false); - - rebalanceAndCheck(currentCluster, - targetCluster, - storeDefWithoutReplication, - rebalanceClient, - Arrays.asList(1)); - - checkConsistentMetadata(targetCluster, serverList); - } finally { - // stop servers - stopServer(serverList); - } - } - - @Test(timeout = 600000) - public void testRORWRebalanceWithReplication() throws Exception { - logger.info("Starting testRORWRebalanceWithReplication"); - Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] { - { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } }); - - Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, - 1, - Lists.newArrayList(2, 3)); - - // start servers 0 , 1 only - List serverList = Arrays.asList(0, 1); - Map configProps = new HashMap(); - configProps.put("admin.max.threads", "50"); - - currentCluster = startServers(currentCluster, - storeDefFileWithReplication, - serverList, - configProps); - // Update the cluster information based on the node information - targetCluster = updateCluster(targetCluster); - - RebalanceClientConfig config = new RebalanceClientConfig(); - config.setDeleteAfterRebalancingEnabled(true); - RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, - 0), - config); - try { - // Populate the two stores - populateData(currentCluster, - roStoreDefWithReplication, - rebalanceClient.getAdminClient(), - true); - - populateData(currentCluster, - rwStoreDefWithReplication, - rebalanceClient.getAdminClient(), - false); - - rebalanceAndCheck(currentCluster, - targetCluster, - storeDefWithReplication, - rebalanceClient, - Arrays.asList(0, 1)); - checkConsistentMetadata(targetCluster, serverList); - } finally { - // stop servers - stopServer(serverList); - } - } - - @Test(timeout = 600000) - public void testRORebalanceWithReplication() throws Exception { - logger.info("Starting testRORebalanceWithReplication"); - Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] { - { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } }); - - Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, - 1, - Lists.newArrayList(2, 3)); - - // start servers 0 , 1 only - List serverList = Arrays.asList(0, 1); - - // If 
this test fails, consider increasing the number of admin threads. - // In particular, if this test fails by RejectedExecutionHandler in - // SocketServer.java fires with an error message like the following: - // "[18:46:32,994 voldemort.server.socket.SocketServer[admin-server]] - // ERROR Too many open connections, 20 of 20 threads in use, denying - // connection from /127.0.0.1:43756 [Thread-552]". Note, this issues - // seems to only affect ThreadPoolBasedNonblockingStoreImpl tests rather - // than Nio-based tests. - Map configProps = new HashMap(); - configProps.put("admin.max.threads", "50"); - currentCluster = startServers(currentCluster, - roStoreDefFileWithReplication, - serverList, - configProps); - // Update the cluster information based on the node information - targetCluster = updateCluster(targetCluster); - - RebalanceClientConfig config = new RebalanceClientConfig(); - config.setDeleteAfterRebalancingEnabled(true); - RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, - 0), - config); - try { - populateData(currentCluster, - roStoreDefWithReplication, - rebalanceClient.getAdminClient(), - true); - - rebalanceAndCheck(currentCluster, - targetCluster, - Lists.newArrayList(roStoreDefWithReplication), - rebalanceClient, - Arrays.asList(0, 1)); - checkConsistentMetadata(targetCluster, serverList); - } finally { - // stop servers - stopServer(serverList); - } - } - - @Test(timeout = 600000) - public void testRWRebalanceWithReplication() throws Exception { - logger.info("Starting testRWRebalanceWithReplication"); - Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] { - { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } }); - - Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, - 1, - Lists.newArrayList(2, 3)); - - // start servers 0 , 1 only - List serverList = Arrays.asList(0, 1); - currentCluster = startServers(currentCluster, - rwStoreDefFileWithReplication, - serverList, - null); - // Update the cluster information based on the node information - targetCluster = updateCluster(targetCluster); - - RebalanceClientConfig config = new RebalanceClientConfig(); - config.setDeleteAfterRebalancingEnabled(true); - config.setStealerBasedRebalancing(!useDonorBased()); - RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, - 0), - config); - try { - populateData(currentCluster, - rwStoreDefWithReplication, - rebalanceClient.getAdminClient(), - false); - - rebalanceAndCheck(currentCluster, - targetCluster, - Lists.newArrayList(rwStoreDefWithReplication), - rebalanceClient, - Arrays.asList(0, 1)); - checkConsistentMetadata(targetCluster, serverList); - } finally { - // stop servers - stopServer(serverList); - } - } - - @Test(timeout = 600000) - public void testRebalanceCleanPrimary() throws Exception { - logger.info("Starting testRebalanceCleanPrimary"); - Cluster currentCluster = ServerTestUtils.getLocalCluster(3, new int[][] { { 0 }, { 1, 3 }, - { 2 } }); - - Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, - 2, - Lists.newArrayList(3)); - - // start servers 0 , 1, 2 - Map configProps = new HashMap(); - configProps.put("enable.repair", "true"); - List serverList = Arrays.asList(0, 1, 2); - currentCluster = startServers(currentCluster, - rwStoreDefFileWithReplication, - serverList, - configProps); - // Update the cluster information based on the node information - targetCluster = updateCluster(targetCluster); - - RebalanceClientConfig config = new 
RebalanceClientConfig(); - config.setDeleteAfterRebalancingEnabled(false); - RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, - 0), - config); - try { - populateData(currentCluster, - rwStoreDefWithReplication, - rebalanceClient.getAdminClient(), - false); - - // Figure out the positive and negative keys to check - ByteArray[] checkKeysNegative = new ByteArray[20]; - List movedPartitions = new ArrayList(); - movedPartitions.add(3); - AdminClient admin = rebalanceClient.getAdminClient(); - Iterator keys = null; - keys = admin.bulkFetchOps.fetchKeys(1, - rwStoreDefWithReplication.getName(), - movedPartitions, - null, - false); - int keyIndex = 0; - while(keys.hasNext() && keyIndex < 20) { - checkKeysNegative[keyIndex++] = keys.next(); - } - ByteArray[] checkKeysPositive = new ByteArray[20]; - List stablePartitions = new ArrayList(); - stablePartitions.add(1); - Iterator keys2 = null; - keys2 = admin.bulkFetchOps.fetchKeys(1, - rwStoreDefWithReplication.getName(), - stablePartitions, - null, - false); - int keyIndex2 = 0; - while(keys2.hasNext() && keyIndex2 < 20) { - checkKeysPositive[keyIndex2++] = keys2.next(); - } - - rebalanceAndCheck(currentCluster, - targetCluster, - Lists.newArrayList(rwStoreDefWithReplication), - rebalanceClient, - Arrays.asList(0, 1, 2)); - checkConsistentMetadata(targetCluster, serverList); - - // Do the cleanup operation - - for(int i = 0; i < 3; i++) { - admin.storeMntOps.repairJob(i); - } - - boolean cleanNode = true; - for(int i = 0; i < keyIndex; i++) { - KeyLocationValidation val = new KeyLocationValidation(targetCluster, - 1, - rwStoreDefWithReplication, - checkKeysNegative[i]); - if(!val.validate(false)) - cleanNode = false; - } - for(int i = 0; i < keyIndex2; i++) { - KeyLocationValidation val = new KeyLocationValidation(targetCluster, - 1, - rwStoreDefWithReplication, - checkKeysPositive[i]); - if(!val.validate(true)) - cleanNode = false; - } - if(cleanNode) - System.out.println("[Primary] Successful clean after Rebalancing"); - else - System.out.println("[Primary] Rebalancing not clean"); - - } finally { - // stop servers - stopServer(serverList); - } - } - - @Test(timeout = 600000) - public void testRebalanceCleanSecondary() throws Exception { - logger.info("Starting testRebalanceCleanSecondary"); - Cluster currentCluster = ServerTestUtils.getLocalCluster(3, new int[][] { { 0, 3 }, { 1 }, - { 2 } }); - - Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, - 2, - Lists.newArrayList(3)); - - // start servers 0 , 1, 2 - Map configProps = new HashMap(); - configProps.put("enable.repair", "true"); - List serverList = Arrays.asList(0, 1, 2); - currentCluster = startServers(currentCluster, - rwStoreDefFileWithReplication, - serverList, - configProps); - // Update the cluster information based on the node information - targetCluster = updateCluster(targetCluster); - - RebalanceClientConfig config = new RebalanceClientConfig(); - config.setDeleteAfterRebalancingEnabled(false); - RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, - 0), - config); - try { - populateData(currentCluster, - rwStoreDefWithReplication, - rebalanceClient.getAdminClient(), - false); - - // Figure out the positive and negative keys to check - ByteArray[] checkKeysNegative = new ByteArray[20]; - List movedPartitions = new ArrayList(); - movedPartitions.add(3); - AdminClient admin = rebalanceClient.getAdminClient(); - Iterator keys = null; - keys = admin.bulkFetchOps.fetchKeys(1, 
- rwStoreDefWithReplication.getName(), - movedPartitions, - null, - false); - int keyIndex = 0; - while(keys.hasNext() && keyIndex < 20) { - checkKeysNegative[keyIndex++] = keys.next(); - } - - ByteArray[] checkKeysPositive = new ByteArray[20]; - List stablePartitions = new ArrayList(); - stablePartitions.add(3); - Iterator keys2 = null; - keys2 = admin.bulkFetchOps.fetchKeys(0, - rwStoreDefWithReplication.getName(), - stablePartitions, - null, - false); - int keyIndex2 = 0; - while(keys2.hasNext() && keyIndex2 < 20) { - checkKeysPositive[keyIndex2++] = keys2.next(); - } - - rebalanceAndCheck(currentCluster, - targetCluster, - Lists.newArrayList(rwStoreDefWithReplication), - rebalanceClient, - Arrays.asList(0, 1, 2)); - checkConsistentMetadata(targetCluster, serverList); - - // Do the cleanup operation - - for(int i = 0; i < 3; i++) { - admin.storeMntOps.repairJob(i); - } - - boolean cleanNode = true; - for(int i = 0; i < keyIndex; i++) { - KeyLocationValidation val = new KeyLocationValidation(targetCluster, - 1, - rwStoreDefWithReplication, - checkKeysNegative[i]); - if(!val.validate(false)) - cleanNode = false; - } - for(int i = 0; i < keyIndex2; i++) { - KeyLocationValidation val = new KeyLocationValidation(targetCluster, - 0, - rwStoreDefWithReplication, - checkKeysPositive[i]); - if(!val.validate(true)) - cleanNode = false; - } - if(cleanNode) - System.out.println("[Secondary] Successful clean after Rebalancing"); - else - System.out.println("[Secondary] Rebalancing not clean"); - - } finally { - // stop servers - stopServer(serverList); - } - } - - @Test(timeout = 600000) - public void testRWRebalanceFourNodes() throws Exception { - logger.info("Starting testRWRebalanceFourNodes"); - Cluster currentCluster = ServerTestUtils.getLocalCluster(4, new int[][] { - { 0, 1, 4, 7, 9 }, { 2, 3, 5, 6, 8 }, {}, {} }); - - ArrayList nodes = Lists.newArrayList(currentCluster.getNodes()); - int totalPortNum = nodes.size() * 3; - int[] ports = new int[totalPortNum]; - for(int i = 0; i < nodes.size(); i++) { - ports[i * 3] = nodes.get(i).getHttpPort(); - ports[i * 3 + 1] = nodes.get(i).getSocketPort(); - ports[i * 3 + 2] = nodes.get(i).getAdminPort(); - } - - Cluster targetCluster = ServerTestUtils.getLocalCluster(4, ports, new int[][] { - { 0, 4, 7 }, { 2, 8 }, { 1, 6 }, { 3, 5, 9 } }); - - // start servers - List serverList = Arrays.asList(0, 1, 2, 3); - currentCluster = startServers(currentCluster, - rwTwoStoreDefFileWithReplication, - serverList, - null); - // Update the cluster information based on the node information - targetCluster = updateCluster(targetCluster); - - RebalanceClientConfig config = new RebalanceClientConfig(); - config.setDeleteAfterRebalancingEnabled(true); - config.setStealerBasedRebalancing(!useDonorBased()); - config.setPrimaryPartitionBatchSize(100); - config.setMaxParallelRebalancing(5); - RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, - 0), - config); - try { - populateData(currentCluster, - rwStoreDefWithReplication, - rebalanceClient.getAdminClient(), - false); - - populateData(currentCluster, - rwStoreDefWithReplication2, - rebalanceClient.getAdminClient(), - false); - - rebalanceAndCheck(currentCluster, - targetCluster, - Lists.newArrayList(rwStoreDefWithReplication, - rwStoreDefWithReplication2), - rebalanceClient, - serverList); - checkConsistentMetadata(targetCluster, serverList); - } catch(Exception e) { - fail(e.getMessage()); - } finally { - // stop servers - stopServer(serverList); - } - } - - @Test(timeout = 
600000) - public void testRWRebalanceSerial() throws Exception { - logger.info("Starting testRWRebalanceSerial"); - Cluster currentCluster = ServerTestUtils.getLocalCluster(4, new int[][] { - { 0, 1, 4, 7, 9 }, { 2, 3, 5, 6, 8 }, {}, {} }); - - ArrayList nodes = Lists.newArrayList(currentCluster.getNodes()); - int totalPortNum = nodes.size() * 3; - int[] ports = new int[totalPortNum]; - for(int i = 0; i < nodes.size(); i++) { - ports[i * 3] = nodes.get(i).getHttpPort(); - ports[i * 3 + 1] = nodes.get(i).getSocketPort(); - ports[i * 3 + 2] = nodes.get(i).getAdminPort(); - } - - Cluster targetCluster = ServerTestUtils.getLocalCluster(4, ports, new int[][] { - { 0, 4, 7 }, { 2, 8 }, { 1, 6 }, { 3, 5, 9 } }); - - // start servers - Map serverProps = new HashMap(); - serverProps.put("max.parallel.stores.rebalancing", String.valueOf(1)); - List serverList = Arrays.asList(0, 1, 2, 3); - currentCluster = startServers(currentCluster, - rwTwoStoreDefFileWithReplication, - serverList, - serverProps); - // Update the cluster information based on the node information - targetCluster = updateCluster(targetCluster); - - RebalanceClientConfig config = new RebalanceClientConfig(); - config.setDeleteAfterRebalancingEnabled(true); - config.setStealerBasedRebalancing(!useDonorBased()); - config.setPrimaryPartitionBatchSize(100); - config.setMaxParallelRebalancing(5); - RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, - 0), - config); - try { - populateData(currentCluster, - rwStoreDefWithReplication, - rebalanceClient.getAdminClient(), - false); - - populateData(currentCluster, - rwStoreDefWithReplication2, - rebalanceClient.getAdminClient(), - false); - - rebalanceAndCheck(currentCluster, - targetCluster, - Lists.newArrayList(rwStoreDefWithReplication, - rwStoreDefWithReplication2), - rebalanceClient, - serverList); - checkConsistentMetadata(targetCluster, serverList); - } catch(Exception e) { - fail(e.getMessage()); - } finally { - // stop servers - stopServer(serverList); - } - } - - @Test(timeout = 600000) - public void testProxyGetDuringRebalancing() throws Exception { - logger.info("Starting testProxyGetDuringRebalancing"); - final Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] { - { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } }); - - final Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, - 1, - Lists.newArrayList(2, 3)); - // start servers 0 , 1 only - final List serverList = Arrays.asList(0, 1); - Map configProps = new HashMap(); - configProps.put("admin.max.threads", "50"); - final Cluster updatedCurrentCluster = startServers(currentCluster, - storeDefFileWithReplication, - serverList, - configProps); - final Cluster updatedTargetCluster = updateCluster(targetCluster); - - ExecutorService executors = Executors.newFixedThreadPool(2); - final AtomicBoolean rebalancingToken = new AtomicBoolean(false); - final List exceptions = Collections.synchronizedList(new ArrayList()); - - RebalanceClientConfig rebalanceClientConfig = new RebalanceClientConfig(); - rebalanceClientConfig.setMaxParallelRebalancing(2); - - final RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(updatedCurrentCluster, - 0), - rebalanceClientConfig); - - // Populate the two stores - populateData(updatedCurrentCluster, - roStoreDefWithReplication, - rebalanceClient.getAdminClient(), - true); - - populateData(updatedCurrentCluster, - rwStoreDefWithReplication, - rebalanceClient.getAdminClient(), - false); - - final 
SocketStoreClientFactory factory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(getBootstrapUrl(updatedCurrentCluster, - 0)) - .setEnableLazy(false) - .setSocketTimeout(120, - TimeUnit.SECONDS)); - - final StoreClient storeClientRW = new DefaultStoreClient(testStoreNameRW, - null, - factory, - 3); - - final StoreClient storeClientRO = new DefaultStoreClient(testStoreNameRO, - null, - factory, - 3); - - // start get operation. - executors.execute(new Runnable() { - - public void run() { - try { - List keys = new ArrayList(testEntries.keySet()); - - while(!rebalancingToken.get()) { - // should always able to get values. - int index = (int) (Math.random() * keys.size()); - - // should get a valid value - try { - Versioned value = storeClientRW.get(keys.get(index)); - assertNotSame("StoreClient get() should not return null.", null, value); - assertEquals("Value returned should be good", - new Versioned(testEntries.get(keys.get(index))), - value); - - value = storeClientRO.get(keys.get(index)); - assertNotSame("StoreClient get() should not return null.", null, value); - assertEquals("Value returned should be good", - new Versioned(testEntries.get(keys.get(index))), - value); - - } catch(Exception e) { - e.printStackTrace(); - exceptions.add(e); - } - } - - } catch(Exception e) { - exceptions.add(e); - } finally { - factory.close(); - } - } - - }); - - executors.execute(new Runnable() { - - public void run() { - try { - - Thread.sleep(500); - rebalanceAndCheck(updatedCurrentCluster, - updatedTargetCluster, - storeDefWithReplication, - rebalanceClient, - Arrays.asList(0, 1)); - Thread.sleep(500); - rebalancingToken.set(true); - checkConsistentMetadata(updatedTargetCluster, serverList); - - } catch(Exception e) { - exceptions.add(e); - } finally { - // stop servers - try { - stopServer(serverList); - } catch(Exception e) { - throw new RuntimeException(e); - } - } - } - }); - - executors.shutdown(); - executors.awaitTermination(300, TimeUnit.SECONDS); - - // check No Exception - if(exceptions.size() > 0) { - for(Exception e: exceptions) { - e.printStackTrace(); - } - fail("Should not see any exceptions."); - } - } - - @Test(timeout = 600000) - public void testServerSideRouting() throws Exception { - logger.info("Starting testServerSideRouting"); - final Cluster currentCluster = ServerTestUtils.getLocalCluster(2, new int[][] { - { 0, 1, 2, 3, 4, 5, 6 }, { 7, 8 } }); - - final Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, - 1, - Lists.newArrayList(2, 3)); - - final List serverList = Arrays.asList(0, 1); - Map configProps = new HashMap(); - configProps.put("admin.max.threads", "50"); - final Cluster updatedCurrentCluster = startServers(currentCluster, - storeDefFileWithReplication, - serverList, - configProps); - final Cluster updatedTargetCluster = updateCluster(targetCluster); - - ExecutorService executors = Executors.newFixedThreadPool(2); - final AtomicBoolean rebalancingToken = new AtomicBoolean(false); - final List exceptions = Collections.synchronizedList(new ArrayList()); - - // populate data now. 
- RebalanceClientConfig rebalanceClientConfig = new RebalanceClientConfig(); - rebalanceClientConfig.setMaxParallelRebalancing(2); - - final RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(updatedCurrentCluster, - 0), - rebalanceClientConfig); - - // Populate the two stores - populateData(updatedCurrentCluster, - roStoreDefWithReplication, - rebalanceClient.getAdminClient(), - true); - - populateData(updatedCurrentCluster, - rwStoreDefWithReplication, - rebalanceClient.getAdminClient(), - false); - - Node node = updatedCurrentCluster.getNodeById(1); - final Store serverSideRoutingStoreRW = getSocketStore(testStoreNameRW, - node.getHost(), - node.getSocketPort(), - true); - final Store serverSideRoutingStoreRO = getSocketStore(testStoreNameRO, - node.getHost(), - node.getSocketPort(), - true); - - final CountDownLatch latch = new CountDownLatch(1); - - // start get operation. - executors.execute(new Runnable() { - - public void run() { - try { - List keys = new ArrayList(testEntries.keySet()); - - while(!rebalancingToken.get()) { - // should always able to get values. - int index = (int) (Math.random() * keys.size()); - - // should get a valid value - try { - List> values = serverSideRoutingStoreRW.get(new ByteArray(ByteUtils.getBytes(keys.get(index), - "UTF-8")), - null); - - assertEquals("serverSideRoutingStore should return value.", - 1, - values.size()); - assertEquals("Value returned should be good", - new Versioned(testEntries.get(keys.get(index))), - new Versioned(ByteUtils.getString(values.get(0) - .getValue(), - "UTF-8"), - values.get(0).getVersion())); - values = serverSideRoutingStoreRO.get(new ByteArray(ByteUtils.getBytes(keys.get(index), - "UTF-8")), - null); - - assertEquals("serverSideRoutingStore should return value.", - 1, - values.size()); - assertEquals("Value returned should be good", - new Versioned(testEntries.get(keys.get(index))), - new Versioned(ByteUtils.getString(values.get(0) - .getValue(), - "UTF-8"), - values.get(0).getVersion())); - - } catch(UnreachableStoreException e) { - // ignore - } catch(Exception e) { - exceptions.add(e); - } - } - - latch.countDown(); - } catch(Exception e) { - exceptions.add(e); - } - } - - }); - - executors.execute(new Runnable() { - - public void run() { - try { - Thread.sleep(500); - rebalanceAndCheck(updatedCurrentCluster, - updatedTargetCluster, - storeDefWithReplication, - rebalanceClient, - Arrays.asList(0, 1)); - - Thread.sleep(500); - rebalancingToken.set(true); - checkConsistentMetadata(targetCluster, serverList); - } catch(Exception e) { - exceptions.add(e); - } finally { - // stop servers as soon as the client thread has exited its - // loop. 
- try { - latch.await(300, TimeUnit.SECONDS); - stopServer(serverList); - } catch(Exception e) { - throw new RuntimeException(e); - } - } - } - }); - - executors.shutdown(); - executors.awaitTermination(300, TimeUnit.SECONDS); - - // check No Exception - if(exceptions.size() > 0) { - for(Exception e: exceptions) { - e.printStackTrace(); - } - fail("Should not see any exceptions !!"); - } - } - - protected void populateData(Cluster cluster, - StoreDefinition storeDef, - AdminClient adminClient, - boolean isReadOnly) throws Exception { - - // Populate Read write stores - if(!isReadOnly) { - // Create SocketStores for each Node first - Map> storeMap = new HashMap>(); - for(Node node: cluster.getNodes()) { - storeMap.put(node.getId(), - getSocketStore(storeDef.getName(), - node.getHost(), - node.getSocketPort())); - - } - - RoutingStrategy routing = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, - cluster); - for(Entry entry: testEntries.entrySet()) { - ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(entry.getKey(), "UTF-8")); - List preferenceNodes = NodeUtils.getNodeIds(routing.routeRequest(keyBytes.get())); - - // Go over every node - for(int nodeId: preferenceNodes) { - try { - storeMap.get(nodeId) - .put(keyBytes, - new Versioned(ByteUtils.getBytes(entry.getValue(), - "UTF-8")), - null); - } catch(ObsoleteVersionException e) { - System.out.println("Why are we seeing this at all here ?? "); - e.printStackTrace(); - } - } - } - - // close all socket stores - for(Store store: storeMap.values()) { - store.close(); - } - - } else { - // Populate Read only stores - - File baseDir = TestUtils.createTempDir(); - JsonReader reader = ReadOnlyStorageEngineTestInstance.makeTestDataReader(testEntries, - baseDir); - - RoutingStrategy router = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, - cluster); - - File outputDir = TestUtils.createTempDir(baseDir); - JsonStoreBuilder storeBuilder = new JsonStoreBuilder(reader, - cluster, - storeDef, - router, - outputDir, - null, - testEntries.size() / 5, - 1, - NUM_RO_CHUNKS_PER_BUCKET, - 10000, - false); - storeBuilder.build(ReadOnlyStorageFormat.READONLY_V2); - - AdminStoreSwapper swapper = new AdminStoreSwapper(cluster, - Executors.newFixedThreadPool(cluster.getNumberOfNodes()), - adminClient, - 100000); - swapper.swapStoreData(testStoreNameRO, outputDir.getAbsolutePath(), 1L); - } - } - protected String getBootstrapUrl(Cluster cluster, int nodeId) { Node node = cluster.getNodeById(nodeId); return "tcp://" + node.getHost() + ":" + node.getSocketPort(); } - private void rebalanceAndCheck(Cluster currentCluster, - Cluster targetCluster, - List storeDefs, - RebalanceController rebalanceClient, - List nodeCheckList) { + protected void rebalanceAndCheck(Cluster currentCluster, + Cluster targetCluster, + List storeDefs, + RebalanceController rebalanceClient, + List nodeCheckList) { rebalanceClient.rebalance(targetCluster); + checkEntriesPostRebalance(currentCluster, + targetCluster, + storeDefs, + nodeCheckList, + testEntries, + null); + } + protected void checkEntriesPostRebalance(Cluster currentCluster, + Cluster targetCluster, + List storeDefs, + List nodeCheckList, + HashMap baselineTuples, + HashMap baselineVersions) { for(StoreDefinition storeDef: storeDefs) { Map>> currentNodeToPartitionTuples = RebalanceUtils.getNodeIdToAllPartitions(currentCluster, storeDef, @@ -1197,27 +219,30 @@ private void rebalanceAndCheck(Cluster currentCluster, targetCluster, storeDef, store, - flattenedPresentTuples); + flattenedPresentTuples, + 
baselineTuples, + baselineVersions); } } - } - private void checkGetEntries(Node node, - Cluster cluster, - StoreDefinition def, - Store store, - HashMap> flattenedPresentTuples) { + protected void checkGetEntries(Node node, + Cluster cluster, + StoreDefinition def, + Store store, + HashMap> flattenedPresentTuples, + HashMap baselineTuples, + HashMap baselineVersions) { RoutingStrategy routing = new RoutingStrategyFactory().updateRoutingStrategy(def, cluster); - for(Entry entry: testEntries.entrySet()) { + for(Entry entry: baselineTuples.entrySet()) { ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(entry.getKey(), "UTF-8")); List partitions = routing.getPartitionList(keyBytes.get()); - if(StoreInstance.checkKeyBelongsToPartition(partitions, - node.getPartitionIds(), - flattenedPresentTuples)) { + if(StoreRoutingPlan.checkKeyBelongsToPartition(partitions, + node.getPartitionIds(), + flattenedPresentTuples)) { List> values = store.get(keyBytes, null); // expecting exactly one version @@ -1227,8 +252,18 @@ private void checkGetEntries(Node node, } assertEquals("Expecting exactly one version", 1, values.size()); Versioned value = values.get(0); - // check version matches (expecting base version for all) - assertEquals("Value version should match", new VectorClock(), value.getVersion()); + // check version matches + if(baselineVersions == null) { + // expecting base version for all + assertEquals("Value version should match", + new VectorClock(), + value.getVersion()); + } else { + assertEquals("Value version should match", + baselineVersions.get(entry.getKey()), + value.getVersion()); + } + // check value matches. assertEquals("Value bytes should match", entry.getValue(), @@ -1236,6 +271,123 @@ private void checkGetEntries(Node node, } } + } + + protected List sampleKeysFromPartition(AdminClient admin, + int serverId, + String store, + List partitionsToSample, + int numSamples) { + List samples = new ArrayList(numSamples); + Iterator keys = admin.bulkFetchOps.fetchKeys(serverId, + store, + partitionsToSample, + null, + false); + int count = 0; + while(keys.hasNext() && count < numSamples) { + samples.add(keys.next()); + count++; + } + return samples; + } + + /** + * REFACTOR: these should belong in AdminClient so existence checks can be + * done easily across the board + * + * @param admin + * @param serverId + * @param store + * @param keyList + */ + protected void checkForKeyExistence(AdminClient admin, + int serverId, + String store, + List keyList) { + // do the positive tests + Iterator positiveTestResultsItr = admin.streamingOps.queryKeys(serverId, + store, + keyList.iterator()); + while(positiveTestResultsItr.hasNext()) { + QueryKeyResult item = positiveTestResultsItr.next(); + ByteArray key = item.getKey(); + List> vals = item.getValues(); + Exception e = item.getException(); + + assertEquals("Error fetching key " + key, null, e); + assertEquals("Value not found for key " + key, true, vals != null && vals.size() != 0); + + } + } + + /** + * REFACTOR: these should belong in AdminClient so existence checks can be + * done easily across the board + * + * @param admin + * @param serverId + * @param store + * @param keyList + */ + protected void checkForTupleEquivalence(AdminClient admin, + int serverId, + String store, + List keyList, + HashMap baselineTuples, + HashMap baselineVersions) { + // do the positive tests + Iterator positiveTestResultsItr = admin.streamingOps.queryKeys(serverId, + store, + keyList.iterator()); + while(positiveTestResultsItr.hasNext()) { + QueryKeyResult item =
positiveTestResultsItr.next(); + ByteArray key = item.getKey(); + List> vals = item.getValues(); + Exception e = item.getException(); + + assertEquals("Error fetching key " + key, null, e); + assertEquals("Value not found for key " + key, true, vals != null && vals.size() != 0); + + String keyStr = ByteUtils.getString(key.get(), "UTF-8"); + if(baselineTuples != null) + assertEquals("Value does not match up ", + baselineTuples.get(keyStr), + ByteUtils.getString(vals.get(0).getValue(), "UTF-8")); + if(baselineVersions != null) + assertEquals("Version does not match up", + baselineVersions.get(keyStr), + vals.get(0).getVersion()); + } + } + /** + * REFACTOR: these should belong in AdminClient so existence checks can be + * done easily across the board + * + * @param admin + * @param serverId + * @param store + * @param keyList + */ + protected void checkForKeyNonExistence(AdminClient admin, + int serverId, + String store, + List keyList) { + Iterator negativeTestResultsItr = admin.streamingOps.queryKeys(serverId, + store, + keyList.iterator()); + while(negativeTestResultsItr.hasNext()) { + QueryKeyResult item = negativeTestResultsItr.next(); + ByteArray key = item.getKey(); + List> vals = item.getValues(); + Exception e = item.getException(); + + assertEquals("Error fetching key " + key, null, e); + assertEquals("Value " + vals + " found for key " + key, + true, + vals == null || vals.size() == 0); + + } } -} \ No newline at end of file +} diff --git a/test/unit/voldemort/client/rebalance/AbstractZonedRebalanceTest.java b/test/unit/voldemort/client/rebalance/AbstractZonedRebalanceTest.java new file mode 100644 index 0000000000..56ef9688f6 --- /dev/null +++ b/test/unit/voldemort/client/rebalance/AbstractZonedRebalanceTest.java @@ -0,0 +1,839 @@ +/* + * Copyright 2008-2012 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License.
+ */ + +package voldemort.client.rebalance; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotSame; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.client.ClientConfig; +import voldemort.client.DefaultStoreClient; +import voldemort.client.RoutingTier; +import voldemort.client.SocketStoreClientFactory; +import voldemort.client.StoreClient; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.routing.RoutingStrategyType; +import voldemort.routing.StoreRoutingPlan; +import voldemort.serialization.SerializerDefinition; +import voldemort.server.VoldemortServer; +import voldemort.store.InvalidMetadataException; +import voldemort.store.Store; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.bdb.BdbStorageConfiguration; +import voldemort.store.metadata.MetadataStore; +import voldemort.store.metadata.MetadataStore.VoldemortState; +import voldemort.store.slop.strategy.HintedHandoffStrategyType; +import voldemort.utils.ByteArray; +import voldemort.utils.ByteUtils; +import voldemort.utils.RebalanceUtils; +import voldemort.versioning.ClockEntry; +import voldemort.versioning.ObsoleteVersionException; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; +import voldemort.xml.StoreDefinitionsMapper; + +import com.google.common.collect.Lists; + +/** + * Rebalancing tests for zoned configurations with cross zone moves (since + * {@link AbstractNonZonedRebalanceTest} already covers intra zone moves). + * + */ +public abstract class AbstractZonedRebalanceTest extends AbstractRebalanceTest { + + private static final Logger logger = Logger.getLogger(AbstractZonedRebalanceTest.class.getName()); + + protected static String testStoreNameRW = "test"; + protected static String testStoreNameRW2 = "test2"; + + protected static String storeDefFileWithoutReplication; + protected static String storeDefFileWithReplication; + protected static String rwStoreDefFileWithReplication; + protected static String rwTwoStoreDefFileWithReplication; + + private List storeDefWithoutReplication; + private List storeDefWithReplication; + private StoreDefinition rwStoreDefWithoutReplication; + private StoreDefinition rwStoreDefWithReplication; + private StoreDefinition rwStoreDefWithReplication2; + + public AbstractZonedRebalanceTest(boolean useNio, boolean useDonorBased) { + super(useNio, useDonorBased); + } + + @Before + public void setUp() throws IOException { + // First without replication + HashMap zrfRWStoreWithoutReplication = new HashMap(); + zrfRWStoreWithoutReplication.put(0, 1); + 
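// one replica in each zone; as a note on the zoned config, the per-zone + // counts here are expected to sum to the overall replication factor (2) + // set on the store definition below +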
zrfRWStoreWithoutReplication.put(1, 1); + rwStoreDefWithoutReplication = new StoreDefinitionBuilder().setName(testStoreNameRW) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.ZONE_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .setZoneCountReads(0) + .setZoneCountWrites(0) + .setZoneReplicationFactor(zrfRWStoreWithoutReplication) + .setHintedHandoffStrategy(HintedHandoffStrategyType.PROXIMITY_STRATEGY) + .build(); + + storeDefWithoutReplication = Lists.newArrayList(rwStoreDefWithoutReplication); + String storeDefWithoutReplicationString = new StoreDefinitionsMapper().writeStoreList(storeDefWithoutReplication); + File file = File.createTempFile("two-stores-", ".xml"); + FileUtils.writeStringToFile(file, storeDefWithoutReplicationString); + storeDefFileWithoutReplication = file.getAbsolutePath(); + + // Now with replication + HashMap zrfRWStoreWithReplication = new HashMap(); + zrfRWStoreWithReplication.put(0, 2); + zrfRWStoreWithReplication.put(1, 2); + rwStoreDefWithReplication = new StoreDefinitionBuilder().setName(testStoreNameRW) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.ZONE_STRATEGY) + .setReplicationFactor(4) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .setZoneCountReads(0) + .setZoneCountWrites(0) + .setZoneReplicationFactor(zrfRWStoreWithReplication) + .setHintedHandoffStrategy(HintedHandoffStrategyType.PROXIMITY_STRATEGY) + .build(); + rwStoreDefWithReplication2 = new StoreDefinitionBuilder().setName(testStoreNameRW2) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.ZONE_STRATEGY) + .setReplicationFactor(4) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .setZoneCountReads(0) + .setZoneCountWrites(0) + .setZoneReplicationFactor(zrfRWStoreWithReplication) + .setHintedHandoffStrategy(HintedHandoffStrategyType.PROXIMITY_STRATEGY) + .build(); + + file = File.createTempFile("rw-stores-", ".xml"); + FileUtils.writeStringToFile(file, + new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(rwStoreDefWithReplication))); + rwStoreDefFileWithReplication = file.getAbsolutePath(); + + file = File.createTempFile("rw-two-stores-", ".xml"); + FileUtils.writeStringToFile(file, + new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(rwStoreDefWithReplication, + rwStoreDefWithReplication2))); + rwTwoStoreDefFileWithReplication = file.getAbsolutePath(); + + storeDefWithReplication = Lists.newArrayList(rwStoreDefWithReplication); + String storeDefWithReplicationString = new StoreDefinitionsMapper().writeStoreList(storeDefWithReplication); + file = File.createTempFile("two-stores-", ".xml"); + FileUtils.writeStringToFile(file, storeDefWithReplicationString); + storeDefFileWithReplication = file.getAbsolutePath(); + } + + @After + public void tearDown() { + testEntries.clear(); + testEntries = null; + 
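// closing the socket store factory below releases the pooled client + // connections it holds, so state does not leak across tests +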
socketStoreFactory.close(); + socketStoreFactory = null; + } + + @Test(timeout = 600000) + public void testRWRebalance() throws Exception { + logger.info("Starting testRWRebalance"); + try { + + Cluster currentCluster = ServerTestUtils.getLocalZonedCluster(4, 2, new int[] { 0, 0, + 1, 1 }, new int[][] { { 0, 2, 4, 6 }, {}, { 1, 3, 5, 7 }, {} }); + Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 3, + Lists.newArrayList(2, 6)); + targetCluster = RebalanceUtils.createUpdatedCluster(targetCluster, + 1, + Lists.newArrayList(3, 7)); + + // start all the servers + List serverList = Arrays.asList(0, 1, 2, 3); + Map configProps = new HashMap(); + configProps.put("admin.max.threads", "5"); + currentCluster = startServers(currentCluster, + storeDefFileWithoutReplication, + serverList, + configProps); + // Update the cluster information based on the node information + targetCluster = updateCluster(targetCluster); + + RebalanceClientConfig config = new RebalanceClientConfig(); + config.setDeleteAfterRebalancingEnabled(true); + config.setStealerBasedRebalancing(!useDonorBased); + RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, + 0), + config); + try { + populateData(currentCluster, rwStoreDefWithoutReplication); + + rebalanceAndCheck(currentCluster, + targetCluster, + storeDefWithoutReplication, + rebalanceClient, + Arrays.asList(1, 2)); + + checkConsistentMetadata(targetCluster, serverList); + } finally { + // stop servers + stopServer(serverList); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testRWRebalance ", ae); + throw ae; + } + } + + public void testRWRebalanceWithReplication(boolean serial) throws Exception { + logger.info("Starting testRWRebalanceWithReplication"); + + Cluster currentCluster = ServerTestUtils.getLocalZonedCluster(4, + 2, + new int[] { 0, 0, 1, 1 }, + new int[][] { { 0, 2, 4 }, + { 6 }, { 1, 3, 5 }, + { 7 } }); + Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 3, + Lists.newArrayList(2)); + targetCluster = RebalanceUtils.createUpdatedCluster(targetCluster, 1, Lists.newArrayList(3)); + + // start servers + List serverList = Arrays.asList(0, 1, 2, 3); + Map configProps = new HashMap(); + configProps.put("admin.max.threads", "5"); + if(serial) + configProps.put("max.parallel.stores.rebalancing", String.valueOf(1)); + currentCluster = startServers(currentCluster, + storeDefFileWithReplication, + serverList, + configProps); + // Update the cluster information based on the node information + targetCluster = updateCluster(targetCluster); + + RebalanceClientConfig config = new RebalanceClientConfig(); + config.setDeleteAfterRebalancingEnabled(true); + config.setStealerBasedRebalancing(!useDonorBased); + config.setPrimaryPartitionBatchSize(100); + config.setMaxParallelRebalancing(5); + RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, + 0), + config); + try { + + populateData(currentCluster, rwStoreDefWithReplication); + + rebalanceAndCheck(currentCluster, + targetCluster, + storeDefWithReplication, + rebalanceClient, + Arrays.asList(0, 1, 2, 3)); + checkConsistentMetadata(targetCluster, serverList); + } finally { + // stop servers + stopServer(serverList); + } + } + + @Test(timeout = 600000) + public void testRWRebalanceWithReplication() throws Exception { + try { + testRWRebalanceWithReplication(false); + } catch(AssertionError ae) { + logger.error("Assertion broken in testRWRebalanceWithReplication ", ae); + 
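// rethrow so JUnit still records the assertion failure after it is logged +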
throw ae; + } + } + + @Test(timeout = 600000) + public void testRWRebalanceWithReplicationSerial() throws Exception { + try { + testRWRebalanceWithReplication(true); + } catch(AssertionError ae) { + logger.error("Assertion broken in testRWRebalanceWithReplicationSerial ", ae); + throw ae; + } + } + + @Test(timeout = 600000) + public void testRebalanceCleanPrimarySecondary() throws Exception { + logger.info("Starting testRebalanceCleanPrimarySecondary"); + try { + Cluster currentCluster = ServerTestUtils.getLocalZonedCluster(6, 2, new int[] { 0, 0, + 0, 1, 1, 1 }, new int[][] { { 0 }, { 1, 6 }, { 2 }, { 3 }, { 4, 7 }, { 5 } }); + Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 2, + Lists.newArrayList(7)); + targetCluster = RebalanceUtils.createUpdatedCluster(targetCluster, + 5, + Lists.newArrayList(6)); + + /** + * original server partition ownership + * + * [s0 : p0,p3,p4,p5,p6,p7] [s1 : p1-p7] [s2 : p1,p2] [s3 : + * p0,p1,p2,p3,p6,p7] [s4 : p1-p7] [s5 : p4,p5] + * + * target server partition ownership + * + * [s0 : p0,p2,p3,p4,p5,p6,p7] [s1 : p0,p1] [s2 : p1-p7] [s3 : + * p0,p1,p2,p3,p5,p6,p7] [s4 : p0,p1,p2,p3,p4,p7] [s5 : p4,p5,p6] + */ + + // start servers + List serverList = Arrays.asList(0, 1, 2, 3, 4, 5); + Map configProps = new HashMap(); + configProps.put("enable.repair", "true"); + currentCluster = startServers(currentCluster, + rwStoreDefFileWithReplication, + serverList, + configProps); + // Update the cluster information based on the node information + targetCluster = updateCluster(targetCluster); + + RebalanceClientConfig config = new RebalanceClientConfig(); + config.setDeleteAfterRebalancingEnabled(false); + config.setStealerBasedRebalancing(!useDonorBased); + RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(currentCluster, + 0), + config); + try { + populateData(currentCluster, rwStoreDefWithReplication); + + AdminClient admin = rebalanceClient.getAdminClient(); + + List p6KeySamples = sampleKeysFromPartition(admin, + 1, + rwStoreDefWithReplication.getName(), + Arrays.asList(6), + 20); + List p1KeySamples = sampleKeysFromPartition(admin, + 1, + rwStoreDefWithReplication.getName(), + Arrays.asList(1), + 20); + List p3KeySamples = sampleKeysFromPartition(admin, + 0, + rwStoreDefWithReplication.getName(), + Arrays.asList(3), + 20); + List p2KeySamples = sampleKeysFromPartition(admin, + 1, + rwStoreDefWithReplication.getName(), + Arrays.asList(2), + 20); + List p7KeySamples = sampleKeysFromPartition(admin, + 4, + rwStoreDefWithReplication.getName(), + Arrays.asList(7), + 20); + + rebalanceAndCheck(currentCluster, + targetCluster, + Lists.newArrayList(rwStoreDefWithReplication), + rebalanceClient, + Arrays.asList(0, 1, 2, 3)); + checkConsistentMetadata(targetCluster, serverList); + + // Do the cleanup operation + for(int i = 0; i < 6; i++) { + admin.storeMntOps.repairJob(i); + } + // wait for the repairs to complete + for(int i = 0; i < 6; i++) { + ServerTestUtils.waitForAsyncOperationOnServer(serverMap.get(i), "Repair", 5000); + } + + // confirm a primary movement in zone 0 : p6 : s1 -> s2. The + // zone 0 + // primary changes when p6 moves cross zone + // check for existence of p6 in server 2, + checkForKeyExistence(admin, 2, rwStoreDefWithReplication.getName(), p6KeySamples); + // also check for p6 absence in server 1. + checkForKeyNonExistence(admin, 1, rwStoreDefWithReplication.getName(), p6KeySamples); + + // confirm a secondary movement in zone 0..
p2 : s1 -> s0 + // check for its existence in server 0 + checkForKeyExistence(admin, 0, rwStoreDefWithReplication.getName(), p2KeySamples); + // check for its absence in server 1 + checkForKeyNonExistence(admin, 1, rwStoreDefWithReplication.getName(), p2KeySamples); + + // also check that p1 is stable in server 1 [primary stability] + checkForKeyExistence(admin, 1, rwStoreDefWithReplication.getName(), p1KeySamples); + // check that p3 is stable in server 0 [secondary stability] + checkForKeyExistence(admin, 0, rwStoreDefWithReplication.getName(), p3KeySamples); + + // finally, test for server 4 which now became the secondary for + // p7 after being a primary before + checkForKeyExistence(admin, 4, rwStoreDefWithReplication.getName(), p7KeySamples); + } finally { + // stop servers + stopServer(serverList); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testRebalanceCleanPrimarySecondary ", ae); + throw ae; + } + } + + @Test(timeout = 600000) + public void testProxyGetDuringRebalancing() throws Exception { + logger.info("Starting testProxyGetDuringRebalancing"); + try { + Cluster currentCluster = ServerTestUtils.getLocalZonedCluster(4, 2, new int[] { 0, 0, + 1, 1 }, new int[][] { { 0, 2, 4 }, { 6 }, { 1, 3, 5 }, { 7 } }); + Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 3, + Lists.newArrayList(2)); + targetCluster = RebalanceUtils.createUpdatedCluster(targetCluster, + 1, + Lists.newArrayList(3)); + + final List serverList = Arrays.asList(0, 1, 2, 3); + Map configProps = new HashMap(); + configProps.put("admin.max.threads", "5"); + final Cluster updatedCurrentCluster = startServers(currentCluster, + storeDefFileWithReplication, + serverList, + configProps); + // Update the cluster information based on the node information + final Cluster updatedTargetCluster = updateCluster(targetCluster); + + ExecutorService executors = Executors.newFixedThreadPool(2); + final AtomicBoolean rebalancingComplete = new AtomicBoolean(false); + final List exceptions = Collections.synchronizedList(new ArrayList()); + + RebalanceClientConfig rebalanceClientConfig = new RebalanceClientConfig(); + rebalanceClientConfig.setMaxParallelRebalancing(2); + // Again, forced to use stealer based since RO does not support donor + // based rebalancing yet. + rebalanceClientConfig.setStealerBasedRebalancing(true); + + final RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(updatedCurrentCluster, + 0), + rebalanceClientConfig); + try { + + populateData(currentCluster, rwStoreDefWithReplication); + + final SocketStoreClientFactory factory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(getBootstrapUrl(currentCluster, + 0)) + .setEnableLazy(false) + .setSocketTimeout(120, + TimeUnit.SECONDS)); + + final StoreClient storeClientRW = new DefaultStoreClient(rwStoreDefWithReplication.getName(), + null, + factory, + 3); + + final CountDownLatch latch = new CountDownLatch(2); + // start get operation. + executors.execute(new Runnable() { + + @Override + public void run() { + try { + List keys = new ArrayList(testEntries.keySet()); + + while(!rebalancingComplete.get()) { + // should always be able to get values.
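+ // (a stealer that does not yet host the key proxies the get + // back to its donor, so these reads are expected to keep + // succeeding while partitions are in flight)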
+ int index = (int) (Math.random() * keys.size()); + + // should get a valid value + try { + Versioned value = storeClientRW.get(keys.get(index)); + assertNotSame("StoreClient get() should not return null.", + null, + value); + assertEquals("Value returned should be good", + new Versioned(testEntries.get(keys.get(index))), + value); + } catch(Exception e) { + logger.error("Exception in proxy get thread", e); + e.printStackTrace(); + exceptions.add(e); + } + } + + } catch(Exception e) { + logger.error("Exception in proxy get thread", e); + exceptions.add(e); + } finally { + factory.close(); + latch.countDown(); + } + } + + }); + + executors.execute(new Runnable() { + + @Override + public void run() { + try { + + Thread.sleep(500); + rebalanceAndCheck(updatedCurrentCluster, + updatedTargetCluster, + storeDefWithReplication, + rebalanceClient, + Arrays.asList(0, 1, 2, 3)); + Thread.sleep(500); + rebalancingComplete.set(true); + checkConsistentMetadata(updatedTargetCluster, serverList); + + } catch(Exception e) { + exceptions.add(e); + } finally { + // stop servers + try { + stopServer(serverList); + } catch(Exception e) { + throw new RuntimeException(e); + } + latch.countDown(); + } + } + }); + + latch.await(); + executors.shutdown(); + executors.awaitTermination(300, TimeUnit.SECONDS); + + // check No Exception + if(exceptions.size() > 0) { + for(Exception e: exceptions) { + e.printStackTrace(); + } + fail("Should not see any exceptions."); + } + } finally { + // stop servers + stopServer(serverList); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testProxyGetDuringRebalancing ", ae); + throw ae; + } + } + + @Test(timeout = 600000) + public void testProxyPutDuringRebalancing() throws Exception { + logger.info("Starting testProxyPutDuringRebalancing"); + try { + Cluster currentCluster = ServerTestUtils.getLocalZonedCluster(6, 2, new int[] { 0, 0, + 0, 1, 1, 1 }, new int[][] { { 0 }, { 1, 6 }, { 2 }, { 3 }, { 4, 7 }, { 5 } }); + Cluster targetCluster = RebalanceUtils.createUpdatedCluster(currentCluster, + 2, + Lists.newArrayList(7)); + targetCluster = RebalanceUtils.createUpdatedCluster(targetCluster, + 5, + Lists.newArrayList(6)); + + /** + * original server partition ownership + * + * [s0 : p0,p3,p4,p5,p6,p7] [s1 : p1-p7] [s2 : p1,p2] [s3 : + * p0,p1,p2,p3,p6,p7] [s4 : p1-p7] [s5 : p4,p5] + * + * target server partition ownership + * + * [s0 : p0,p2,p3,p4,p5,p6,p7] [s1 : p0,p1] [s2 : p1-p7] [s3 : + * p0,p1,p2,p3,p5,p6,p7] [s4 : p0,p1,p2,p3,p4,p7] [s5 : p4,p5,p6] + */ + List serverList = Arrays.asList(0, 1, 2, 3, 4, 5); + Map configProps = new HashMap(); + configProps.put("admin.max.threads", "5"); + final Cluster updatedCurrentCluster = startServers(currentCluster, + rwStoreDefFileWithReplication, + serverList, + configProps); + // Update the cluster information based on the node information + final Cluster updatedTargetCluster = updateCluster(targetCluster); + + ExecutorService executors = Executors.newFixedThreadPool(2); + final AtomicBoolean rebalancingComplete = new AtomicBoolean(false); + final List exceptions = Collections.synchronizedList(new ArrayList()); + + RebalanceClientConfig rebalanceClientConfig = new RebalanceClientConfig(); + rebalanceClientConfig.setMaxParallelRebalancing(2); + // It is imperative that we test in a single shot since multiple + // batches would mean the proxy bridges being torn down and + // established multiple times and we cannot test against the source + // cluster topology then.
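+ // (Integer.MAX_VALUE below forces all primary partition moves into a + // single batch, keeping one proxy bridge to the source topology up for + // the whole test)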
+ rebalanceClientConfig.setPrimaryPartitionBatchSize(Integer.MAX_VALUE); + rebalanceClientConfig.setStealerBasedRebalancing(!useDonorBased); + + final RebalanceController rebalanceClient = new RebalanceController(getBootstrapUrl(updatedCurrentCluster, + 0), + rebalanceClientConfig); + + populateData(currentCluster, rwStoreDefWithReplication); + final AdminClient adminClient = rebalanceClient.getAdminClient(); + // the plan would cause the following cross zone move : + // partition p6 (PRI) : donor 1 -> stealer 5 + final List movingKeysList = sampleKeysFromPartition(adminClient, + 1, + rwStoreDefWithReplication.getName(), + Arrays.asList(6), + 20); + assertTrue("Empty list of moving keys...", movingKeysList.size() > 0); + final AtomicBoolean rebalancingStarted = new AtomicBoolean(false); + final AtomicBoolean proxyWritesDone = new AtomicBoolean(false); + final HashMap baselineTuples = new HashMap(testEntries); + final HashMap baselineVersions = new HashMap(); + + for(String key: baselineTuples.keySet()) { + baselineVersions.put(key, new VectorClock()); + } + + final CountDownLatch latch = new CountDownLatch(2); + // start proxy put operation. + executors.execute(new Runnable() { + + @Override + public void run() { + SocketStoreClientFactory factory = null; + try { + // wait for the rebalancing to begin + List serverList = Lists.newArrayList(serverMap.get(4), + serverMap.get(2), + serverMap.get(3), + serverMap.get(5)); + while(!rebalancingComplete.get()) { + Iterator serverIterator = serverList.iterator(); + while(serverIterator.hasNext()) { + VoldemortServer server = serverIterator.next(); + if(ByteUtils.getString(server.getMetadataStore() + .get(MetadataStore.SERVER_STATE_KEY, + null) + .get(0) + .getValue(), + "UTF-8") + .compareTo(VoldemortState.REBALANCING_MASTER_SERVER.toString()) == 0) { + logger.info("Server " + server.getIdentityNode().getId() + + " transitioned into REBALANCING MODE"); + serverIterator.remove(); + } + } + if(serverList.size() == 0) { + rebalancingStarted.set(true); + break; + } + } + + if(rebalancingStarted.get()) { + factory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(getBootstrapUrl(updatedCurrentCluster, + 0)) + .setEnableLazy(false) + .setSocketTimeout(120, + TimeUnit.SECONDS) + .setClientZoneId(1)); + + final StoreClient storeClientRW = new DefaultStoreClient(testStoreNameRW, + null, + factory, + 3); + // Now perform some writes and determine the end + // state of the changed keys. Initially, all data + // starts with a zero vector clock + for(ByteArray movingKey: movingKeysList) { + try { + String keyStr = ByteUtils.getString(movingKey.get(), "UTF-8"); + String valStr = "proxy_write"; + storeClientRW.put(keyStr, valStr); + baselineTuples.put(keyStr, valStr); + // all these keys will have a [5:1] vector + // clock as node 5 is the new pseudo master + baselineVersions.get(keyStr) + .incrementVersion(5, System.currentTimeMillis()); + proxyWritesDone.set(true); + if(rebalancingComplete.get()) { + break; + } + } catch(InvalidMetadataException e) { + // let this go + logger.error("Encountered an invalid metadata exception.. ", e); + } + } + } + } catch(Exception e) { + logger.error("Exception in proxy write thread..", e); + exceptions.add(e); + } finally { + if(factory != null) + factory.close(); + latch.countDown(); + } + } + + }); + + executors.execute(new Runnable() { + + @Override + public void run() { + try { + rebalanceClient.rebalance(updatedTargetCluster); + } catch(Exception e) { + logger.error("Error in rebalancing... 
", e); + exceptions.add(e); + } finally { + rebalancingComplete.set(true); + latch.countDown(); + } + } + }); + + latch.await(); + executors.shutdown(); + executors.awaitTermination(300, TimeUnit.SECONDS); + + assertEquals("Client did not see all servers transition into rebalancing state", + rebalancingStarted.get(), + true); + assertEquals("Not enough time to begin proxy writing", proxyWritesDone.get(), true); + checkEntriesPostRebalance(updatedCurrentCluster, + updatedTargetCluster, + Lists.newArrayList(rwStoreDefWithReplication), + Arrays.asList(0, 1, 2, 3, 4, 5), + baselineTuples, + baselineVersions); + checkConsistentMetadata(updatedTargetCluster, serverList); + // check No Exception + if(exceptions.size() > 0) { + for(Exception e: exceptions) { + e.printStackTrace(); + } + fail("Should not see any exceptions."); + } + // check that the proxy writes were made to the original donor, node + // 1 + List clockEntries = new ArrayList(serverList.size()); + for(Integer nodeid: serverList) + clockEntries.add(new ClockEntry(nodeid.shortValue(), System.currentTimeMillis())); + VectorClock clusterXmlClock = new VectorClock(clockEntries, System.currentTimeMillis()); + for(Integer nodeid: serverList) + adminClient.metadataMgmtOps.updateRemoteCluster(nodeid, + currentCluster, + clusterXmlClock); + adminClient.setAdminClientCluster(currentCluster); + checkForTupleEquivalence(adminClient, + 1, + testStoreNameRW, + movingKeysList, + baselineTuples, + baselineVersions); + + // stop servers + try { + stopServer(serverList); + } catch(Exception e) { + throw new RuntimeException(e); + } + } catch(AssertionError ae) { + logger.error("Assertion broken in testProxyPutDuringRebalancing ", ae); + throw ae; + } + } + + protected void populateData(Cluster cluster, StoreDefinition storeDef) throws Exception { + + // Create SocketStores for each Node first + Map> storeMap = new HashMap>(); + for(Node node: cluster.getNodes()) { + storeMap.put(node.getId(), + getSocketStore(storeDef.getName(), node.getHost(), node.getSocketPort())); + } + + RoutingStrategy routing = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, + cluster); + StoreRoutingPlan storeInstance = new StoreRoutingPlan(cluster, storeDef); + for(Entry entry: testEntries.entrySet()) { + ByteArray keyBytes = new ByteArray(ByteUtils.getBytes(entry.getKey(), "UTF-8")); + List preferenceNodes = storeInstance.getReplicationNodeList(keyBytes.get()); + // Go over every node + for(int nodeId: preferenceNodes) { + try { + storeMap.get(nodeId) + .put(keyBytes, + new Versioned(ByteUtils.getBytes(entry.getValue(), "UTF-8")), + null); + } catch(ObsoleteVersionException e) { + logger.info("Why are we seeing this at all here ?? 
"); + e.printStackTrace(); + } + } + } + + // close all socket stores + for(Store store: storeMap.values()) { + store.close(); + } + } +} diff --git a/test/unit/voldemort/client/rebalance/AdminRebalanceTest.java b/test/unit/voldemort/client/rebalance/AdminRebalanceTest.java index 2f97eb0e4e..5aa98637dc 100644 --- a/test/unit/voldemort/client/rebalance/AdminRebalanceTest.java +++ b/test/unit/voldemort/client/rebalance/AdminRebalanceTest.java @@ -200,7 +200,7 @@ public void startFourNodeRO() throws IOException { .setType(ReadOnlyStorageConfiguration.TYPE_NAME) .setKeySerializer(new SerializerDefinition("string")) .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setReplicationFactor(2) .setPreferredReads(1) @@ -212,7 +212,7 @@ public void startFourNodeRO() throws IOException { .setType(ReadOnlyStorageConfiguration.TYPE_NAME) .setKeySerializer(new SerializerDefinition("string")) .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setReplicationFactor(3) .setPreferredReads(1) @@ -252,7 +252,7 @@ public void startFourNodeRORW() throws IOException { .setType(ReadOnlyStorageConfiguration.TYPE_NAME) .setKeySerializer(new SerializerDefinition("string")) .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setReplicationFactor(2) .setPreferredReads(1) @@ -264,7 +264,7 @@ public void startFourNodeRORW() throws IOException { .setType(ReadOnlyStorageConfiguration.TYPE_NAME) .setKeySerializer(new SerializerDefinition("string")) .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setReplicationFactor(3) .setPreferredReads(1) @@ -399,6 +399,9 @@ public void testRebalanceNodeRW() throws IOException { getServer(partitionPlan.getStealerId()).getMetadataStore() .put(MetadataStore.SERVER_STATE_KEY, MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER); + getServer(partitionPlan.getStealerId()).getMetadataStore() + .put(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, + partitionPlan.getInitialCluster()); } try { @@ -515,7 +518,7 @@ public void testRebalanceNodeRW() throws IOException { for(VoldemortServer server: servers) { assertEquals(server.getMetadataStore().getRebalancerState(), new RebalancerState(new ArrayList())); - assertEquals(server.getMetadataStore().getServerState(), + assertEquals(server.getMetadataStore().getServerStateUnlocked(), MetadataStore.VoldemortState.NORMAL_SERVER); } } finally { @@ -571,6 +574,9 @@ public void testRebalanceNodeRW2() throws IOException { getServer(partitionPlan.getStealerId()).getMetadataStore() .put(MetadataStore.REBALANCING_STEAL_INFO, new RebalancerState(Lists.newArrayList(RebalancePartitionsInfo.create(partitionPlan.toJsonString())))); + getServer(partitionPlan.getStealerId()).getMetadataStore() + .put(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, + partitionPlan.getInitialCluster()); } // Update the cluster metadata on all three nodes @@ -719,7 +725,7 @@ public void testRebalanceNodeRW2() throws IOException { for(VoldemortServer server: servers) { 
assertEquals(server.getMetadataStore().getRebalancerState(), new RebalancerState(new ArrayList())); - assertEquals(server.getMetadataStore().getServerState(), + assertEquals(server.getMetadataStore().getServerStateUnlocked(), MetadataStore.VoldemortState.NORMAL_SERVER); } } finally { @@ -745,6 +751,9 @@ public void testRebalanceNodeRO() throws IOException { getServer(partitionPlan.getStealerId()).getMetadataStore() .put(MetadataStore.REBALANCING_STEAL_INFO, new RebalancerState(Lists.newArrayList(RebalancePartitionsInfo.create(partitionPlan.toJsonString())))); + getServer(partitionPlan.getStealerId()).getMetadataStore() + .put(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, + partitionPlan.getInitialCluster()); } // Actually run it @@ -801,7 +810,7 @@ public void testRebalanceNodeRO() throws IOException { for(VoldemortServer server: servers) { assertEquals(server.getMetadataStore().getRebalancerState(), new RebalancerState(new ArrayList())); - assertEquals(server.getMetadataStore().getServerState(), + assertEquals(server.getMetadataStore().getServerStateUnlocked(), MetadataStore.VoldemortState.NORMAL_SERVER); } @@ -815,7 +824,7 @@ public void testRebalanceNodeRO() throws IOException { .setType(ReadOnlyStorageConfiguration.TYPE_NAME) .setKeySerializer(new SerializerDefinition("string")) .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setReplicationFactor(2) .setPreferredReads(1) @@ -825,8 +834,14 @@ public void testRebalanceNodeRO() throws IOException { .build())); try { + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, true, true, @@ -844,8 +859,14 @@ public void testRebalanceNodeRO() throws IOException { checkRO(cluster); // Test 2) All passes scenario + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, true, true, @@ -899,6 +920,9 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { getServer(partitionPlan.getStealerId()).getMetadataStore() .put(MetadataStore.REBALANCING_STEAL_INFO, new RebalancerState(Lists.newArrayList(RebalancePartitionsInfo.create(partitionPlan.toJsonString())))); + getServer(partitionPlan.getStealerId()).getMetadataStore() + .put(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, + partitionPlan.getInitialCluster()); } // Actually run it @@ -936,8 +960,14 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { 0)); try { + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, true, true, @@ -953,7 +983,7 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { if(server.getMetadataStore().getNodeId() != 3) { assertEquals(server.getMetadataStore().getRebalancerState(), new RebalancerState(new ArrayList())); - assertEquals(server.getMetadataStore().getServerState(), + assertEquals(server.getMetadataStore().getServerStateUnlocked(), MetadataStore.VoldemortState.NORMAL_SERVER); } 
assertEquals(server.getMetadataStore().getCluster(), cluster); @@ -974,7 +1004,7 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { .setType(ReadOnlyStorageConfiguration.TYPE_NAME) .setKeySerializer(new SerializerDefinition("string")) .setValueSerializer(new SerializerDefinition("string")) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) .setReplicationFactor(2) .setPreferredReads(1) @@ -984,8 +1014,14 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { .build())); try { + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, false, true, @@ -1000,7 +1036,7 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { for(VoldemortServer server: servers) { assertEquals(server.getMetadataStore().getRebalancerState(), new RebalancerState(new ArrayList())); - assertEquals(server.getMetadataStore().getServerState(), + assertEquals(server.getMetadataStore().getServerStateUnlocked(), MetadataStore.VoldemortState.NORMAL_SERVER); assertEquals(server.getMetadataStore().getCluster(), cluster); } @@ -1017,8 +1053,15 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { storeDef4)); // Test 3) Everything should work + + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, true, true, @@ -1031,7 +1074,8 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { nodesChecked.add(plan.getStealerId()); assertEquals(servers[plan.getStealerId()].getMetadataStore().getRebalancerState(), new RebalancerState(Lists.newArrayList(plan))); - assertEquals(servers[plan.getStealerId()].getMetadataStore().getServerState(), + assertEquals(servers[plan.getStealerId()].getMetadataStore() + .getServerStateUnlocked(), MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER); assertEquals(servers[plan.getStealerId()].getMetadataStore().getCluster(), targetCluster); @@ -1044,7 +1088,7 @@ public void testRebalanceNodeRORW() throws IOException, InterruptedException { for(int nodeId: allNodes) { assertEquals(servers[nodeId].getMetadataStore().getRebalancerState(), new RebalancerState(new ArrayList())); - assertEquals(servers[nodeId].getMetadataStore().getServerState(), + assertEquals(servers[nodeId].getMetadataStore().getServerStateUnlocked(), MetadataStore.VoldemortState.NORMAL_SERVER); assertEquals(servers[nodeId].getMetadataStore().getCluster(), targetCluster); } @@ -1088,8 +1132,14 @@ public void testRebalanceStateChange() throws IOException { startFourNodeRW(); // Test 1) Normal case where-in all are up + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, false, false, @@ -1128,8 +1178,14 @@ public void testRebalanceStateChange() throws IOException { 0)); try { + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + 
servers[2].getMetadataStore() + .getStoreDefList(), plans, false, false, @@ -1156,8 +1212,14 @@ public void testRebalanceStateChange() throws IOException { servers[3] = null; try { + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, false, false, @@ -1187,8 +1249,14 @@ public void testClusterAndRebalanceStateChange() throws IOException { startFourNodeRW(); // Test 1) Normal case where-in all are up + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, false, true, @@ -1230,8 +1298,14 @@ public void testClusterAndRebalanceStateChange() throws IOException { 0)); try { + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, false, true, @@ -1259,8 +1333,14 @@ public void testClusterAndRebalanceStateChange() throws IOException { servers[3] = null; try { + // TODO pass the target storedefs + // ATTENTION JAY adminClient.rebalanceOps.rebalanceStateChange(cluster, targetCluster, + servers[2].getMetadataStore() + .getStoreDefList(), + servers[2].getMetadataStore() + .getStoreDefList(), plans, false, true, diff --git a/test/unit/voldemort/client/rebalance/RebalanceMetadataConsistencyTest.java b/test/unit/voldemort/client/rebalance/RebalanceMetadataConsistencyTest.java new file mode 100644 index 0000000000..44fe387282 --- /dev/null +++ b/test/unit/voldemort/client/rebalance/RebalanceMetadataConsistencyTest.java @@ -0,0 +1,199 @@ +package voldemort.client.rebalance; + +import java.lang.reflect.Method; +import java.util.List; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.client.RoutingTier; +import voldemort.cluster.Cluster; +import voldemort.routing.RoutingStrategyType; +import voldemort.serialization.SerializerDefinition; +import voldemort.server.rebalance.Rebalancer; +import voldemort.store.Store; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.bdb.BdbStorageConfiguration; +import voldemort.store.memory.InMemoryStorageEngine; +import voldemort.store.metadata.MetadataStore; +import voldemort.versioning.Versioned; +import voldemort.xml.ClusterMapper; +import voldemort.xml.StoreDefinitionsMapper; + +import com.google.common.collect.Lists; + +/** + * + * This test checks whether we can interleave writes to cluster and store + * metadata. We spawn a writer and a reader to see if we ever observe an + * inconsistent state of the system. + * + */ +public class RebalanceMetadataConsistencyTest { + + private MetadataStore metadataStore; + private Cluster currentCluster; + private Cluster targetCluster; + + protected static String testStoreNameRW = "test"; + protected static String testStoreNameRW2 = "test2"; + + private StoreDefinition rwStoreDefWithReplication; + private StoreDefinition rwStoreDefWithReplication2; + private Rebalancer rebalancer; + + Cluster checkCluster; + List checkstores; + + /** + * Convenience method to execute private methods from other classes.
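+ * Used here to call Rebalancer.changeClusterAndStores, which is private.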
+ * + * @param test Instance of the class we want to test + * @param methodName Name of the method we want to test + * @param params Arguments we want to pass to the method + * @return Object with the result of the executed method + * @throws Exception + */ + public static Object invokePrivateMethod(Object test, String methodName, Object params[]) + throws Exception { + Object ret = null; + + final Method[] methods = test.getClass().getDeclaredMethods(); + for(int i = 0; i < methods.length; ++i) { + if(methods[i].getName().equals(methodName)) { + methods[i].setAccessible(true); + ret = methods[i].invoke(test, params); + break; + } + } + + return ret; + } + + @Before + public void setUp() { + + currentCluster = ServerTestUtils.getLocalCluster(3, new int[][] { { 0 }, { 1, 3 }, { 2 } }); + + targetCluster = ServerTestUtils.getLocalCluster(3, + new int[][] { { 0 }, { 1 }, { 2 }, { 3 } }); + + rwStoreDefWithReplication = new StoreDefinitionBuilder().setName(testStoreNameRW) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + Store innerStore = new InMemoryStorageEngine("inner-store"); + innerStore.put(MetadataStore.CLUSTER_KEY, + new Versioned(new ClusterMapper().writeCluster(currentCluster)), + null); + innerStore.put(MetadataStore.STORES_KEY, + new Versioned(new StoreDefinitionsMapper().writeStoreList(Lists.newArrayList(rwStoreDefWithReplication))), + null); + + rwStoreDefWithReplication2 = new StoreDefinitionBuilder().setName(testStoreNameRW2) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + + metadataStore = new MetadataStore(innerStore, 0); + rebalancer = new Rebalancer(null, metadataStore, null, null); + + } + + @Test + public void testThreading() { + + for(int i = 0; i < 3000; i++) { + + Cluster cluster; + StoreDefinition storeDef; + if((i % 2) == 0) { + cluster = currentCluster; + storeDef = rwStoreDefWithReplication; + + } else { + cluster = targetCluster; + storeDef = rwStoreDefWithReplication2; + + } + ThreadWriter tw = new ThreadWriter(cluster, storeDef); + Thread writer = new Thread(tw); + writer.start(); + ThreadReader tr = new ThreadReader(); + + Thread reader = new Thread(tr); + reader.start(); + } + + } + + class ThreadWriter implements Runnable { + + Cluster cluster; + StoreDefinition storeDef; + + ThreadWriter(Cluster cluster, StoreDefinition storeDef) { + + this.cluster = cluster; + this.storeDef = storeDef; + } + + @Override + public void run() { + + test(); + + } + + public void test() { + Object[] params = { MetadataStore.CLUSTER_KEY, this.cluster, MetadataStore.STORES_KEY, + Lists.newArrayList(this.storeDef) }; + try { + invokePrivateMethod(rebalancer, "changeClusterAndStores", params); + } catch(Exception e) { + + e.printStackTrace(); + } + } + + } + + class ThreadReader implements Runnable { + + @Override + public void run() { + + metadataStore.readLock.lock(); + checkCluster 
= metadataStore.getCluster(); + checkstores = metadataStore.getStoreDefList(); + metadataStore.readLock.unlock(); + + if(checkCluster.equals(currentCluster)) { + Assert.assertEquals(checkstores.get(0), rwStoreDefWithReplication); + } + if(checkCluster.equals(targetCluster)) { + Assert.assertEquals(checkstores.get(0), rwStoreDefWithReplication2); + } + } + } + +} diff --git a/test/unit/voldemort/client/rebalance/RebalanceRebootstrapConsistencyTest.java b/test/unit/voldemort/client/rebalance/RebalanceRebootstrapConsistencyTest.java new file mode 100644 index 0000000000..567e3ef3a7 --- /dev/null +++ b/test/unit/voldemort/client/rebalance/RebalanceRebootstrapConsistencyTest.java @@ -0,0 +1,296 @@ +package voldemort.client.rebalance; + +import static org.junit.Assert.fail; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.client.RoutingTier; +import voldemort.client.SystemStore; +import voldemort.client.SystemStoreRepository; +import voldemort.client.protocol.admin.AdminClient; +import voldemort.client.scheduler.AsyncMetadataVersionManager; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.common.service.SchedulerService; +import voldemort.routing.RoutingStrategyType; +import voldemort.serialization.SerializerDefinition; +import voldemort.server.VoldemortConfig; +import voldemort.server.VoldemortServer; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.bdb.BdbStorageConfiguration; +import voldemort.store.metadata.MetadataStore; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.store.system.SystemStoreConstants; +import voldemort.utils.RebalanceUtils; +import voldemort.utils.SystemTime; +import voldemort.versioning.VectorClock; +import voldemort.versioning.Versioned; +import voldemort.xml.ClusterMapper; +import voldemort.xml.StoreDefinitionsMapper; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + +/** + * + * We simulate the rebalance controller here by changing the cluster state and + * stores state + * + * On rebootstrap we want to ensure that the cluster and store defs are + * consistent from a client's perspective + * + */ +public class RebalanceRebootstrapConsistencyTest { + + private Cluster cluster; + private List servers; + + String[] bootStrapUrls = null; + public static String socketUrl = ""; + protected final int CLIENT_ZONE_ID = 0; + + private SystemStore sysVersionStore; + private SystemStoreRepository repository; + private SchedulerService scheduler; + private AsyncMetadataVersionManager asyncCheckMetadata; + private boolean callbackDone = false; + + private StoreDefinition rwStoreDefWithReplication; + private StoreDefinition rwStoreDefWithReplication2; + + protected static String testStoreNameRW = "test"; + protected static String testStoreNameRW2 = "test2"; + + private static final String CLUSTER_VERSION_KEY = "cluster.xml"; + int maxRetries = 0; + + private static final ClusterMapper clusterMapper = new 
ClusterMapper(); + private static final StoreDefinitionsMapper storeMapper = new StoreDefinitionsMapper(); + + AdminClient adminClient; + + List newstoredefs; + Cluster newCluster; + + @Before + public void setUp() throws Exception { + SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(2, + 10000, + 100000, + 32 * 1024); + + int numServers = 2; + + rwStoreDefWithReplication = new StoreDefinitionBuilder().setName(testStoreNameRW) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + + rwStoreDefWithReplication2 = new StoreDefinitionBuilder().setName(testStoreNameRW2) + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + + List storedefs = new ArrayList(); + + storedefs.add(rwStoreDefWithReplication); + + String storesXmlStr = new StoreDefinitionsMapper().writeStoreList(storedefs); + + // create a temp file + File tempStoresXml = File.createTempFile("tempfile", ".tmp"); + + BufferedWriter bw = new BufferedWriter(new FileWriter(tempStoresXml)); + bw.write(storesXmlStr); + bw.close(); + + VoldemortServer[] voldemortServers = new VoldemortServer[numServers]; + int partitionMap[][] = { { 0, 1 }, {} }; + cluster = ServerTestUtils.startVoldemortCluster(numServers, + voldemortServers, + partitionMap, + socketStoreFactory, + false, + null, + tempStoresXml.getAbsolutePath(), + new Properties()); + + servers = Lists.newArrayList(); + for(int i = 0; i < numServers; ++i) { + servers.add(voldemortServers[i]); + } + + socketUrl = voldemortServers[0].getIdentityNode().getSocketUrl().toString(); + + bootStrapUrls = new String[1]; + bootStrapUrls[0] = socketUrl; + sysVersionStore = new SystemStore(SystemStoreConstants.SystemStoreName.voldsys$_metadata_version_persistence.name(), + bootStrapUrls, + this.CLIENT_ZONE_ID); + repository = new SystemStoreRepository(); + repository.addSystemStore(sysVersionStore, + SystemStoreConstants.SystemStoreName.voldsys$_metadata_version_persistence.name()); + this.scheduler = new SchedulerService(2, SystemTime.INSTANCE, true); + + Callable rebootstrapCallback = new Callable() { + + @Override + public Void call() throws Exception { + // callbackForClusterChange(); + checkConsistentMetadata(); + return null; + } + }; + + // Starting the Version Metadata Manager + this.asyncCheckMetadata = new AsyncMetadataVersionManager(this.repository, + rebootstrapCallback, + null); + scheduler.schedule(asyncCheckMetadata.getClass().getName(), + asyncCheckMetadata, + new Date(), + 500); + + // Wait until the Version Manager is active + + while(maxRetries < 3 && !asyncCheckMetadata.isActive) { + Thread.sleep(500); + maxRetries++; + } + + } + + @After + public void tearDown() { + if(servers != null) + for(VoldemortServer server: servers) + server.stop(); + } + + /* + * simulate rebalance behavior + */ + public void rebalance() { + assert servers != null && servers.size() > 1; + + 
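+ // What follows mirrors what the rebalance controller would do: migrate
+ // partitions 0 and 1 from node 0 to node 1, rewrite the cluster and
+ // store definitions on every node under an incremented vector clock,
+ // and finally bump the cluster.xml metadata version so that clients
+ // notice the change and rebootstrap.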
VoldemortConfig config = servers.get(0).getVoldemortConfig(); + adminClient = RebalanceUtils.createTempAdminClient(config, cluster, 4); + HashMap> replicaToPartitionList = Maps.newHashMap(); + replicaToPartitionList.put(0, ImmutableList.of(0, 1)); + int req = adminClient.storeMntOps.migratePartitions(0, + 1, + testStoreNameRW, + replicaToPartitionList, + null, + null, + false); + adminClient.rpcOps.waitForCompletion(1, req, 5, TimeUnit.SECONDS); + Versioned versionedCluster = adminClient.metadataMgmtOps.getRemoteCluster(0); + + Node node0 = versionedCluster.getValue().getNodeById(0); + Node node1 = versionedCluster.getValue().getNodeById(1); + Node newNode0 = new Node(node0.getId(), + node0.getHost(), + node0.getHttpPort(), + node0.getSocketPort(), + node0.getAdminPort(), + ImmutableList. of()); + Node newNode1 = new Node(node1.getId(), + node1.getHost(), + node1.getHttpPort(), + node1.getSocketPort(), + node1.getAdminPort(), + ImmutableList.of(0, 1)); + adminClient.storeMntOps.deletePartitions(0, testStoreNameRW, ImmutableList.of(0, 1), null); + + newCluster = new Cluster(cluster.getName(), + ImmutableList.of(newNode0, newNode1), + Lists.newArrayList(cluster.getZones())); + + newstoredefs = new ArrayList(); + + newstoredefs.add(rwStoreDefWithReplication2); + for(Node node: cluster.getNodes()) { + VectorClock clock = (VectorClock) versionedCluster.getVersion(); + clock.incrementVersion(node.getId(), System.currentTimeMillis()); + + HashMap> keyValueMap = new HashMap>(); + keyValueMap.put(MetadataStore.CLUSTER_KEY, + new Versioned(clusterMapper.writeCluster(newCluster), clock)); + + keyValueMap.put(MetadataStore.STORES_KEY, + new Versioned(storeMapper.writeStoreList(newstoredefs), clock)); + + adminClient.metadataMgmtOps.updateRemoteMetadata(node.getId(), keyValueMap); + } + + adminClient.metadataMgmtOps.updateMetadataversion(CLUSTER_VERSION_KEY); + + } + + @Test + public void testBasicAsyncBehaviour() { + + try { + + rebalance(); + maxRetries = 0; + while(maxRetries < 3 && !callbackDone) { + Thread.sleep(2000); + maxRetries++; + } + + } catch(Exception e) { + e.printStackTrace(); + fail("Failed to start the Metadata Version Manager : " + e.getMessage()); + } + } + + /* + * In callback ensure metadata is consistent + */ + private void checkConsistentMetadata() { + + Versioned versionedCluster = adminClient.metadataMgmtOps.getRemoteCluster(0); + Versioned> versionedStoreDefs = adminClient.metadataMgmtOps.getRemoteStoreDefList(0); + + if(versionedCluster.getValue().equals(newCluster)) { + Assert.assertEquals(versionedStoreDefs.getValue().get(0), rwStoreDefWithReplication2); + } + } + +} diff --git a/test/unit/voldemort/client/rebalance/RebalanceTest.java b/test/unit/voldemort/client/rebalance/RebalanceTest.java index 9b4c445ea6..100e70611b 100644 --- a/test/unit/voldemort/client/rebalance/RebalanceTest.java +++ b/test/unit/voldemort/client/rebalance/RebalanceTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2012 LinkedIn, Inc + * Copyright 2013 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. 
You may obtain a copy of @@ -16,27 +16,13 @@ package voldemort.client.rebalance; -import java.io.IOException; import java.util.Arrays; import java.util.Collection; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Properties; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.junit.runners.Parameterized.Parameters; -import voldemort.ServerTestUtils; -import voldemort.TestUtils; -import voldemort.VoldemortException; -import voldemort.cluster.Cluster; -import voldemort.server.VoldemortConfig; -import voldemort.server.VoldemortServer; -import voldemort.store.metadata.MetadataStore.VoldemortState; - /** * Start VoldemortServer locally using ServerTestUtils and run rebalancing * tests. @@ -44,18 +30,12 @@ * */ @RunWith(Parameterized.class) -public class RebalanceTest extends AbstractRebalanceTest { +public class RebalanceTest extends AbstractNonZonedRebalanceTest { private final int NUM_KEYS = 20; - Map serverMap; - private final boolean useNio; - private final boolean useDonorBased; - public RebalanceTest(boolean useNio, boolean useDonorBased) { - this.useNio = useNio; - this.useDonorBased = useDonorBased; - this.serverMap = new HashMap(); + super(useNio, useDonorBased); } @Parameters @@ -64,79 +44,8 @@ public static Collection configs() { { false, false } }); } - @Override - protected VoldemortState getCurrentState(int nodeId) { - VoldemortServer server = serverMap.get(nodeId); - if(server == null) { - throw new VoldemortException("Node id " + nodeId + " does not exist"); - } else { - return server.getMetadataStore().getServerState(); - } - } - - @Override - protected Cluster getCurrentCluster(int nodeId) { - VoldemortServer server = serverMap.get(nodeId); - if(server == null) { - throw new VoldemortException("Node id " + nodeId + " does not exist"); - } else { - return server.getMetadataStore().getCluster(); - } - } - @Override protected int getNumKeys() { return NUM_KEYS; } - - // This method is susceptible to BindException issues due to TOCTOU - // problem with getLocalCluster (which is used to construct cluster that is - // passed in). - // TODO: Refactor AbstractRebalanceTest to take advantage of - // ServerTestUtils.startVoldemortCluster. 
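The TOCTOU comment being removed above is worth spelling out: ServerTestUtils.getLocalCluster picks free ports before the servers later bind them, and another process can grab a port in between. Below is a self-contained sketch of that race; the class and method names are illustrative only, not part of Voldemort:

    import java.io.IOException;
    import java.net.ServerSocket;

    public class PortRaceSketch {

        // Time of check: ask the OS for a free port, then release it.
        static int findFreePort() throws IOException {
            ServerSocket socket = new ServerSocket(0);
            try {
                return socket.getLocalPort();
            } finally {
                socket.close();
            }
        }

        public static void main(String[] args) throws IOException {
            int port = findFreePort();
            // ... window in which any other process may bind the port ...
            // Time of use: this second bind can fail with a BindException.
            ServerSocket server = new ServerSocket(port);
            System.out.println("Bound " + port + " (won the race this time)");
            server.close();
        }
    }

This is presumably why the TODO suggests moving to ServerTestUtils.startVoldemortCluster, which can manage port assignment and server startup together.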
- @Override - protected Cluster startServers(Cluster cluster, - String storeXmlFile, - List nodeToStart, - Map configProps) throws IOException { - for(int node: nodeToStart) { - Properties properties = new Properties(); - if(null != configProps) { - for(Entry property: configProps.entrySet()) { - properties.put(property.getKey(), property.getValue()); - } - } - - VoldemortConfig config = ServerTestUtils.createServerConfig(useNio, - node, - TestUtils.createTempDir() - .getAbsolutePath(), - null, - storeXmlFile, - properties); - - VoldemortServer server = ServerTestUtils.startVoldemortServer(socketStoreFactory, - config, - cluster); - serverMap.put(node, server); - } - - return cluster; - } - - @Override - protected void stopServer(List nodesToStop) throws IOException { - for(int node: nodesToStop) { - try { - ServerTestUtils.stopVoldemortServer(serverMap.get(node)); - } catch(VoldemortException e) { - // ignore these at stop time - } - } - } - - @Override - protected boolean useDonorBased() { - return this.useDonorBased; - } } diff --git a/test/unit/voldemort/client/rebalance/ZonedRebalanceTest.java b/test/unit/voldemort/client/rebalance/ZonedRebalanceTest.java new file mode 100644 index 0000000000..d698a2331f --- /dev/null +++ b/test/unit/voldemort/client/rebalance/ZonedRebalanceTest.java @@ -0,0 +1,44 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package voldemort.client.rebalance; + +import java.util.Arrays; +import java.util.Collection; + +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(Parameterized.class) +public class ZonedRebalanceTest extends AbstractZonedRebalanceTest { + + private final int NUM_KEYS = 100; + + public ZonedRebalanceTest(boolean useNio, boolean useDonorBased) { + super(useNio, useDonorBased); + } + + @Parameters + public static Collection configs() { + return Arrays.asList(new Object[][] { { true, true }, { true, false }, { false, true }, + { false, false } }); + } + + @Override + protected int getNumKeys() { + return NUM_KEYS; + } +} diff --git a/test/unit/voldemort/coordinator/CoordinatorRestAPITest.java b/test/unit/voldemort/coordinator/CoordinatorRestAPITest.java new file mode 100644 index 0000000000..17ce89b76e --- /dev/null +++ b/test/unit/voldemort/coordinator/CoordinatorRestAPITest.java @@ -0,0 +1,334 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.coordinator; + +import static org.junit.Assert.fail; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.apache.commons.codec.binary.Base64; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.server.VoldemortServer; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.versioning.VectorClock; + +public class CoordinatorRestAPITest { + + private VoldemortServer[] servers; + public static String socketUrl = ""; + private static final String STORE_NAME = "test"; + private static final String STORES_XML = "test/common/voldemort/config/single-store.xml"; + private static final String FAT_CLIENT_CONFIG_FILE_PATH = "test/common/voldemort/config/fat-client-config.avro"; + private final SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(2, + 10000, + 100000, + 32 * 1024); + private CoordinatorService coordinator = null; + private final String coordinatorURL = "http://localhost:8080"; + + private class TestVersionedValue { + + private String value; + private VectorClock vc; + + public TestVersionedValue(String val, VectorClock vc) { + this.setValue(val); + this.setVc(vc); + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + + public VectorClock getVc() { + return vc; + } + + public void setVc(VectorClock vc) { + this.vc = vc; + } + } + + @Before + public void setUp() throws Exception { + int numServers = 1; + servers = new VoldemortServer[numServers]; + int partitionMap[][] = { { 0, 2, 4, 6, 1, 3, 5, 7 } }; + Properties props = new Properties(); + props.setProperty("storage.configs", + "voldemort.store.bdb.BdbStorageConfiguration,voldemort.store.slow.SlowStorageConfiguration"); + + ServerTestUtils.startVoldemortCluster(numServers, + servers, + partitionMap, + socketStoreFactory, + true, // useNio + null, + STORES_XML, + props); + + CoordinatorConfig config = new CoordinatorConfig(); + List bootstrapUrls = new ArrayList(); + socketUrl = servers[0].getIdentityNode().getSocketUrl().toString(); + bootstrapUrls.add(socketUrl); + + System.out.println("\n\n************************ Starting the Coordinator *************************"); + + config.setBootstrapURLs(bootstrapUrls); + config.setFatClientConfigPath(FAT_CLIENT_CONFIG_FILE_PATH); + + this.coordinator = new CoordinatorService(config); + if(!this.coordinator.isStarted()) { + this.coordinator.start(); + } + } + + @After + public void tearDown() throws Exception { + if(this.socketStoreFactory != null) { + this.socketStoreFactory.close(); + } + + if(this.coordinator != null && this.coordinator.isStarted()) { + this.coordinator.stop(); + } + } + + private VectorClock doPut(String key, String payload, VectorClock vc) { + VectorClock successfulPutVC = null; + try { + // Create the right URL and Http connection + HttpURLConnection conn = null; + String base64Key = new String(Base64.encodeBase64(key.getBytes())); + URL url = new URL(this.coordinatorURL + "/" + STORE_NAME + "/" + base64Key); + conn = (HttpURLConnection) url.openConnection(); + + // Set the right headers + conn.setRequestMethod("POST"); + conn.setDoOutput(true); + conn.setDoInput(true); + 
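+ // The coordinator's REST contract as exercised by this test: the raw
+ // value goes in the request body, Content-Type and Content-Length
+ // describe it, the per-request timeout rides in the
+ // X-VOLD-Request-Timeout-ms header, and for versioned puts the ETag
+ // header carries the serialized vector clock.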
conn.setRequestProperty("Content-Type", "binary"); + conn.setRequestProperty("Content-Length", "" + payload.length()); + conn.setRequestProperty(VoldemortHttpRequestHandler.X_VOLD_REQUEST_TIMEOUT_MS, "1000"); + + if(vc != null) { + String eTag = CoordinatorUtils.getSerializedVectorClock(vc); + conn.setRequestProperty("ETag", eTag); + } + + // Write the payload + OutputStream out = conn.getOutputStream(); + out.write(payload.getBytes()); + out.close(); + + // Check for the right response code + if(conn.getResponseCode() != 201) { + System.err.println("Illegal response during PUT : " + conn.getResponseMessage()); + fail("Incorrect response received for a HTTP put request :" + + conn.getResponseCode()); + } + + } catch(Exception e) { + e.printStackTrace(); + fail("Error in sending the REST request"); + } + + return successfulPutVC; + } + + private boolean doDelete(String key) { + try { + + // Create the right URL and Http connection + HttpURLConnection conn = null; + String base64Key = new String(Base64.encodeBase64(key.getBytes())); + URL url = new URL(this.coordinatorURL + "/" + STORE_NAME + "/" + base64Key); + conn = (HttpURLConnection) url.openConnection(); + + // Set the right headers + conn.setRequestMethod("DELETE"); + conn.setDoInput(true); + conn.setRequestProperty(VoldemortHttpRequestHandler.X_VOLD_REQUEST_TIMEOUT_MS, "1000"); + + // Check for the right response code + if(conn.getResponseCode() != 204) { + System.err.println("Illegal response during DELETE : " + conn.getResponseMessage()); + fail("Incorrect response received for a HTTP put request :" + + conn.getResponseCode()); + } else { + return true; + } + + } catch(Exception e) { + e.printStackTrace(); + fail("Error in sending the REST request"); + } + + return false; + } + + private TestVersionedValue doGet(String key) { + String response = null; + TestVersionedValue responseObj = null; + try { + + // Create the right URL and Http connection + HttpURLConnection conn = null; + String base64Key = new String(Base64.encodeBase64(key.getBytes())); + URL url = new URL(this.coordinatorURL + "/" + STORE_NAME + "/" + base64Key); + conn = (HttpURLConnection) url.openConnection(); + + // Set the right headers + conn.setRequestMethod("GET"); + conn.setDoInput(true); + conn.setRequestProperty(VoldemortHttpRequestHandler.X_VOLD_REQUEST_TIMEOUT_MS, "1000"); + + if(conn.getResponseCode() == 404) { + return null; + } + + // Check for the right response code + if(conn.getResponseCode() != 200) { + System.err.println("Illegal response during GET : " + conn.getResponseMessage()); + fail("Incorrect response received for a HTTP put request :" + + conn.getResponseCode()); + } + + // Buffer the result into a string + BufferedReader rd = new BufferedReader(new InputStreamReader(conn.getInputStream())); + StringBuilder sb = new StringBuilder(); + String line; + while((line = rd.readLine()) != null) { + sb.append(line); + } + rd.close(); + + conn.disconnect(); + + response = sb.toString(); + VectorClock vc = CoordinatorUtils.deserializeVectorClock(conn.getHeaderField("ETag")); + responseObj = new TestVersionedValue(response, vc); + + } catch(Exception e) { + e.printStackTrace(); + fail("Error in sending the REST request"); + } + + return responseObj; + } + + @Test + public void testReadAfterWrite() { + String key = "Which_Imperial_IPA_do_I_want_to_drink"; + String payload = "Pliny the Younger"; + + // 1. Do a put + doPut(key, payload, null); + + // 2. 
Do a get on the same key + TestVersionedValue response = doGet(key); + if(response == null) { + fail("key does not exist after a put. "); + } + + System.out.println("Received value: " + response.getValue()); + if(!response.getValue().equals(payload)) { + fail("Received value is incorrect ! Expected : " + payload + " but got : " + + response.getValue()); + } + } + + @Test + public void testDelete() { + String key = "Which_sour_beer_do_I_want_to_drink"; + String payload = "Duchesse De Bourgogne"; + + // 1. Do a put + doPut(key, payload, null); + + // 2. Do a get on the same key + TestVersionedValue response = doGet(key); + if(response == null) { + fail("key does not exist after a put. "); + } + System.out.println("Received value: " + response.getValue()); + if(!response.getValue().equals(payload)) { + fail("Received value is incorrect ! Expected : " + payload + " but got : " + + response.getValue()); + } + + // 3. Do a delete + boolean isDeleted = doDelete(key); + if(!isDeleted) { + fail("Could not delete the key. Error !"); + } + + // 4. Do a get on the same key : this should fail + response = doGet(key); + if(response != null) { + fail("key still exists after deletion. "); + } + } + + @Test + public void testVersionedPut() { + String key = "Which_Porter_do_I_want_to_drink"; + String payload = "Founders Porter"; + String newPayload = "Samuel Smith Taddy Porter"; + + // 1. Do a put + doPut(key, payload, null); + + // 2. Do a get on the same key + TestVersionedValue response = doGet(key); + if(response == null) { + fail("key does not exist after a put. "); + } + System.out.println("Received value: " + response.getValue()); + + // 3. Do a versioned put based on the version received previously + doPut(key, newPayload, response.getVc()); + + // 4. Do a get again on the same key + TestVersionedValue newResponse = doGet(key); + if(newResponse == null) { + fail("key does not exist after the versioned put. "); + } + + System.out.println("Received value after the Versioned put: " + newResponse.getValue()); + if(!newResponse.getValue().equals(newPayload)) { + fail("Received value is incorrect ! Expected : " + newPayload + " but got : " + + newResponse.getValue()); + } + } +} diff --git a/test/unit/voldemort/coordinator/DynamicTimeoutStoreClientTest.java b/test/unit/voldemort/coordinator/DynamicTimeoutStoreClientTest.java new file mode 100644 index 0000000000..863a1e67cc --- /dev/null +++ b/test/unit/voldemort/coordinator/DynamicTimeoutStoreClientTest.java @@ -0,0 +1,161 @@ +/* + * Copyright 2013 LinkedIn, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package voldemort.coordinator; + +import static org.junit.Assert.fail; + +import java.io.File; +import java.util.Properties; + +import org.apache.commons.io.FileUtils; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.client.ClientConfig; +import voldemort.client.SocketStoreClientFactory; +import voldemort.cluster.Cluster; +import voldemort.server.VoldemortServer; +import voldemort.store.CompositeGetVoldemortRequest; +import voldemort.store.CompositePutVoldemortRequest; +import voldemort.store.InsufficientOperationalNodesException; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.utils.ByteArray; +import voldemort.versioning.Versioned; +import voldemort.xml.ClusterMapper; + +/** + * Class to test the Fat Client wrapper + */ +public class DynamicTimeoutStoreClientTest { + + private VoldemortServer[] servers; + private Cluster cluster; + public static String socketUrl = ""; + private static final String STORE_NAME = "slow-store-test"; + private static final String STORES_XML = "test/common/voldemort/config/single-slow-store.xml"; + private static final String SLOW_STORE_DELAY = "500"; + private final SocketStoreFactory socketStoreFactory = new ClientRequestExecutorPool(2, + 10000, + 100000, + 32 * 1024); + private DynamicTimeoutStoreClient dynamicTimeoutClient = null; + + /** + * Setup a one node Voldemort cluster with a 'slow' store + * (SlowStorageEngine) with a delay of 500 ms for get and put. + * + * @throws java.lang.Exception + */ + @Before + public void setUp() throws Exception { + int numServers = 1; + servers = new VoldemortServer[numServers]; + int partitionMap[][] = { { 0, 2, 4, 6, 1, 3, 5, 7 } }; + Properties props = new Properties(); + props.setProperty("storage.configs", + "voldemort.store.bdb.BdbStorageConfiguration,voldemort.store.slow.SlowStorageConfiguration"); + props.setProperty("testing.slow.queueing.get.ms", SLOW_STORE_DELAY); + props.setProperty("testing.slow.queueing.put.ms", SLOW_STORE_DELAY); + + cluster = ServerTestUtils.startVoldemortCluster(numServers, + servers, + partitionMap, + socketStoreFactory, + true, // useNio + null, + STORES_XML, + props); + + socketUrl = servers[0].getIdentityNode().getSocketUrl().toString(); + String bootstrapUrl = socketUrl; + ClientConfig clientConfig = new ClientConfig().setBootstrapUrls(bootstrapUrl) + .setEnableCompressionLayer(false) + .setEnableSerializationLayer(false) + .enableDefaultClient(true) + .setEnableLazy(false); + + String storesXml = FileUtils.readFileToString(new File(STORES_XML), "UTF-8"); + ClusterMapper mapper = new ClusterMapper(); + + this.dynamicTimeoutClient = new DynamicTimeoutStoreClient(STORE_NAME, + new SocketStoreClientFactory(clientConfig), + 1, + storesXml, + mapper.writeCluster(cluster)); + } + + /** + * @throws java.lang.Exception + */ + @After + public void tearDown() throws Exception { + if(this.socketStoreFactory != null) { + this.socketStoreFactory.close(); + } + } + + /** + * Test the dynamic per call timeout. We do a regular put with the default + * configured timeout. We then do a put with a dynamic timeout of 200 ms + * which is less than the delay at the server side. After this we do a get + * with a dynamic timeout of 1500 ms which should succeed and return the + * value from the first put. 
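+ *
+ * In other words, the 200 ms budget is below the 500 ms delay injected by
+ * the slow store, so the put must fail, while the 1500 ms budget leaves
+ * roughly a second of headroom for the get to succeed.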
+ */ + @Test + public void test() { + long incorrectTimeout = 200; + long correctTimeout = 1500; + String key = "a"; + String value = "First"; + String newValue = "Second"; + + try { + this.dynamicTimeoutClient.put(new ByteArray(key.getBytes()), value.getBytes()); + } catch(Exception e) { + fail("Error in regular put."); + } + + long startTime = System.currentTimeMillis(); + try { + this.dynamicTimeoutClient.putWithCustomTimeout(new CompositePutVoldemortRequest(new ByteArray(key.getBytes()), + newValue.getBytes(), + incorrectTimeout)); + fail("Should not reach this point. The small (incorrect) timeout did not work."); + } catch(InsufficientOperationalNodesException ion) { + System.out.println("This failed as Expected."); + } + + try { + Versioned versionedValue = this.dynamicTimeoutClient.getWithCustomTimeout(new CompositeGetVoldemortRequest(new ByteArray(key.getBytes()), + correctTimeout, + true)); + long endTime = System.currentTimeMillis(); + System.out.println("Total time taken = " + (endTime - startTime)); + String response = new String(versionedValue.getValue()); + if(!response.equals(value)) { + fail("The returned value does not match. Expected: " + value + " but Received: " + + response); + } + } catch(Exception e) { + e.printStackTrace(); + fail("The dynamic per call timeout did not work !"); + } + } +} diff --git a/test/unit/voldemort/routing/StoreRoutingPlanTest.java b/test/unit/voldemort/routing/StoreRoutingPlanTest.java new file mode 100644 index 0000000000..2d35d99bc4 --- /dev/null +++ b/test/unit/voldemort/routing/StoreRoutingPlanTest.java @@ -0,0 +1,163 @@ +package voldemort.routing; + +import static org.junit.Assert.assertEquals; + +import java.util.HashMap; +import java.util.List; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import voldemort.ServerTestUtils; +import voldemort.TestUtils; +import voldemort.client.RoutingTier; +import voldemort.cluster.Cluster; +import voldemort.cluster.Zone; +import voldemort.serialization.SerializerDefinition; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.bdb.BdbStorageConfiguration; +import voldemort.store.slop.strategy.HintedHandoffStrategyType; + +import com.google.common.collect.Lists; + +public class StoreRoutingPlanTest { + + StoreRoutingPlan zonedRoutingPlan; + StoreRoutingPlan nonZonedRoutingPlan; + + public StoreRoutingPlanTest() {} + + @Before + public void setup() { + Cluster nonZonedCluster = ServerTestUtils.getLocalCluster(3, new int[] { 1000, 2000, 3000, + 1000, 2000, 3000, 1000, 2000, 3000 }, new int[][] { { 0 }, { 1, 3 }, { 2 } }); + StoreDefinition nonZoned211StoreDef = new StoreDefinitionBuilder().setName("non-zoned") + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(2) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .build(); + nonZonedRoutingPlan = new StoreRoutingPlan(nonZonedCluster, nonZoned211StoreDef); + + int[] dummyZonedPorts = new int[] { 1000, 2000, 3000, 1000, 2000, 3000, 1000, 2000, 3000, + 1000, 2000, 3000, 1000, 2000, 3000, 1000, 2000, 3000 }; + Cluster zonedCluster = ServerTestUtils.getLocalZonedCluster(6, + 2, + new int[] { 0, 0, 0, 1, 1, 1 }, + new int[][] { { 0 }, { 1, 6 }, + { 2 }, { 3 }, { 4, 7 }, + { 5 } }, + 
dummyZonedPorts); + HashMap zrfRWStoreWithReplication = new HashMap(); + zrfRWStoreWithReplication.put(0, 2); + zrfRWStoreWithReplication.put(1, 2); + StoreDefinition zoned211StoreDef = new StoreDefinitionBuilder().setName("zoned") + .setType(BdbStorageConfiguration.TYPE_NAME) + .setKeySerializer(new SerializerDefinition("string")) + .setValueSerializer(new SerializerDefinition("string")) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.ZONE_STRATEGY) + .setReplicationFactor(4) + .setPreferredReads(1) + .setRequiredReads(1) + .setPreferredWrites(1) + .setRequiredWrites(1) + .setZoneCountReads(0) + .setZoneCountWrites(0) + .setZoneReplicationFactor(zrfRWStoreWithReplication) + .setHintedHandoffStrategy(HintedHandoffStrategyType.PROXIMITY_STRATEGY) + .build(); + zonedRoutingPlan = new StoreRoutingPlan(zonedCluster, zoned211StoreDef); + } + + @Test + public void testZonedStoreRoutingPlan() { + HashMap> samplePartitionKeysMap = TestUtils.createPartitionsKeys(zonedRoutingPlan, + 1); + assertEquals("Node 1 does not contain p5?", + (Integer) 6, + zonedRoutingPlan.getNodesPartitionIdForKey(1, samplePartitionKeysMap.get(5) + .get(0))); + assertEquals("Node 4 does not contain p5?", + (Integer) 7, + zonedRoutingPlan.getNodesPartitionIdForKey(4, samplePartitionKeysMap.get(5) + .get(0))); + assertEquals("Replication list does not match up", + Lists.newArrayList(0, 1, 3, 4), + zonedRoutingPlan.getReplicationNodeList(0)); + + assertEquals("Zone replica type should be 1", + 1, + zonedRoutingPlan.getZoneReplicaType(0, 0, samplePartitionKeysMap.get(6).get(0))); + assertEquals("Zone replica type should be 0", + 0, + zonedRoutingPlan.getZoneReplicaType(0, 1, samplePartitionKeysMap.get(6).get(0))); + assertEquals("Zone replica type should be 1", + 1, + zonedRoutingPlan.getZoneReplicaType(1, 3, samplePartitionKeysMap.get(7).get(0))); + assertEquals("Zone replica type should be 0", + 0, + zonedRoutingPlan.getZoneReplicaType(1, 4, samplePartitionKeysMap.get(7).get(0))); + + assertEquals("Replica owner should be 1", + 1, + zonedRoutingPlan.getZoneReplicaNode(0, 1, samplePartitionKeysMap.get(2).get(0))); + assertEquals("Replica owner should be 1", + 1, + zonedRoutingPlan.getZoneReplicaNode(0, 0, samplePartitionKeysMap.get(3).get(0))); + assertEquals("Replica owner should be 4", + 4, + zonedRoutingPlan.getZoneReplicaNode(1, 1, samplePartitionKeysMap.get(1).get(0))); + assertEquals("Replica owner should be 3", + 3, + zonedRoutingPlan.getZoneReplicaNode(1, 0, samplePartitionKeysMap.get(2).get(0))); + } + + @Test + public void testNonZonedStoreRoutingPlan() { + HashMap> samplePartitionKeysMap = TestUtils.createPartitionsKeys(nonZonedRoutingPlan, + 1); + + assertEquals("Node 1 does not contain p2 as secondary?", + (Integer) 3, + nonZonedRoutingPlan.getNodesPartitionIdForKey(1, samplePartitionKeysMap.get(2) + .get(0))); + assertEquals("Replication list does not match up", + Lists.newArrayList(1, 2), + nonZonedRoutingPlan.getReplicationNodeList(1)); + + assertEquals("Zone replica type should be 1", + 1, + nonZonedRoutingPlan.getZoneReplicaType(Zone.DEFAULT_ZONE_ID, + 2, + samplePartitionKeysMap.get(1).get(0))); + assertEquals("Zone replica type should be 0", + 0, + nonZonedRoutingPlan.getZoneReplicaType(Zone.DEFAULT_ZONE_ID, + 1, + samplePartitionKeysMap.get(3).get(0))); + assertEquals("Replica owner should be 2", + 2, + nonZonedRoutingPlan.getZoneReplicaNode(Zone.DEFAULT_ZONE_ID, + 1, + samplePartitionKeysMap.get(1).get(0))); + assertEquals("Replica owner should be 1", + 1, + 
nonZonedRoutingPlan.getZoneReplicaNode(Zone.DEFAULT_ZONE_ID,
+ 0,
+ samplePartitionKeysMap.get(3).get(0)));
+ }
+
+ @After
+ public void teardown() {
+
+ }
+}
diff --git a/test/unit/voldemort/store/AbstractStorageEngineTest.java b/test/unit/voldemort/store/AbstractStorageEngineTest.java
index 9e21b8c0cf..ebcc30382e 100644
--- a/test/unit/voldemort/store/AbstractStorageEngineTest.java
+++ b/test/unit/voldemort/store/AbstractStorageEngineTest.java
@@ -16,10 +16,13 @@
 package voldemort.store;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+
+import org.junit.Test;
+
 import voldemort.TestUtils;
 import voldemort.serialization.StringSerializer;
 import voldemort.store.serialized.SerializingStorageEngine;
@@ -146,6 +149,89 @@ public void testTruncate() throws Exception {
 }
 }
+ @Test
+ public void testMultiVersionPuts() {
+
+ StorageEngine<ByteArray, byte[], byte[]> store = getStorageEngine();
+
+ try {
+ // Insert with concurrent versions
+ ByteArray key = new ByteArray("mvpKey1".getBytes());
+ List<Versioned<byte[]>> vals = new ArrayList<Versioned<byte[]>>();
+ vals.add(TestUtils.getVersioned("val1".getBytes(), 1));
+ vals.add(TestUtils.getVersioned("val2".getBytes(), 2));
+ vals.add(TestUtils.getVersioned("val3".getBytes(), 3));
+ List<Versioned<byte[]>> obsoletes = store.multiVersionPut(key, vals);
+ assertTrue("Should not be any rejected versions..", obsoletes.size() == 0);
+ assertEquals("Should have all 3 versions stored", 3, store.get(key, null).size());
+ assertTrue("All concurrent versions expected",
+ TestUtils.areVersionedListsEqual(vals, store.get(key, null)));
+ List<Versioned<byte[]>> saveVals = vals;
+
+ // Insert with some concurrent and some obsolete versions
+ key = new ByteArray("mvpKey2".getBytes());
+ vals = new ArrayList<Versioned<byte[]>>();
+ vals.add(TestUtils.getVersioned("val1".getBytes(), 1));
+ vals.add(TestUtils.getVersioned("val2".getBytes(), 2));
+ vals.add(TestUtils.getVersioned("val3".getBytes(), 1, 1));
+ obsoletes = store.multiVersionPut(key, vals);
+ assertTrue("Should not be any obsolete versions..", obsoletes.size() == 0);
+ assertEquals("Should have 2 versions stored, with 1:2 superseding 1:1",
+ 2,
+ store.get(key, null).size());
+ vals.remove(0);
+ assertTrue("Should have 2 versions stored, with 1:2 superseding 1:1",
+ TestUtils.areVersionedListsEqual(vals, store.get(key, null)));
+
+ // Update of concurrent versions, on top of concurrent versions
+ key = new ByteArray("mvpKey1".getBytes());
+ vals = new ArrayList<Versioned<byte[]>>();
+ vals.add(TestUtils.getVersioned("val4".getBytes(), 4));
+ vals.add(TestUtils.getVersioned("val5".getBytes(), 5));
+ vals.add(TestUtils.getVersioned("val6".getBytes(), 6));
+ obsoletes = store.multiVersionPut(key, vals);
+ assertTrue("Should not be any rejected versions..", obsoletes.size() == 0);
+ assertEquals("Should have all 6 versions stored", 6, store.get(key, null).size());
+ vals.addAll(saveVals);
+ assertTrue("All 6 concurrent versions expected",
+ TestUtils.areVersionedListsEqual(vals, store.get(key, null)));
+ saveVals = vals;
+
+ // Update of some obsolete versions, on top of concurrent versions
+ key = new ByteArray("mvpKey1".getBytes());
+ vals = new ArrayList<Versioned<byte[]>>();
+ // one obsolete version
+ Versioned<byte[]> obsoleteVersion = TestUtils.getVersioned("val4-obsolete".getBytes(),
+ 4);
+ vals.add(obsoleteVersion);
+ // one new concurrent version
+ vals.add(TestUtils.getVersioned("val7".getBytes(), 7));
+ obsoletes = store.multiVersionPut(key, vals);
+ assertTrue("Should be one version rejected..", obsoletes.size() == 1);
+ assertEquals("Obsolete's version should be 4:1", obsoleteVersion,
obsoletes.get(0)); + assertEquals("Should have all 7 versions stored", 7, store.get(key, null).size()); + vals.remove(0); + vals.addAll(saveVals); + assertTrue("All 7 concurrent versions expected", + TestUtils.areVersionedListsEqual(vals, store.get(key, null))); + + // super version, makes all versions obsolete + key = new ByteArray("mvpKey1".getBytes()); + vals = new ArrayList>(); + vals.add(TestUtils.getVersioned("val1234567".getBytes(), 1, 2, 3, 4, 5, 6, 7)); + obsoletes = store.multiVersionPut(key, vals); + assertTrue("Should not be any rejected versions..", obsoletes.size() == 0); + assertEquals("Exactly one version to be stored", 1, store.get(key, null).size()); + assertTrue("Exactly one version to be stored", + TestUtils.areVersionedListsEqual(vals, store.get(key, null))); + } catch(UnsupportedOperationException uoe) { + // expected if the storage engine does not support multi version + // puts + System.err.println("Multi version puts not supported in test " + + this.getClass().getName()); + } + } + @SuppressWarnings("unused") private boolean remove(List list, byte[] item) { Iterator it = list.iterator(); diff --git a/test/unit/voldemort/store/bdb/BdbStorageEngineTest.java b/test/unit/voldemort/store/bdb/BdbStorageEngineTest.java index 80bf2b2c7b..ab55340b34 100644 --- a/test/unit/voldemort/store/bdb/BdbStorageEngineTest.java +++ b/test/unit/voldemort/store/bdb/BdbStorageEngineTest.java @@ -17,6 +17,7 @@ package voldemort.store.bdb; import java.io.File; +import java.io.FilenameFilter; import java.util.Arrays; import java.util.Collection; import java.util.List; @@ -295,7 +296,16 @@ public void testNativeBackup() throws Exception { String[] backedUp = backupToDir.list(); Arrays.sort(backedUp); assertArrayEquals(backedUp, new String[] { "00000000.jdb", "00000001.jdb" }); - assertEquals(backupFileModified, backupToDir.listFiles()[0].lastModified()); + FilenameFilter filter = new FilenameFilter() { + + @Override + public boolean accept(File dir, String name) { + if(name.equals("00000000.jdb")) + return true; + return false; + } + }; + assertEquals(backupFileModified, backupToDir.listFiles(filter)[0].lastModified()); } finally { deleteDir(backupToDir); } diff --git a/test/unit/voldemort/store/metadata/MetadataStoreTest.java b/test/unit/voldemort/store/metadata/MetadataStoreTest.java index 83702b7601..63032d1a4f 100644 --- a/test/unit/voldemort/store/metadata/MetadataStoreTest.java +++ b/test/unit/voldemort/store/metadata/MetadataStoreTest.java @@ -16,14 +16,21 @@ package voldemort.store.metadata; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; -import junit.framework.TestCase; +import org.junit.Before; +import org.junit.Test; + import voldemort.ServerTestUtils; import voldemort.client.rebalance.RebalancePartitionsInfo; +import voldemort.cluster.Cluster; import voldemort.server.rebalance.RebalancerState; import voldemort.store.metadata.MetadataStore.VoldemortState; import voldemort.utils.ByteArray; @@ -36,7 +43,7 @@ import com.google.common.collect.Maps; -public class MetadataStoreTest extends TestCase { +public class MetadataStoreTest { private static int TEST_RUNS = 100; @@ -44,11 +51,11 @@ public class MetadataStoreTest extends TestCase { private List TEST_KEYS = Arrays.asList(MetadataStore.CLUSTER_KEY, MetadataStore.STORES_KEY, MetadataStore.REBALANCING_STEAL_INFO, - MetadataStore.SERVER_STATE_KEY); + 
MetadataStore.SERVER_STATE_KEY,
+ MetadataStore.REBALANCING_SOURCE_CLUSTER_XML);
- @Override
+ @Before
 public void setUp() throws Exception {
- super.setUp();
 metadataStore = ServerTestUtils.createMetadataStore(ServerTestUtils.getLocalCluster(1),
 ServerTestUtils.getStoreDefs(1));
 }
@@ -61,7 +68,8 @@ public ByteArray getValidKey() {
 public byte[] getValidValue(ByteArray key) {
 String keyString = ByteUtils.getString(key.get(), "UTF-8");
- if(MetadataStore.CLUSTER_KEY.equals(keyString)) {
+ if(MetadataStore.CLUSTER_KEY.equals(keyString)
+ || MetadataStore.REBALANCING_SOURCE_CLUSTER_XML.equals(keyString)) {
 return ByteUtils.getBytes(new ClusterMapper().writeCluster(ServerTestUtils.getLocalCluster(1)),
 "UTF-8");
 } else if(MetadataStore.STORES_KEY.equals(keyString)) {
@@ -96,6 +104,7 @@ public byte[] getValidValue(ByteArray key) {
 throw new RuntimeException("Unhandled key:" + keyString + " passed");
 }
+ @Test
 public void testSimpleGetAndPut() {
 for(int i = 0; i <= TEST_RUNS; i++) {
 ByteArray key = getValidKey();
@@ -108,6 +117,7 @@ public void testSimpleGetAndPut() {
 }
 }
+ @Test
 public void testRepeatedPuts() {
 for(int i = 0; i <= TEST_RUNS; i++) {
 for(int j = 0; j <= 5; j++) {
@@ -123,6 +133,7 @@ public void testRepeatedPuts() {
 }
 }
+ @Test
 public void testObsoletePut() {
 for(int i = 0; i <= TEST_RUNS; i++) {
 ByteArray key = getValidKey();
@@ -141,6 +152,7 @@ public void testObsoletePut() {
 }
 }
+ @Test
 public void testSynchronousPut() {
 for(int i = 0; i <= TEST_RUNS; i++) {
 ByteArray key = getValidKey();
@@ -160,6 +172,7 @@ public void testSynchronousPut() {
 }
 }
+ @Test
 public void testCleanAllStates() {
 // put state entries.
 incrementVersionAndPut(metadataStore,
@@ -167,7 +180,7 @@ public void testCleanAllStates() {
 MetadataStore.VoldemortState.REBALANCING_MASTER_SERVER);
 assertEquals("Values should match.",
- metadataStore.getServerState(),
+ metadataStore.getServerStateUnlocked(),
 VoldemortState.REBALANCING_MASTER_SERVER);
 // do clean
@@ -175,10 +188,32 @@
 // check all values revert back to default.
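 // i.e. after the clean step above, SERVER_STATE_KEY must be back at
 // NORMAL_SERVER, which is what the assertion below verifies.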
assertEquals("Values should match.", - metadataStore.getServerState(), + metadataStore.getServerStateUnlocked(), VoldemortState.NORMAL_SERVER); } + @Test + public void testRebalacingSourceClusterXmlKey() { + metadataStore.cleanAllRebalancingState(); + + assertTrue("Should be null", null == metadataStore.getRebalancingSourceCluster()); + + Cluster dummyCluster = ServerTestUtils.getLocalCluster(2); + metadataStore.put(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, dummyCluster); + assertEquals("Should be equal", dummyCluster, metadataStore.getRebalancingSourceCluster()); + + metadataStore.put(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, (Object) null); + assertTrue("Should be null", null == metadataStore.getRebalancingSourceCluster()); + + List> sourceClusterVersions = metadataStore.get(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML, + null); + assertTrue("Just one version expected", 1 == sourceClusterVersions.size()); + assertEquals("Empty string should map to null", + "", + new String(sourceClusterVersions.get(0).getValue())); + + } + private void checkValues(Versioned value, List> list, ByteArray key) { assertEquals("should return exactly one value ", 1, list.size()); diff --git a/test/unit/voldemort/store/rebalancing/RedirectingStoreTest.java b/test/unit/voldemort/store/rebalancing/RedirectingStoreTest.java index 2ecfb2bb50..3ce557faf0 100644 --- a/test/unit/voldemort/store/rebalancing/RedirectingStoreTest.java +++ b/test/unit/voldemort/store/rebalancing/RedirectingStoreTest.java @@ -16,18 +16,24 @@ package voldemort.store.rebalancing; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + import java.io.File; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import java.util.Properties; import java.util.Set; -import java.util.Map.Entry; - -import junit.framework.TestCase; +import java.util.concurrent.Executors; import org.apache.commons.io.FileUtils; import org.junit.After; @@ -56,14 +62,17 @@ import voldemort.server.VoldemortServer; import voldemort.server.rebalance.RebalancerState; import voldemort.store.InvalidMetadataException; +import voldemort.store.Store; import voldemort.store.StoreDefinition; import voldemort.store.StoreDefinitionBuilder; -import voldemort.store.memory.InMemoryStorageConfiguration; +import voldemort.store.bdb.BdbStorageConfiguration; import voldemort.store.metadata.MetadataStore; import voldemort.store.socket.SocketStoreFactory; import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; import voldemort.utils.ByteArray; +import voldemort.utils.DaemonThreadFactory; import voldemort.utils.RebalanceUtils; +import voldemort.versioning.ClockEntry; import voldemort.versioning.ObsoleteVersionException; import voldemort.versioning.VectorClock; import voldemort.versioning.Versioned; @@ -73,7 +82,7 @@ import com.google.common.collect.Maps; @RunWith(Parameterized.class) -public class RedirectingStoreTest extends TestCase { +public class RedirectingStoreTest { private VoldemortServer[] servers; private Cluster targetCluster; @@ -82,6 +91,8 @@ public class RedirectingStoreTest extends TestCase { private List secondaryPartitionsMoved; private HashMap primaryEntriesMoved; private HashMap secondaryEntriesMoved; + private HashMap proxyPutTestPrimaryEntries; + 
private HashMap<ByteArray, byte[]> proxyPutTestSecondaryEntries;
 private final boolean useNio;
 private StoreDefinition storeDef;
 private final SocketStoreFactory storeFactory = new ClientRequestExecutorPool(2,
@@ -98,7 +109,6 @@ public static Collection<Object[]> configs() {
 return Arrays.asList(new Object[][] { { true }, { false } });
 }
- @Override
 @Before
 public void setUp() throws IOException, InterruptedException {
 currentCluster = ServerTestUtils.getLocalCluster(3, new int[][] { { 0, 1 }, { 2, 3 }, {} });
@@ -106,10 +116,10 @@ public void setUp() throws IOException, InterruptedException {
 this.primaryPartitionsMoved = Lists.newArrayList(0);
 this.secondaryPartitionsMoved = Lists.newArrayList(2, 3);
 this.storeDef = new StoreDefinitionBuilder().setName("test")
- .setType(InMemoryStorageConfiguration.TYPE_NAME)
+ .setType(BdbStorageConfiguration.TYPE_NAME)
 .setKeySerializer(new SerializerDefinition("string"))
 .setValueSerializer(new SerializerDefinition("string"))
- .setRoutingPolicy(RoutingTier.SERVER)
+ .setRoutingPolicy(RoutingTier.CLIENT)
 .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY)
 .setReplicationFactor(2)
 .setPreferredReads(1)
@@ -142,6 +152,8 @@ public void setUp() throws IOException, InterruptedException {
 this.primaryEntriesMoved = Maps.newHashMap();
 this.secondaryEntriesMoved = Maps.newHashMap();
+ this.proxyPutTestPrimaryEntries = Maps.newHashMap();
+ this.proxyPutTestSecondaryEntries = Maps.newHashMap();
 RoutingStrategy strategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDef,
 currentCluster);
@@ -159,6 +171,32 @@ public void setUp() throws IOException, InterruptedException {
 // Hope the 'God of perfect timing' is on our side
 Thread.sleep(500);
+ // steal a few primary key-value pairs for testing proxy put logic
+ int cnt = 0;
+ for(Entry<ByteArray, byte[]> entry: primaryEntriesMoved.entrySet()) {
+ if(cnt > 3)
+ break;
+ this.proxyPutTestPrimaryEntries.put(entry.getKey(), entry.getValue());
+ cnt++;
+ }
+ for(ByteArray key: this.proxyPutTestPrimaryEntries.keySet()) {
+ this.primaryEntriesMoved.remove(key);
+ }
+ assertTrue("Not enough primary entries", primaryEntriesMoved.size() > 1);
+
+ // steal a few secondary key-value pairs for testing proxy put logic
+ cnt = 0;
+ for(Entry<ByteArray, byte[]> entry: secondaryEntriesMoved.entrySet()) {
+ if(cnt > 3)
+ break;
+ this.proxyPutTestSecondaryEntries.put(entry.getKey(), entry.getValue());
+ cnt++;
+ }
+ for(ByteArray key: this.proxyPutTestSecondaryEntries.keySet()) {
+ this.secondaryEntriesMoved.remove(key);
+ }
+ assertTrue("Not enough secondary entries", secondaryEntriesMoved.size() > 1);
+
 RebalanceClusterPlan plan = new RebalanceClusterPlan(currentCluster,
 targetCluster,
 Lists.newArrayList(storeDef),
@@ -173,6 +211,14 @@
 servers[partitionPlan.getStealerId()].getMetadataStore()
 .put(MetadataStore.REBALANCING_STEAL_INFO,
 new RebalancerState(Lists.newArrayList(partitionPlan)));
+ servers[partitionPlan.getStealerId()].getMetadataStore()
+ .put(MetadataStore.REBALANCING_SOURCE_CLUSTER_XML,
+ currentCluster);
+
+ // update original store defs
+ servers[partitionPlan.getStealerId()].getMetadataStore()
+ .put(MetadataStore.REBALANCING_SOURCE_STORES_XML,
+ Lists.newArrayList(storeDef));
 }
 // Update the cluster metadata on all three nodes
@@ -182,7 +228,6 @@
 }
- @Override
 @After
 public void tearDown() {
 for(VoldemortServer server: servers) {
@@ -225,7 +270,11 @@ private RedirectingStore getRedirectingStore(int nodeId,
 metadata,
servers[nodeId].getStoreRepository(), new NoopFailureDetector(), - storeFactory); + storeFactory, + true, + Executors.newFixedThreadPool(1, + new DaemonThreadFactory("voldemort-proxy-put-thread")), + new ProxyPutStats(null)); } @Test @@ -316,7 +365,7 @@ public void testProxyGetAll() { } @Test - public void testProxyPut() { + public void testProxyGetDuringPut() { final RedirectingStore storeNode2 = getRedirectingStore(2, servers[2].getMetadataStore(), @@ -354,7 +403,272 @@ public void testProxyPut() { } } + } + + /** + * This exits out immediately if the node is not proxy putting. + * + * @param store + */ + private void waitForProxyPutsToDrain(RedirectingStore store) { + // wait for the proxy write to complete + while(store.getProxyPutStats().getNumPendingProxyPuts() > 0) { + try { + Thread.sleep(50); + } catch(InterruptedException e) { + e.printStackTrace(); + } + } + } + + @Test + public void testProxyPuts() { + + List testPrimaryKeys = new ArrayList(this.proxyPutTestPrimaryEntries.keySet()); + List testSecondaryKeys = new ArrayList(this.proxyPutTestSecondaryEntries.keySet()); + + final RedirectingStore redirectingStoreNode2 = getRedirectingStore(2, + servers[2].getMetadataStore(), + "test"); + final RedirectingStore redirectingStoreNode0 = getRedirectingStore(0, + servers[0].getMetadataStore(), + "test"); + final Store socketStoreNode2 = redirectingStoreNode2.getRedirectingSocketStore("test", + 2); + final Store socketStoreNode0 = redirectingStoreNode0.getRedirectingSocketStore("test", + 0); + + // 1. Make sure the vector clocks make sense.. Read through Node 2 and + // proxy getting from Node 0 and issue a write based off that, + // incrementing the clock for Node 2 and make sure there is no + // ObsoleteVersionException at both Node 0 and + // Node 2. + ByteArray secondaryKey = testSecondaryKeys.get(0); + VectorClock clock1 = ((VectorClock) redirectingStoreNode2.getVersions(secondaryKey).get(0)).incremented(2, + System.currentTimeMillis()); + try { + redirectingStoreNode2.put(secondaryKey, + Versioned.value("write-through".getBytes("UTF-8"), clock1), + null); + } catch(Exception e) { + fail("Unexpected error in testing write through proxy put"); + e.printStackTrace(); + } + waitForProxyPutsToDrain(redirectingStoreNode2); + + assertTrue("Unexpected failures in proxy put", + redirectingStoreNode2.getProxyPutStats().getNumProxyPutFailures() == 0); + assertEquals("Unexpected value in Node 2", + "write-through", + new String(socketStoreNode2.get(secondaryKey, null).get(0).getValue())); + assertTrue("Proxy write not seen on proxy node 0", + "write-through".equals(new String(socketStoreNode0.get(secondaryKey, null) + .get(0) + .getValue()))); + + // Also test that if put fails locally, proxy put is not attempted. + try { + redirectingStoreNode2.put(secondaryKey, + Versioned.value("write-through-updated".getBytes("UTF-8"), + clock1), + null); + fail("Should have thrown OVE"); + } catch(ObsoleteVersionException ove) { + // Expected + } catch(Exception e) { + fail("Unexpected error in testing write through proxy put"); + e.printStackTrace(); + } + waitForProxyPutsToDrain(redirectingStoreNode2); + assertFalse("Proxy write not seen on proxy node 0", + "write-through-updated".equals(new String(socketStoreNode0.get(secondaryKey, + null) + .get(0) + .getValue()))); + + // 2. Make sure if the proxy node is still a replica, we don't issue + // proxy puts. 
Node 2 -> Node 0 on partition 0, for which Node 0 is + // still a replica + ByteArray primaryKey = testPrimaryKeys.get(0); + VectorClock clock2 = ((VectorClock) redirectingStoreNode2.getVersions(primaryKey).get(0)).incremented(2, + System.currentTimeMillis()); + try { + redirectingStoreNode2.put(primaryKey, + Versioned.value("write-through".getBytes("UTF-8"), clock2), + null); + } catch(Exception e) { + fail("Unexpected error in testing write through proxy put"); + e.printStackTrace(); + } + waitForProxyPutsToDrain(redirectingStoreNode2); + assertEquals("Unexpected value in Node 2", + "write-through", + new String(socketStoreNode2.get(primaryKey, null).get(0).getValue())); + assertFalse("Proxy write seen on proxy node which is a replica", + "write-through".equals(new String(socketStoreNode0.get(primaryKey, null) + .get(0) + .getValue()))); + + // 3. If the same entry reaches Node 2 again from Node 0, via partition + // fetch, it will + // generate OVE. + try { + redirectingStoreNode2.put(primaryKey, + Versioned.value("write-through".getBytes("UTF-8"), clock2), + null); + fail("Should have thrown OVE"); + } catch(ObsoleteVersionException ove) { + // Expected + } catch(Exception e) { + fail("Unexpected error in testing write through proxy put"); + e.printStackTrace(); + } + } + private VectorClock makeSuperClock(long time) { + List clockEntries = new ArrayList(); + clockEntries.add(new ClockEntry((short) 0, time)); + clockEntries.add(new ClockEntry((short) 1, time)); + clockEntries.add(new ClockEntry((short) 2, time)); + return new VectorClock(clockEntries, time); } + @Test + public void testProxyFetchOptimizations() { + + List testPrimaryKeys = new ArrayList(this.proxyPutTestPrimaryEntries.keySet()); + List testSecondaryKeys = new ArrayList(this.proxyPutTestSecondaryEntries.keySet()); + + final RedirectingStore redirectingStoreNode2 = getRedirectingStore(2, + servers[2].getMetadataStore(), + "test"); + final RedirectingStore redirectingStoreNode0 = getRedirectingStore(0, + servers[0].getMetadataStore(), + "test"); + final Store socketStoreNode2 = redirectingStoreNode2.getRedirectingSocketStore("test", + 2); + final Store socketStoreNode0 = redirectingStoreNode0.getRedirectingSocketStore("test", + 0); + + long time = System.currentTimeMillis(); + // 1. Test that once a key is fetched over, get() can serve it locally.. + ByteArray primaryKey1 = testPrimaryKeys.get(1); + assertTrue("Originally key should not exist on Node 2", + socketStoreNode2.get(primaryKey1, null).size() == 0); + + assertTrue("get on Node 2 should return a valid value by proxy fetching from Node 0", + redirectingStoreNode2.get(primaryKey1, null).size() > 0); + + socketStoreNode0.delete(primaryKey1, makeSuperClock(time++)); + assertTrue("Still should be able to serve it locally from Node 2", + redirectingStoreNode2.get(primaryKey1, null).size() > 0); + + // 2. Test that put is still issued on top of version on remote version. + // But once moved over, can be issued just on local version. + ByteArray secondaryKey1 = testSecondaryKeys.get(1); + VectorClock writeClock = makeSuperClock(time++); + socketStoreNode0.put(secondaryKey1, new Versioned("value-win".getBytes(), + writeClock), null); + try { + redirectingStoreNode2.put(secondaryKey1, new Versioned("value-ove".getBytes(), + writeClock), null); + fail("Missing OVE.. 
put should be based on remote version");
+        } catch(ObsoleteVersionException ove) {
+            // should have OVE if based on remote version due to equal clock
+        }
+        // But the value would still have been moved over from Node 0
+        assertEquals("Value not moved over from Node 0",
+                     "value-win",
+                     new String(socketStoreNode2.get(secondaryKey1, null).get(0).getValue()));
+        socketStoreNode0.delete(secondaryKey1, makeSuperClock(time++));
+        redirectingStoreNode2.put(secondaryKey1,
+                                  new Versioned("value-final".getBytes(),
+                                                makeSuperClock(time++)),
+                                  null);
+        assertEquals("Final value not found on node 2",
+                     "value-final",
+                     new String(socketStoreNode2.get(secondaryKey1, null).get(0).getValue()));
+        assertEquals("Final value not found on node 0",
+                     "value-final",
+                     new String(socketStoreNode0.get(secondaryKey1, null).get(0).getValue()));
+
+        // delete all the primary and secondary keys from Node 2 and Node 0, to
+        // begin the getAll() tests
+        for(ByteArray key: testPrimaryKeys) {
+            socketStoreNode0.delete(key, makeSuperClock(time++));
+            socketStoreNode2.delete(key, makeSuperClock(time++));
+            socketStoreNode0.put(key, new Versioned("normal".getBytes(),
+                                                    makeSuperClock(time++)), null);
+        }
+        for(ByteArray key: testSecondaryKeys) {
+            socketStoreNode0.delete(key, makeSuperClock(time++));
+            socketStoreNode2.delete(key, makeSuperClock(time++));
+            socketStoreNode0.put(key, new Versioned("normal".getBytes(),
+                                                    makeSuperClock(time++)), null);
+        }
+
+        // 3. Test the case where some keys are moved over and some aren't,
+        // for getAll().
+        List keyList = new ArrayList();
+        keyList.addAll(testPrimaryKeys);
+        keyList.addAll(testSecondaryKeys);
+        keyList.add(new ByteArray("non-existent-key".getBytes()));
+
+        // add the first primary & secondary key with a bigger vector clock on
+        // Node 2 and a lower clock on Node 0.
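// Illustrative note: makeSuperClock(t) stamps nodes 0, 1 and 2 with version t,
// so of two super clocks the one built with the larger t strictly dominates
// the other (Occurred.AFTER on every entry; timestamps play no part in the
// comparison). getAll() on Node 2 should therefore resolve these two keys to
// the "winner" value written below, not to Node 0's "loser" value.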
+ VectorClock smallerClock = makeSuperClock(time++); + VectorClock biggerClock = makeSuperClock(time++); + socketStoreNode0.put(testPrimaryKeys.get(0), new Versioned("loser".getBytes(), + smallerClock), null); + socketStoreNode2.put(testPrimaryKeys.get(0), new Versioned("winner".getBytes(), + biggerClock), null); + socketStoreNode0.put(testSecondaryKeys.get(0), new Versioned("loser".getBytes(), + smallerClock), null); + socketStoreNode2.put(testSecondaryKeys.get(0), new Versioned("winner".getBytes(), + biggerClock), null); + + Map>> vals = redirectingStoreNode2.getAll(keyList, null); + assertEquals("Should contain exactly as many keys as the primary + secondary keys", + testPrimaryKeys.size() + testSecondaryKeys.size(), + vals.size()); + assertFalse("Should not contain non existent key", + vals.containsKey(new ByteArray("non-existent-key".getBytes()))); + + for(Entry>> entry: vals.entrySet()) { + String valueStr = new String(entry.getValue().get(0).getValue()); + if(entry.getKey().equals(testPrimaryKeys.get(0)) + || entry.getKey().equals(testSecondaryKeys.get(0))) { + assertEquals("Value should be 'winner'", "winner", valueStr); + } else { + assertEquals("Value should be 'normal'", "normal", valueStr); + } + } + + // Now delete all keys on Node 0 and make sure it is still served out of + // Node 2 + for(ByteArray key: testPrimaryKeys) { + socketStoreNode0.delete(key, makeSuperClock(time++)); + } + for(ByteArray key: testSecondaryKeys) { + socketStoreNode0.delete(key, makeSuperClock(time++)); + } + + vals = redirectingStoreNode2.getAll(keyList, null); + assertEquals("Should contain exactly as many keys as the primary + secondary keys", + testPrimaryKeys.size() + testSecondaryKeys.size(), + vals.size()); + assertFalse("Should not contain non existent key", + vals.containsKey(new ByteArray("non-existent-key".getBytes()))); + + for(Entry>> entry: vals.entrySet()) { + String valueStr = new String(entry.getValue().get(0).getValue()); + if(entry.getKey().equals(testPrimaryKeys.get(0)) + || entry.getKey().equals(testSecondaryKeys.get(0))) { + assertEquals("Value should be 'winner'", "winner", valueStr); + } else { + assertEquals("Value should be 'normal'", "normal", valueStr); + } + } + + } } diff --git a/test/unit/voldemort/store/routed/HintedHandoffSendHintTest.java b/test/unit/voldemort/store/routed/HintedHandoffSendHintTest.java new file mode 100644 index 0000000000..170ba2463a --- /dev/null +++ b/test/unit/voldemort/store/routed/HintedHandoffSendHintTest.java @@ -0,0 +1,468 @@ +package voldemort.store.routed; + +import static org.junit.Assert.assertTrue; +import static voldemort.VoldemortTestConstants.getNineNodeCluster; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.Random; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.TimeUnit; + +import org.apache.log4j.Logger; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import voldemort.ServerTestUtils; +import voldemort.TestUtils; +import voldemort.client.ClientConfig; +import voldemort.client.RoutingTier; +import voldemort.client.TimeoutConfig; +import voldemort.cluster.Cluster; +import voldemort.cluster.Node; +import voldemort.cluster.Zone; +import voldemort.cluster.failuredetector.FailureDetector; +import 
voldemort.cluster.failuredetector.FailureDetectorConfig; +import voldemort.cluster.failuredetector.FailureDetectorUtils; +import voldemort.cluster.failuredetector.MutableStoreVerifier; +import voldemort.cluster.failuredetector.ThresholdFailureDetector; +import voldemort.common.service.ServiceType; +import voldemort.common.service.VoldemortService; +import voldemort.routing.RoutingStrategy; +import voldemort.routing.RoutingStrategyFactory; +import voldemort.routing.RoutingStrategyType; +import voldemort.serialization.ByteArraySerializer; +import voldemort.serialization.IdentitySerializer; +import voldemort.serialization.Serializer; +import voldemort.serialization.SerializerDefinition; +import voldemort.serialization.SlopSerializer; +import voldemort.server.RequestRoutingType; +import voldemort.server.VoldemortConfig; +import voldemort.server.VoldemortServer; +import voldemort.server.storage.StorageService; +import voldemort.store.ForceFailStore; +import voldemort.store.Store; +import voldemort.store.StoreDefinition; +import voldemort.store.StoreDefinitionBuilder; +import voldemort.store.UnreachableStoreException; +import voldemort.store.memory.InMemoryStorageConfiguration; +import voldemort.store.nonblockingstore.NonblockingStore; +import voldemort.store.serialized.SerializingStore; +import voldemort.store.slop.Slop; +import voldemort.store.slop.SlopStorageEngine; +import voldemort.store.slop.strategy.HintedHandoffStrategyType; +import voldemort.store.socket.SocketStore; +import voldemort.store.socket.SocketStoreFactory; +import voldemort.store.socket.clientrequest.ClientRequestExecutorPool; +import voldemort.utils.ByteArray; +import voldemort.utils.ByteUtils; +import voldemort.versioning.Versioned; + +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.Maps; +import com.google.common.collect.Multimap; +import com.google.common.collect.Sets; + +@RunWith(Parameterized.class) +public class HintedHandoffSendHintTest { + + private final static Logger logger = Logger.getLogger(HintedHandoffSendHintTest.class); + + private final static String STORE_NAME = "test"; + private final static String SLOP_STORE_NAME = "slop"; + + private final static int NUM_NODES_TOTAL = 9; + private final static int NUM_NODES_FAILED = 4; + + private int REPLICATION_FACTOR = 3; + private final static int P_READS = 1; + private final static int R_READS = 1; + private int P_WRITES = 1; + private int R_WRITES = 1; + + private final static int KEY_LENGTH = 32; + private final static int VALUE_LENGTH = 32; + private final static int SOCKET_TIMEOUT_MS = 500; + + private final Class failureDetectorCls = ThresholdFailureDetector.class; + private final HintedHandoffStrategyType hintRoutingStrategy; + + private final Map> subStores = new ConcurrentHashMap>(); + private final Map> forceFailStores = new ConcurrentHashMap>(); + private final Map> slopStores = new ConcurrentHashMap>(); + private final Map socketTestStores = new HashMap(); + private final Map socketSlopStores = new HashMap(); + private final Map slopStorageEngines = new ConcurrentHashMap(); + private final Multimap keysToNodes = HashMultimap.create(); + private final Map keyValues = Maps.newHashMap(); + private final Map voldemortServers = new HashMap(); + + private Cluster cluster; + private FailureDetector failureDetector; + private StoreDefinition storeDef; + private RoutingStrategy strategy; + private RoutedStore routedStore; + private final List keyList = new ArrayList(); + + final 
static class SocketStoreClientFactoryForTest { + + private final String storeName; + private final String slopStoreName; + private final ClientRequestExecutorPool storeFactory; + private final ClientConfig config; + + public SocketStoreClientFactoryForTest(String testStoreName, String slopStoreName) { + this.storeName = testStoreName; + this.slopStoreName = slopStoreName; + config = new ClientConfig(); + storeFactory = new ClientRequestExecutorPool(config.getSelectors(), + config.getMaxConnectionsPerNode(), + config.getConnectionTimeout(TimeUnit.MILLISECONDS), + SOCKET_TIMEOUT_MS, + config.getSocketBufferSize(), + config.getSocketKeepAlive(), + false, + 0); + } + + protected SocketStore getSocketTestStoreByNode(Node node) { + return storeFactory.create(storeName, + node.getHost(), + node.getSocketPort(), + config.getRequestFormatType(), + RequestRoutingType.getRequestRoutingType(false, false)); + } + + protected SocketStore getSocketSlopStoreByNode(Node node) { + return storeFactory.create(slopStoreName, + node.getHost(), + node.getSocketPort(), + config.getRequestFormatType(), + RequestRoutingType.getRequestRoutingType(false, false)); + } + } + + public HintedHandoffSendHintTest(HintedHandoffStrategyType hintRoutingStrategy, + int replicationFactor, + int requiredWrites, + int preferredWrites) { + this.hintRoutingStrategy = hintRoutingStrategy; + this.REPLICATION_FACTOR = replicationFactor; + this.R_WRITES = requiredWrites; + this.P_WRITES = preferredWrites; + } + + @Parameterized.Parameters + public static Collection configs() { + return Arrays.asList(new Object[][] { + { HintedHandoffStrategyType.CONSISTENT_STRATEGY, 3, 1, 1 }, + { HintedHandoffStrategyType.CONSISTENT_STRATEGY, 3, 1, 2 }, + { HintedHandoffStrategyType.CONSISTENT_STRATEGY, 3, 1, 3 }, + { HintedHandoffStrategyType.CONSISTENT_STRATEGY, 3, 2, 2 }, + { HintedHandoffStrategyType.CONSISTENT_STRATEGY, 3, 2, 3 }, + { HintedHandoffStrategyType.CONSISTENT_STRATEGY, 2, 1, 1 }, + { HintedHandoffStrategyType.CONSISTENT_STRATEGY, 2, 1, 2 }, + { HintedHandoffStrategyType.PROXIMITY_STRATEGY, 3, 1, 1 }, + { HintedHandoffStrategyType.PROXIMITY_STRATEGY, 3, 1, 2 }, + { HintedHandoffStrategyType.PROXIMITY_STRATEGY, 3, 1, 3 }, + { HintedHandoffStrategyType.PROXIMITY_STRATEGY, 3, 2, 2 }, + { HintedHandoffStrategyType.PROXIMITY_STRATEGY, 3, 2, 3 }, + { HintedHandoffStrategyType.PROXIMITY_STRATEGY, 2, 1, 1 }, + { HintedHandoffStrategyType.PROXIMITY_STRATEGY, 2, 1, 2 } }); + } + + private StoreDefinition getStoreDef() { + SerializerDefinition serDef = new SerializerDefinition("string"); + return new StoreDefinitionBuilder().setName(STORE_NAME) + .setType(InMemoryStorageConfiguration.TYPE_NAME) + .setKeySerializer(serDef) + .setValueSerializer(serDef) + .setRoutingPolicy(RoutingTier.CLIENT) + .setRoutingStrategyType(RoutingStrategyType.CONSISTENT_STRATEGY) + .setReplicationFactor(REPLICATION_FACTOR) + .setPreferredReads(P_READS) + .setRequiredReads(R_READS) + .setPreferredWrites(P_WRITES) + .setRequiredWrites(R_WRITES) + .setHintedHandoffStrategy(this.hintRoutingStrategy) + .build(); + } + + @Before + public void setUp() throws Exception { + if(logger.isDebugEnabled()) { + logger.debug("Test Started: replication[" + REPLICATION_FACTOR + "], preferredW[" + + P_WRITES + "], requiredW[" + R_WRITES + "]"); + } + cluster = getNineNodeCluster(); + storeDef = getStoreDef(); + + // create voldemort servers + for(Integer nodeId = 0; nodeId < NUM_NODES_TOTAL; nodeId++) { + SocketStoreFactory socketStoreFactory; + socketStoreFactory = new 
ClientRequestExecutorPool(2, 10000, 100000, 1024); + List stores = new ArrayList(); + stores.add(storeDef); + VoldemortConfig config = ServerTestUtils.createServerConfigWithDefs(true, + nodeId, + TestUtils.createTempDir() + .getAbsolutePath(), + cluster, + stores, + new Properties()); + config.setNioAdminConnectorSelectors(1); + config.setNioConnectorSelectors(2); + VoldemortServer vs = ServerTestUtils.startVoldemortServer(socketStoreFactory, config); + VoldemortService vsrv = vs.getService(ServiceType.STORAGE); + StorageService ss = (StorageService) vsrv; + voldemortServers.put(nodeId, vs); + + slopStorageEngines.put(nodeId, ss.getStoreRepository().getSlopStore()); + slopStores.put(nodeId, SerializingStore.wrap(ss.getStoreRepository().getSlopStore(), + new ByteArraySerializer(), + new SlopSerializer(), + new IdentitySerializer())); + // wrap original store with force fail store + Store store = ss.getStoreRepository() + .removeLocalStore(STORE_NAME); + UnreachableStoreException exception = new UnreachableStoreException("Force failed"); + ForceFailStore forceFailStore = new ForceFailStore(store, + exception); + forceFailStores.put(nodeId, forceFailStore); + ss.getStoreRepository().addLocalStore(forceFailStore); + } + + strategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDef, cluster); + + // create client socket stores and slop stores + SocketStoreClientFactoryForTest clientSocketStoreFactory = new SocketStoreClientFactoryForTest(STORE_NAME, + SLOP_STORE_NAME); + Serializer slopKeySerializer = new ByteArraySerializer(); + Serializer slopValueSerializer = new SlopSerializer(); + Map> testStores = subStores; + Map> slopStores = new HashMap>(); + for(Node node: cluster.getNodes()) { + // test store + SocketStore socketTestStore = clientSocketStoreFactory.getSocketTestStoreByNode(node); + socketTestStores.put(node.getId(), socketTestStore); + testStores.put(node.getId(), socketTestStore); + + // slop store + SocketStore socketSlopStore = clientSocketStoreFactory.getSocketSlopStoreByNode(node); + Store slopStore = SerializingStore.wrap(socketSlopStore, + slopKeySerializer, + slopValueSerializer, + new IdentitySerializer()); + socketSlopStores.put(node.getId(), socketSlopStore); + slopStores.put(node.getId(), slopStore); + } + + // set failure detector + if(failureDetector != null) + failureDetector.destroy(); + FailureDetectorConfig failureDetectorConfig = new FailureDetectorConfig(); + failureDetectorConfig.setImplementationClassName(failureDetectorCls.getName()); + failureDetectorConfig.setThreshold(50); + failureDetectorConfig.setCluster(cluster); + failureDetectorConfig.setStoreVerifier(MutableStoreVerifier.create(subStores)); + failureDetector = FailureDetectorUtils.create(failureDetectorConfig, false); + + // make routedStore + routedStore = new PipelineRoutedStore(STORE_NAME, + testStores, + socketTestStores, + slopStores, + socketSlopStores, + cluster, + storeDef, + false, + Zone.DEFAULT_ZONE_ID, + new TimeoutConfig(1500L, false), + failureDetector, + false, + 0); + + // generate the keys + for(int i = 0; i < 5; i++) { + Set nodesCovered = Sets.newHashSet(); + while(nodesCovered.size() < NUM_NODES_TOTAL) { + ByteArray randomKey = new ByteArray(TestUtils.randomBytes(KEY_LENGTH)); + byte[] randomValue = TestUtils.randomBytes(VALUE_LENGTH); + + if(randomKey.length() > 0 && randomValue.length > 0) { + if(!keyList.contains(randomKey)) { + for(Node node: strategy.routeRequest(randomKey.get())) { + keysToNodes.put(randomKey, node.getId()); + nodesCovered.add(node.getId()); + } + 
logger.info("Inserting key [" + randomKey + "] to key list as id:" + + keyList.size()); + keyList.add(randomKey); + keyValues.put(randomKey, new ByteArray(randomValue)); + } + } + } + } + } + + @After + public void tearDown() throws Exception { + if(failureDetector != null) + failureDetector.destroy(); + for(VoldemortServer vs: voldemortServers.values()) { + vs.stop(); + } + routedStore.close(); + if(logger.isDebugEnabled()) { + logger.debug("Test Ended: replication[" + REPLICATION_FACTOR + "], preferredW[" + + P_WRITES + "], requiredW[" + R_WRITES + "]"); + } + } + + @Test + public void testHintedHandoff() throws Exception { + Set failedNodeSet = chooseFailedNodeSet(NUM_NODES_FAILED); + Multimap nodeToFailedKeysMap = doBatchPut(failedNodeSet); + + // wait for async operations + // must be greater than socket timeout to ensure slop is registered + logger.debug("Sleep for async operations to finish"); + Thread.sleep(Math.max(2000, SOCKET_TIMEOUT_MS * 2)); + + Map> nodeToSlopData = new HashMap>(); + Set slopKeys = makeSlopKeys(nodeToFailedKeysMap, Slop.Operation.PUT); + for(Store slopStore: slopStores.values()) { + Map>> getAllResult = slopStore.getAll(slopKeys, null); + for(Map.Entry>> entry: getAllResult.entrySet()) { + Slop slop = entry.getValue().get(0).getValue(); + Integer nodeId = slop.getNodeId(); + // get data + if(!nodeToSlopData.containsKey(nodeId)) { + nodeToSlopData.put(nodeId, new HashMap()); + } + Map perNodeSlopMap = nodeToSlopData.get(nodeId); + perNodeSlopMap.put(slop.getKey(), slop.getValue()); + + if(logger.isTraceEnabled()) + logger.trace(slop); + } + } + + int errorCount = 0; + for(Map.Entry failedKey: nodeToFailedKeysMap.entries()) { + Integer nodeId = failedKey.getKey(); + ByteArray key = failedKey.getValue(); + byte[] expected = keyValues.get(key).get(); + + Integer id = keyList.indexOf(key); + + // check if map exist + Map perNodeSlopMap = nodeToSlopData.get(nodeId); + if(perNodeSlopMap == null) { + logger.error("Slop does not have key[" + key + "][id:" + id + "]"); + errorCount++; + continue; + } + + byte[] actual = perNodeSlopMap.get(key); + + if(actual == null) { + logger.error("Slop does not have key[" + key + "][nodeId:" + nodeId + "]"); + errorCount++; + } else if(ByteUtils.compare(actual, expected) != 0) { + logger.error("Slop key[" + key + "][nodeId:" + nodeId + + "] does not have the correct value"); + errorCount++; + } else { + logger.debug("Slop has key[" + key + "][nodeId:" + nodeId + "] with value[" + + actual + "]"); + } + } + assertTrue(errorCount + " Slop(s) incorrect. 
See log for more info", errorCount == 0); + } + + private Set makeSlopKeys(Multimap failedKeys, + Slop.Operation operation) { + Set slopKeys = Sets.newHashSet(); + + for(Map.Entry entry: failedKeys.entries()) { + byte[] opCode = new byte[] { operation.getOpCode() }; + byte[] spacer = new byte[] { (byte) 0 }; + byte[] storeName = ByteUtils.getBytes(STORE_NAME, "UTF-8"); + byte[] nodeIdBytes = new byte[ByteUtils.SIZE_OF_INT]; + ByteUtils.writeInt(nodeIdBytes, entry.getKey(), 0); + ByteArray slopKey = new ByteArray(ByteUtils.cat(opCode, + spacer, + storeName, + spacer, + nodeIdBytes, + spacer, + entry.getValue().get())); + slopKeys.add(slopKey); + } + return slopKeys; + } + + private static Set chooseFailedNodeSet(int count) { + // decide failed nodes + Set failedNodes = new HashSet(); + Random rand = new Random(); + while(failedNodes.size() < count && failedNodes.size() <= NUM_NODES_TOTAL) { + int n = rand.nextInt(NUM_NODES_TOTAL); + failedNodes.add(n); + } + + if(logger.isDebugEnabled()) { + logger.debug("Failing requests to " + failedNodes); + } + return failedNodes; + } + + private Multimap doBatchPut(Set failedNodeSet) { + for(Integer nodeId: failedNodeSet) { + forceFailStores.get(nodeId).setFail(true); + } + // put keys into the nodes + Multimap nodeToFailedKeys = ArrayListMultimap.create(); + for(ByteArray key: keysToNodes.keySet()) { + List nodes = strategy.routeRequest(key.get()); + List failedNodes = new ArrayList(); + for(Node node: nodes) { + if(failedNodeSet != null && failedNodeSet.contains(node.getId())) { + failedNodes.add(node); + } + } + // determine if write will succeed (if numGoodNodes > required) + if((nodes.size() - failedNodes.size()) >= R_WRITES) { + for(Node node: failedNodes) { + nodeToFailedKeys.put(node.getId(), key); + logger.debug("[key:" + key + "] to [nodeId:" + node.getId() + "] should fail"); + } + } else { + logger.debug("[key:" + key + "] should fail overall due to insufficient nodes"); + } + + try { + Versioned versioned = new Versioned(keyValues.get(key).get()); + if(logger.isTraceEnabled()) + logger.trace("PUT key [" + key + "] to store"); + routedStore.put(key, versioned, null); + } catch(Exception e) { + if(logger.isTraceEnabled()) + logger.trace(e, e); + } + } + return nodeToFailedKeys; + } +} diff --git a/test/unit/voldemort/store/routed/HintedHandoffTest.java b/test/unit/voldemort/store/routed/HintedHandoffTest.java index a2c5ee197a..59c4426c5c 100644 --- a/test/unit/voldemort/store/routed/HintedHandoffTest.java +++ b/test/unit/voldemort/store/routed/HintedHandoffTest.java @@ -1,7 +1,6 @@ package voldemort.store.routed; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static voldemort.VoldemortTestConstants.getNineNodeCluster; @@ -136,7 +135,7 @@ private StoreDefinition getStoreDef(String storeName, .setType(InMemoryStorageConfiguration.TYPE_NAME) .setKeySerializer(serDef) .setValueSerializer(serDef) - .setRoutingPolicy(RoutingTier.SERVER) + .setRoutingPolicy(RoutingTier.CLIENT) .setRoutingStrategyType(strategyType) .setReplicationFactor(replicationFactor) .setPreferredReads(preads) @@ -234,36 +233,6 @@ public void tearDown() throws Exception { routedStoreThreadPool.shutdown(); } - @Test - public void testHintedHandoff() throws Exception { - Set failedNodes = getFailedNodes(); - Multimap failedKeys = populateStore(failedNodes); - Thread.sleep(5000); - - Map dataInSlops = Maps.newHashMap(); - Set slopKeys = makeSlopKeys(failedKeys, Slop.Operation.PUT); - 
for(Store slopStore: slopStores.values()) { - Map>> res = slopStore.getAll(slopKeys, null); - for(Map.Entry>> entry: res.entrySet()) { - Slop slop = entry.getValue().get(0).getValue(); - dataInSlops.put(slop.getKey(), slop.getValue()); - - if(logger.isTraceEnabled()) - logger.trace(slop); - } - } - - for(Map.Entry failedKey: failedKeys.entries()) { - byte[] expected = keyValues.get(failedKey.getValue()).get(); - byte[] actual = dataInSlops.get(failedKey.getValue()); - - assertNotNull("data should be stored in the slop for key = " + failedKey.getValue(), - actual); - assertEquals("correct should be stored in slop", 0, ByteUtils.compare(actual, expected)); - } - - } - private Set makeSlopKeys(Multimap failedKeys, Slop.Operation operation) { Set slopKeys = Sets.newHashSet(); diff --git a/test/unit/voldemort/store/routed/ReadRepairerTest.java b/test/unit/voldemort/store/routed/ReadRepairerTest.java index 343d277a83..b1f48ce155 100644 --- a/test/unit/voldemort/store/routed/ReadRepairerTest.java +++ b/test/unit/voldemort/store/routed/ReadRepairerTest.java @@ -292,6 +292,16 @@ public void testLotsOfVersions() throws Exception { getValue(6, 1, new int[] { 3, 3 }))); } + @Test + public void testConcurrentVersionsDoNotResultInRepairs() throws Exception { + List> emptyExpectedList = new ArrayList>(); + assertVariationsEqual(emptyExpectedList, + asList(getValue(1, 1, new int[] { 1, 1, 2, 2, 2, 2, 3, 3 }), + getValue(1, 1, new int[] { 1, 1, 1, 1, 2, 2, 3, 3 }), + getValue(2, 1, new int[] { 1, 1, 2, 2, 2, 2, 3, 3 }), + getValue(2, 1, new int[] { 1, 1, 1, 1, 2, 2, 3, 3 }))); + } + /** * Test the equality with a few variations on ordering * diff --git a/test/unit/voldemort/utils/ClusterForkLiftToolTest.java b/test/unit/voldemort/utils/ClusterForkLiftToolTest.java index be59902376..71822cdccd 100644 --- a/test/unit/voldemort/utils/ClusterForkLiftToolTest.java +++ b/test/unit/voldemort/utils/ClusterForkLiftToolTest.java @@ -6,6 +6,7 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Properties; @@ -23,6 +24,7 @@ import voldemort.client.protocol.admin.AdminClientConfig; import voldemort.cluster.Cluster; import voldemort.cluster.Node; +import voldemort.routing.StoreRoutingPlan; import voldemort.server.VoldemortServer; import voldemort.store.StoreDefinition; import voldemort.store.StoreUtils; @@ -41,6 +43,7 @@ public class ClusterForkLiftToolTest { final static String STORES_XML = "test/common/voldemort/config/two-stores-replicated.xml"; final static String PRIMARY_RESOLVING_STORE_NAME = "test"; final static String GLOBALLY_RESOLVING_STORE_NAME = "best"; + final static String MULTIPLE_VERSIONS_STORE_NAME = "no-res"; private String srcBootStrapUrl; private String dstBootStrapUrl; @@ -51,6 +54,7 @@ public class ClusterForkLiftToolTest { private StoreDefinition primaryResolvingStoreDef; private StoreDefinition globallyResolvingStoreDef; + private StoreDefinition nonResolvingStoreDef; private HashMap kvPairs; private String firstKey; @@ -124,6 +128,8 @@ public void setUpClusters() { globallyResolvingStoreDef = StoreUtils.getStoreDef(storeDefs, GLOBALLY_RESOLVING_STORE_NAME); + nonResolvingStoreDef = StoreUtils.getStoreDef(storeDefs, MULTIPLE_VERSIONS_STORE_NAME); + srcfactory = new SocketStoreClientFactory(new ClientConfig().setBootstrapUrls(srcBootStrapUrl) .setSelectors(1) .setRoutingTimeout(1000, @@ -157,7 +163,8 @@ public void setUpClusters() { @Test public void testPrimaryResolvingForkLift() 
throws Exception { - StoreInstance srcStoreInstance = new StoreInstance(srcCluster, primaryResolvingStoreDef); + StoreRoutingPlan srcStoreInstance = new StoreRoutingPlan(srcCluster, + primaryResolvingStoreDef); // populate data on the source cluster.. for(Map.Entry entry: kvPairs.entrySet()) { @@ -192,7 +199,7 @@ public void testPrimaryResolvingForkLift() throws Exception { 1000, Lists.newArrayList(PRIMARY_RESOLVING_STORE_NAME), null, - false); + ClusterForkLiftTool.ForkLiftTaskMode.primary_resolution); forkLiftTool.run(); // do a write to destination cluster @@ -223,7 +230,8 @@ public void testPrimaryResolvingForkLift() throws Exception { @Test public void testGloballyResolvingForkLift() throws Exception { - StoreInstance srcStoreInstance = new StoreInstance(srcCluster, globallyResolvingStoreDef); + StoreRoutingPlan srcStoreInstance = new StoreRoutingPlan(srcCluster, + globallyResolvingStoreDef); // populate data on the source cluster.. for(Map.Entry entry: kvPairs.entrySet()) { @@ -258,7 +266,7 @@ public void testGloballyResolvingForkLift() throws Exception { 1000, Lists.newArrayList(GLOBALLY_RESOLVING_STORE_NAME), null, - true); + ClusterForkLiftTool.ForkLiftTaskMode.global_resolution); forkLiftTool.run(); // do a write to destination cluster @@ -286,6 +294,65 @@ public void testGloballyResolvingForkLift() throws Exception { } } + @Test + public void testNoresolutionForkLift() throws Exception { + + int versions = 0; + + StoreRoutingPlan srcStoreInstance = new StoreRoutingPlan(srcCluster, nonResolvingStoreDef); + + // generate a conflict on the master partition + int masterNode = srcStoreInstance.getNodeIdForPartitionId(srcStoreInstance.getMasterPartitionId(conflictKey.getBytes("UTF-8"))); + VectorClock losingClock = new VectorClock(Lists.newArrayList(new ClockEntry((short) 0, 5)), + System.currentTimeMillis()); + VectorClock winningClock = new VectorClock(Lists.newArrayList(new ClockEntry((short) 1, 5)), + losingClock.getTimestamp() + 1); + srcAdminClient.storeOps.putNodeKeyValue(MULTIPLE_VERSIONS_STORE_NAME, + new NodeValue(masterNode, + new ByteArray(conflictKey.getBytes("UTF-8")), + new Versioned("losing value".getBytes("UTF-8"), + losingClock))); + srcAdminClient.storeOps.putNodeKeyValue(MULTIPLE_VERSIONS_STORE_NAME, + new NodeValue(masterNode, + new ByteArray(conflictKey.getBytes("UTF-8")), + new Versioned("winning value".getBytes("UTF-8"), + winningClock))); + // perform the forklifting.. 
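// Illustrative note: in no_resolution mode the forklift tool is expected to
// stream every version it fetches straight to the destination, without
// applying any conflict resolver. The concurrent "losing value" / "winning
// value" pair written above should therefore both survive the copy; the
// assertion at the end of this test counts exactly two versions.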
+ ClusterForkLiftTool forkLiftTool = new ClusterForkLiftTool(srcBootStrapUrl, + dstBootStrapUrl, + 10000, + 1, + 1000, + Lists.newArrayList(MULTIPLE_VERSIONS_STORE_NAME), + null, + ClusterForkLiftTool.ForkLiftTaskMode.no_resolution); + forkLiftTool.run(); + + AdminClient dstAdminClient = new AdminClient(dstBootStrapUrl, + new AdminClientConfig(), + new ClientConfig()); + + for(Node node: dstAdminClient.getAdminClientCluster().getNodes()) { + + Iterator>> entryItr = dstAdminClient.bulkFetchOps.fetchEntries(node.getId(), + MULTIPLE_VERSIONS_STORE_NAME, + node.getPartitionIds(), + null, + true); + + while(entryItr.hasNext()) { + Pair> record = entryItr.next(); + ByteArray key = record.getFirst(); + Versioned versioned = record.getSecond(); + versions++; + + } + + } + assertEquals("Both conflicting versions present", versions, 2); + + } + @After public void tearDownClusters() { diff --git a/test/unit/voldemort/versioning/VectorClockTest.java b/test/unit/voldemort/versioning/VectorClockTest.java index c5c53e6e6f..b48ea4b1cb 100644 --- a/test/unit/voldemort/versioning/VectorClockTest.java +++ b/test/unit/voldemort/versioning/VectorClockTest.java @@ -16,8 +16,13 @@ package voldemort.versioning; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import static voldemort.TestUtils.getClock; -import junit.framework.TestCase; + +import org.junit.Test; + import voldemort.TestUtils; import com.google.common.collect.Lists; @@ -27,8 +32,10 @@ * * */ -public class VectorClockTest extends TestCase { +@SuppressWarnings("deprecation") +public class VectorClockTest { + @Test public void testEqualsAndHashcode() { VectorClock one = getClock(1, 2); VectorClock other = getClock(1, 2); @@ -36,6 +43,7 @@ public void testEqualsAndHashcode() { assertEquals(one.hashCode(), other.hashCode()); } + @Test public void testComparisons() { assertTrue("The empty clock should not happen before itself.", getClock().compare(getClock()) != Occurred.CONCURRENTLY); @@ -47,10 +55,13 @@ public void testComparisons() { getClock(1).compare(getClock(2)) == Occurred.CONCURRENTLY); assertTrue("Clocks with different events should be concurrent.", getClock(1, 1, 2).compare(getClock(1, 1, 3)) == Occurred.CONCURRENTLY); + assertTrue("Clocks with different events should be concurrent.", + getClock(1, 2, 3, 3).compare(getClock(1, 1, 2, 3)) == Occurred.CONCURRENTLY); assertTrue(getClock(2, 2).compare(getClock(1, 2, 2, 3)) == Occurred.BEFORE && getClock(1, 2, 2, 3).compare(getClock(2, 2)) == Occurred.AFTER); } + @Test public void testMerge() { // merging two clocks should create a clock contain the element-wise // maximums @@ -74,6 +85,7 @@ public void testMerge() { * See gihub issue #25: Incorrect coersion of version to short before * passing to ClockEntry constructor */ + @Test public void testMergeWithLargeVersion() { VectorClock clock1 = getClock(1); VectorClock clock2 = new VectorClock(Lists.newArrayList(new ClockEntry((short) 1, @@ -83,6 +95,7 @@ public void testMergeWithLargeVersion() { assertEquals(mergedClock.getMaxVersion(), Short.MAX_VALUE + 1); } + @Test public void testSerialization() { assertEquals("The empty clock serializes incorrectly.", getClock(), @@ -93,6 +106,42 @@ public void testSerialization() { new VectorClock(clock.toBytes())); } + @Test + public void testSerializationBackwardCompatibility() { + assertEquals("The empty clock serializes incorrectly.", + getClock(), + new VectorClock(getClock().toBytes())); + VectorClock clock = getClock(1, 1, 2, 
3, 4, 4, 6);
+        // Old Vector Clock would serialize to this:
+        // 0 5 1 0 1 2 0 2 1 0 3 1 0 4 2 0 6 1 [timestamp]
+        byte[] knownSerializedHead = { 0, 5, 1, 0, 1, 2, 0, 2, 1, 0, 3, 1, 0, 4, 2, 0, 6, 1 };
+        byte[] serialized = clock.toBytes();
+        for(int index = 0; index < knownSerializedHead.length; index++) {
+            assertEquals("byte at index " + index + " is not equal",
+                         knownSerializedHead[index],
+                         serialized[index]);
+        }
+    }
+
+    /**
+     * Pre-condition: the timestamp is ignored when determining vector clock
+     * equality
+     */
+    @Test
+    public void testDeserializationBackwardCompatibility() {
+        assertEquals("The empty clock serializes incorrectly.",
+                     getClock(),
+                     new VectorClock(getClock().toBytes()));
+        VectorClock clock = getClock(1, 1, 2, 3, 4, 4, 6);
+        // Old Vector Clock would serialize to this:
+        // 0 5; 1; 0 1, 2; 0 2, 1; 0 3, 1; 0 4, 2; 0 6, 1; [timestamp=random]
+        byte[] knownSerialized = { 0, 5, 1, 0, 1, 2, 0, 2, 1, 0, 3, 1, 0, 4, 2, 0, 6, 1, 0, 0, 1,
+                0x3e, 0x7b, (byte) 0x8c, (byte) 0x9d, 0x19 };
+        assertEquals("vector clock does not deserialize correctly on given byte array",
+                     clock,
+                     new VectorClock(knownSerialized));
+    }
+
+    @Test
     public void testSerializationWraps() {
         VectorClock clock = getClock(1, 1, 2, 3, 3, 6);
         for(int i = 0; i < 300; i++)
@@ -100,6 +149,7 @@
             clock.incrementVersion(2, System.currentTimeMillis());
         assertEquals("Clock does not serialize to itself.", clock, new VectorClock(clock.toBytes()));
     }

+    @Test
     public void testIncrementOrderDoesntMatter() {
         // Clocks should have the property that no matter what order the
         // increment operations are done in the resulting clocks are equal
@@ -119,6 +169,7 @@
         }
     }

+    @Test
     public void testIncrementAndSerialize() {
         int node = 1;
         VectorClock vc = getClock(node);
@@ -132,4 +183,37 @@
         assertEquals(increments + 1, vc.getMaxVersion());
     }

+    /**
+     * A test for comparing vector clocks whose clock entries are not sorted by
+     * node id. Entries could historically be inserted without using
+     * increment(); that path is deprecated, and getEntries() should now return
+     * an unmodifiable list.
+     */
+    @Test
+    public void testNodeClockEntryDeprecate() {
+        VectorClock vc1 = new VectorClock();
+        try {
+            vc1.getEntries().add(new ClockEntry((short) 2, 2));
+            fail("Did not throw UnsupportedOperationException");
+        } catch(UnsupportedOperationException e) {
+
+        }
+    }
+
+    @Test
+    public void testVersion0NotAcceptable() {
+        try {
+            ClockEntry clockEntry = new ClockEntry();
+            clockEntry.setVersion(0);
+            fail("Did not throw IllegalArgumentException");
+        } catch(IllegalArgumentException e) {}
+    }
+
+    @Test
+    public void testNodeLess0NotAcceptable() {
+        try {
+            ClockEntry clockEntry = new ClockEntry();
+            clockEntry.setNodeId((short) -1);
+            fail("Did not throw IllegalArgumentException");
+        } catch(IllegalArgumentException e) {}
+    }
 }
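The two backward-compatibility tests above pin down the legacy VectorClock wire layout: a 2-byte entry count, a 1-byte version size, one (nodeId as a short, version in versionSize bytes) pair per entry, and an 8-byte timestamp that equality ignores. A minimal sketch of that layout, assuming a 1-byte version field as in the test data (the encode() helper and class name are illustrative, not Voldemort API):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;

public class LegacyClockLayoutSketch {

    // Encodes (nodeId, version) pairs in the legacy layout with versionSize = 1.
    static byte[] encode(short[][] entries, long timestamp) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        out.writeShort(entries.length); // number of clock entries, big-endian short
        out.writeByte(1);               // bytes used per version
        for(short[] e: entries) {
            out.writeShort(e[0]);       // node id
            out.writeByte(e[1]);        // version, truncated to one byte
        }
        out.writeLong(timestamp);       // wall-clock time; ignored by equals()
        return bytes.toByteArray();
    }

    public static void main(String[] args) throws IOException {
        // getClock(1, 1, 2, 3, 4, 4, 6) yields entries (1,2) (2,1) (3,1) (4,2) (6,1)
        short[][] entries = { { 1, 2 }, { 2, 1 }, { 3, 1 }, { 4, 2 }, { 6, 1 } };
        byte[] expectedHead = { 0, 5, 1, 0, 1, 2, 0, 2, 1, 0, 3, 1, 0, 4, 2, 0, 6, 1 };
        byte[] head = Arrays.copyOf(encode(entries, 0L), expectedHead.length);
        System.out.println(Arrays.equals(expectedHead, head)); // prints: true
    }
}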
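On the same theme, makeSlopKeys() in HintedHandoffSendHintTest above shows how slop keys are assembled: the slop operation's opcode, the store name, the destination node id and the original key, separated by zero bytes. A rough sketch of that concatenation (class and method names are illustrative; the node id is written big-endian, which is what this sketch assumes ByteUtils.writeInt does):

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

public class SlopKeySketch {

    // Builds opCode | 0 | storeName | 0 | nodeId (4 bytes) | 0 | key.
    static byte[] slopKey(byte opCode, String storeName, int nodeId, byte[] key) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        out.write(opCode);                        // Slop.Operation opcode, e.g. PUT
        out.write(0);                             // spacer
        byte[] store = storeName.getBytes(StandardCharsets.UTF_8);
        out.write(store, 0, store.length);        // store name
        out.write(0);                             // spacer
        for(int shift = 24; shift >= 0; shift -= 8)
            out.write((nodeId >>> shift) & 0xFF); // node id, most significant byte first
        out.write(0);                             // spacer
        out.write(key, 0, key.length);            // the original key bytes
        return out.toByteArray();
    }

    public static void main(String[] args) {
        byte[] k = slopKey((byte) 1, "test", 3, new byte[] { 42 });
        System.out.println(k.length); // 1 + 1 + 4 + 1 + 4 + 1 + 1 = 13
    }
}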