forked from voldemort/voldemort
-
Notifications
You must be signed in to change notification settings - Fork 0
/
AdminClient.java
2579 lines (2355 loc) · 130 KB
/
AdminClient.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright 2008-2009 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package voldemort.client.protocol.admin;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.StringReader;
import java.net.Socket;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import voldemort.VoldemortException;
import voldemort.client.ClientConfig;
import voldemort.client.SocketStoreClientFactory;
import voldemort.client.SystemStore;
import voldemort.client.protocol.RequestFormatType;
import voldemort.client.protocol.VoldemortFilter;
import voldemort.client.protocol.pb.ProtoUtils;
import voldemort.client.protocol.pb.VAdminProto;
import voldemort.client.protocol.pb.VAdminProto.RebalancePartitionInfoMap;
import voldemort.client.protocol.pb.VProto;
import voldemort.client.protocol.pb.VProto.RequestType;
import voldemort.client.rebalance.RebalancePartitionsInfo;
import voldemort.cluster.Cluster;
import voldemort.cluster.Node;
import voldemort.routing.RoutingStrategy;
import voldemort.routing.RoutingStrategyFactory;
import voldemort.server.protocol.admin.AsyncOperationStatus;
import voldemort.server.rebalance.RebalancerState;
import voldemort.server.rebalance.VoldemortRebalancingException;
import voldemort.store.ErrorCodeMapper;
import voldemort.store.StoreDefinition;
import voldemort.store.metadata.MetadataStore;
import voldemort.store.metadata.MetadataStore.VoldemortState;
import voldemort.store.mysql.MysqlStorageConfiguration;
import voldemort.store.readonly.ReadOnlyStorageConfiguration;
import voldemort.store.readonly.ReadOnlyStorageFormat;
import voldemort.store.readonly.ReadOnlyStorageMetadata;
import voldemort.store.readonly.ReadOnlyUtils;
import voldemort.store.slop.Slop;
import voldemort.store.slop.Slop.Operation;
import voldemort.store.socket.SocketDestination;
import voldemort.store.system.SystemStoreConstants;
import voldemort.store.views.ViewStorageConfiguration;
import voldemort.utils.ByteArray;
import voldemort.utils.ByteUtils;
import voldemort.utils.MetadataVersionStoreUtils;
import voldemort.utils.NetworkClassLoader;
import voldemort.utils.Pair;
import voldemort.utils.RebalanceUtils;
import voldemort.utils.Utils;
import voldemort.versioning.VectorClock;
import voldemort.versioning.Version;
import voldemort.versioning.Versioned;
import voldemort.xml.ClusterMapper;
import voldemort.xml.StoreDefinitionsMapper;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.protobuf.ByteString;
import com.google.protobuf.Message;
/**
* AdminClient is intended for administrative functionality that is useful and
* often needed, but should be used sparingly (if at all) at the application
* level.
* <p>
* Some of the uses of AdminClient include
* <ul>
* <li>Extraction of data for backups</li>
* <li>Extraction of all keys</li>
* <li>Bulk loading entries</li>
* <li>Migrating partitions</li>
* <li>Get/Update metadata info from selective Nodes</li>
* <li>Used extensively by rebalancing (dynamic node addition/deletion) feature
* (presently in development).</li>
* </ul>
*
*/
public class AdminClient {

    private static final Logger logger = Logger.getLogger(AdminClient.class);

    // Maps server-side error codes back to exception types.
    private final ErrorCodeMapper errorMapper;
    // Pool of admin-protocol sockets, one destination per (host, adminPort).
    private final SocketPool pool;
    // Loads filter classes shipped over the network by value.
    private final NetworkClassLoader networkClassLoader;
    private static final ClusterMapper clusterMapper = new ClusterMapper();
    private static final StoreDefinitionsMapper storeMapper = new StoreDefinitionsMapper();
    // Parameters for exponential back off
    private static final long INITIAL_DELAY = 250; // Initial delay
    // Emit a progress log line every PRINT_STATS_THRESHOLD records or every
    // PRINT_STATS_INTERVAL milliseconds, whichever comes first.
    private static final long PRINT_STATS_THRESHOLD = 10000;
    private static final long PRINT_STATS_INTERVAL = 5 * 60 * 1000; // 5 minutes
    private final AdminClientConfig adminClientConfig;
    private static final String CLUSTER_VERSION_KEY = "cluster.xml";
    private static final int DEFAULT_ZONE_ID = 0;
    // Storage engines that cannot be restored from replicas (see
    // restoreDataFromReplications, which skips stores of these types).
    public final static List<String> restoreStoreEngineBlackList = Arrays.asList(MysqlStorageConfiguration.TYPE_NAME,
                                                                                ReadOnlyStorageConfiguration.TYPE_NAME,
                                                                                ViewStorageConfiguration.TYPE_NAME);
    // Cluster metadata this client currently operates against.
    private Cluster currentCluster;
    // Best-effort client for the metadata-version system store; may stay
    // null if initialization fails (see initSystemStoreClient).
    private SystemStore<String, String> sysStoreVersion = null;
    /**
     * Create an instance of AdminClient given a URL of a node in the cluster.
     * The bootstrap URL is used to get the cluster metadata.
     *
     * @param bootstrapURL URL pointing to the bootstrap node
     * @param adminClientConfig Configuration for AdminClient specifying client
     *        parameters, e.g.
     *        <ul>
     *        <li>number of threads</li>
     *        <li>number of sockets per node</li>
     *        <li>socket buffer size</li>
     *        </ul>
     */
    public AdminClient(String bootstrapURL, AdminClientConfig adminClientConfig) {
        // Bootstrap the cluster metadata first; everything else depends on it.
        this.currentCluster = getClusterFromBootstrapURL(bootstrapURL);
        this.errorMapper = new ErrorCodeMapper();
        this.pool = createSocketPool(adminClientConfig);
        this.networkClassLoader = new NetworkClassLoader(Thread.currentThread()
                                                               .getContextClassLoader());
        this.adminClientConfig = adminClientConfig;
        // System store client is best-effort; uses the default zone.
        initSystemStoreClient(bootstrapURL, DEFAULT_ZONE_ID);
    }
    /**
     * Create an instance of AdminClient given a {@link Cluster} object.
     *
     * @param cluster Initialized cluster object, describing the nodes we wish
     *        to contact
     * @param adminClientConfig Configuration for AdminClient specifying client
     *        parameters, e.g.
     *        <ul>
     *        <li>number of threads</li>
     *        <li>number of sockets per node</li>
     *        <li>socket buffer size</li>
     *        </ul>
     */
    public AdminClient(Cluster cluster, AdminClientConfig adminClientConfig) {
        this.currentCluster = cluster;
        this.errorMapper = new ErrorCodeMapper();
        this.pool = createSocketPool(adminClientConfig);
        this.networkClassLoader = new NetworkClassLoader(Thread.currentThread()
                                                               .getContextClassLoader());
        this.adminClientConfig = adminClientConfig;
        // NOTE(review): assumes the cluster contains a node with id 0 and
        // uses it as the bootstrap node for the system store client — this
        // will fail for clusters without a node 0; confirm callers always
        // pass such a cluster.
        Node node = cluster.getNodeById(0);
        String bootstrapURL = "tcp://" + node.getHost() + ":" + node.getSocketPort();
        initSystemStoreClient(bootstrapURL, DEFAULT_ZONE_ID);
    }
    /**
     * Wrapper for the actual AdminClient constructor given the URL of a node in
     * the cluster.
     *
     * @param bootstrapURL URL pointing to the bootstrap node
     * @param adminClientConfig Configuration for AdminClient specifying client
     *        parameters, e.g.
     *        <ul>
     *        <li>number of threads</li>
     *        <li>number of sockets per node</li>
     *        <li>socket buffer size</li>
     *        </ul>
     * @param zoneID The primary Zone ID for the purpose of the SystemStore
     */
    public AdminClient(String bootstrapURL, AdminClientConfig adminClientConfig, int zoneID) {
        // NOTE(review): the delegated constructor already creates a system
        // store client with DEFAULT_ZONE_ID; the call below immediately
        // replaces it with one for the requested zone, so the first client
        // is constructed and discarded.
        this(bootstrapURL, adminClientConfig);
        initSystemStoreClient(bootstrapURL, zoneID);
    }
private void initSystemStoreClient(String bootstrapURL, int zoneID) {
String[] bootstrapUrls = new String[1];
bootstrapUrls[0] = bootstrapURL;
try {
this.sysStoreVersion = new SystemStore<String, String>(SystemStoreConstants.SystemStoreName.voldsys$_metadata_version_persistence.name(),
bootstrapUrls,
zoneID);
} catch(Exception e) {
logger.debug("Error while creating a system store client for metadata version store.");
}
}
/**
* Increment the metadata version for the given key (cluster or store)
*
* @param versionKey The metadata key for which Version should be
* incremented
*/
public void updateMetadataversion(String versionKey) {
Properties props = MetadataVersionStoreUtils.getProperties(this.sysStoreVersion);
if(props != null && props.getProperty(versionKey) != null) {
logger.debug("Version obtained = " + props.getProperty(versionKey));
long newValue = Long.parseLong(props.getProperty(versionKey)) + 1;
props.setProperty(versionKey, Long.toString(newValue));
} else {
logger.debug("Current version is null. Assuming version 0.");
if(props == null) {
props = new Properties();
}
props.setProperty(versionKey, "0");
}
MetadataVersionStoreUtils.setProperties(this.sysStoreVersion, props);
}
    /**
     * Set the metadata versions to the given set.
     *
     * @param newProperties The new metadata versions to be set across all the
     *        nodes in the cluster
     */
    public void setMetadataversion(Properties newProperties) {
        // Thin delegate: writes the whole property set to the
        // metadata-version system store.
        MetadataVersionStoreUtils.setProperties(this.sysStoreVersion, newProperties);
    }
private Cluster getClusterFromBootstrapURL(String bootstrapURL) {
ClientConfig config = new ClientConfig();
// try to bootstrap metadata from bootstrapUrl
config.setBootstrapUrls(bootstrapURL);
SocketStoreClientFactory factory = new SocketStoreClientFactory(config);
// get Cluster from bootStrapUrl
String clusterXml = factory.bootstrapMetadataWithRetries(MetadataStore.CLUSTER_KEY,
factory.validateUrls(config.getBootstrapUrls()));
// release all threads/sockets hold by the factory.
factory.close();
return clusterMapper.readCluster(new StringReader(clusterXml), false);
}
private SocketPool createSocketPool(AdminClientConfig config) {
TimeUnit unit = TimeUnit.SECONDS;
return new SocketPool(config.getMaxConnectionsPerNode(),
(int) unit.toMillis(config.getAdminConnectionTimeoutSec()),
(int) unit.toMillis(config.getAdminSocketTimeoutSec()),
config.getAdminSocketBufferSize(),
config.getAdminSocketKeepAlive());
}
    /**
     * Sends a single admin-protocol request to the given node and blocks
     * until the response has been read into {@code builder}.
     *
     * @param nodeId id of the node to talk to
     * @param message the request message to write
     * @param builder builder the response is merged into
     * @return the same {@code builder}, populated with the response
     * @throws VoldemortException wrapping any {@link IOException}
     */
    private <T extends Message.Builder> T sendAndReceive(int nodeId, Message message, T builder) {
        Node node = this.getAdminClientCluster().getNodeById(nodeId);
        SocketDestination destination = new SocketDestination(node.getHost(),
                                                              node.getAdminPort(),
                                                              RequestFormatType.ADMIN_PROTOCOL_BUFFERS);
        SocketAndStreams sands = pool.checkout(destination);
        try {
            DataOutputStream outputStream = sands.getOutputStream();
            DataInputStream inputStream = sands.getInputStream();
            ProtoUtils.writeMessage(outputStream, message);
            outputStream.flush();
            return ProtoUtils.readToBuilder(inputStream, builder);
        } catch(IOException e) {
            // Close the possibly-corrupt socket before rethrowing; the
            // finally block still checks it back in (presumably the pool
            // detects the closed socket — confirm pool semantics).
            close(sands.getSocket());
            throw new VoldemortException(e);
        } finally {
            pool.checkin(destination, sands);
        }
    }
/**
* Update a stream of key/value entries at the given node. The iterator
* entries are <em>streamed</em> from the client to the server:
* <ol>
* <li>Client performs a handshake with the server (sending in the update
* entries request with a store name and a {@link VoldemortFilter} instance.
* </li>
* <li>While entryIterator has entries, the client will keep sending the
* updates one after another to the server, buffering the data, without
* waiting for a response from the server.</li>
* <li>After iteration is complete, send an end of stream message, force a
* flush of the buffer, check the response on the server to check if a
* {@link VoldemortException} has occured.</li>
* </ol>
*
* @param nodeId Id of the remote node (where we wish to update the entries)
* @param storeName Store name for the entries
* @param entryIterator Iterator of key-value pairs for the entries
* @param filter Custom filter implementation to filter out entries which
* should not be updated.
* @throws VoldemortException
*/
public void updateEntries(int nodeId,
String storeName,
Iterator<Pair<ByteArray, Versioned<byte[]>>> entryIterator,
VoldemortFilter filter) {
Node node = this.getAdminClientCluster().getNodeById(nodeId);
SocketDestination destination = new SocketDestination(node.getHost(),
node.getAdminPort(),
RequestFormatType.ADMIN_PROTOCOL_BUFFERS);
SocketAndStreams sands = pool.checkout(destination);
DataOutputStream outputStream = sands.getOutputStream();
DataInputStream inputStream = sands.getInputStream();
boolean firstMessage = true;
long printStatsTimer = System.currentTimeMillis() + PRINT_STATS_INTERVAL;
long entryCount = 0;
try {
if(entryIterator.hasNext()) {
while(entryIterator.hasNext()) {
Pair<ByteArray, Versioned<byte[]>> entry = entryIterator.next();
VAdminProto.PartitionEntry partitionEntry = VAdminProto.PartitionEntry.newBuilder()
.setKey(ProtoUtils.encodeBytes(entry.getFirst()))
.setVersioned(ProtoUtils.encodeVersioned(entry.getSecond()))
.build();
VAdminProto.UpdatePartitionEntriesRequest.Builder updateRequest = VAdminProto.UpdatePartitionEntriesRequest.newBuilder()
.setStore(storeName)
.setPartitionEntry(partitionEntry);
entryCount++;
if(firstMessage) {
if(filter != null) {
updateRequest.setFilter(encodeFilter(filter));
}
ProtoUtils.writeMessage(outputStream,
VAdminProto.VoldemortAdminRequest.newBuilder()
.setType(VAdminProto.AdminRequestType.UPDATE_PARTITION_ENTRIES)
.setUpdatePartitionEntries(updateRequest)
.build());
outputStream.flush();
firstMessage = false;
} else {
ProtoUtils.writeMessage(outputStream, updateRequest.build());
if(printStatsTimer <= System.currentTimeMillis()
|| 0 == entryCount % PRINT_STATS_THRESHOLD) {
logger.info("UpdatePartitionEntries: fetched " + entryCount
+ " to node " + nodeId + " for store " + storeName);
printStatsTimer = System.currentTimeMillis() + PRINT_STATS_INTERVAL;
}
}
}
ProtoUtils.writeEndOfStream(outputStream);
outputStream.flush();
VAdminProto.UpdatePartitionEntriesResponse.Builder updateResponse = ProtoUtils.readToBuilder(inputStream,
VAdminProto.UpdatePartitionEntriesResponse.newBuilder());
if(updateResponse.hasError()) {
throwException(updateResponse.getError());
}
}
} catch(IOException e) {
close(sands.getSocket());
throw new VoldemortException(e);
} finally {
pool.checkin(destination, sands);
}
}
    /**
     * Writes and flushes a FETCH_PARTITION_ENTRIES handshake request on the
     * given stream. Used by both {@code fetchEntries} and {@code fetchKeys}
     * ({@code fetchValues} selects between the two).
     *
     * @param outputStream stream to the node's admin port
     * @param storeName store whose data will be fetched
     * @param replicaToPartitionList mapping of replica type to partition list
     * @param filter optional server-side entry filter (may be null)
     * @param fetchValues true to fetch key/value entries, false for keys only
     * @param fetchMasterEntries if true, restrict to replica type 0 (primary)
     * @param initialCluster optional cluster metadata to fetch against (may
     *        be null to use the server's current metadata)
     * @param skipRecords number of records to skip
     * @throws IOException on stream write failure
     */
    private void initiateFetchRequest(DataOutputStream outputStream,
                                      String storeName,
                                      HashMap<Integer, List<Integer>> replicaToPartitionList,
                                      VoldemortFilter filter,
                                      boolean fetchValues,
                                      boolean fetchMasterEntries,
                                      Cluster initialCluster,
                                      long skipRecords) throws IOException {
        HashMap<Integer, List<Integer>> filteredReplicaToPartitionList = Maps.newHashMap();
        if(fetchMasterEntries) {
            // Only the primary replica (type 0) partitions are kept.
            if(!replicaToPartitionList.containsKey(0)) {
                throw new VoldemortException("Could not find any partitions for primary replica type");
            } else {
                filteredReplicaToPartitionList.put(0, replicaToPartitionList.get(0));
            }
        } else {
            filteredReplicaToPartitionList.putAll(replicaToPartitionList);
        }
        VAdminProto.FetchPartitionEntriesRequest.Builder fetchRequest = VAdminProto.FetchPartitionEntriesRequest.newBuilder()
                                                                                                                .setFetchValues(fetchValues)
                                                                                                                .addAllReplicaToPartition(ProtoUtils.encodePartitionTuple(filteredReplicaToPartitionList))
                                                                                                                .setStore(storeName)
                                                                                                                .setSkipRecords(skipRecords);
        try {
            if(filter != null) {
                fetchRequest.setFilter(encodeFilter(filter));
            }
        } catch(IOException e) {
            // Filter serialization failures are surfaced as VoldemortException
            // rather than propagated as IOException.
            throw new VoldemortException(e);
        }
        if(initialCluster != null) {
            fetchRequest.setInitialCluster(new ClusterMapper().writeCluster(initialCluster));
        }
        VAdminProto.VoldemortAdminRequest request = VAdminProto.VoldemortAdminRequest.newBuilder()
                                                                                     .setType(VAdminProto.AdminRequestType.FETCH_PARTITION_ENTRIES)
                                                                                     .setFetchPartitionEntries(fetchRequest)
                                                                                     .build();
        ProtoUtils.writeMessage(outputStream, request);
        outputStream.flush();
    }
private VAdminProto.FetchPartitionEntriesResponse responseFromStream(DataInputStream inputStream,
int size)
throws IOException {
byte[] input = new byte[size];
ByteUtils.read(inputStream, input);
VAdminProto.FetchPartitionEntriesResponse.Builder response = VAdminProto.FetchPartitionEntriesResponse.newBuilder();
response.mergeFrom(input);
return response.build();
}
/**
* Legacy interface for fetching entries. See
* {@link AdminClient#fetchEntries(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)}
* for more information.
*
* @param nodeId Id of the node to fetch from
* @param storeName Name of the store
* @param partitionList List of the partitions
* @param filter Custom filter implementation to filter out entries which
* should not be fetched.
* @param fetchMasterEntries Fetch an entry only if master replica
* @param skipRecords Number of records to skip
* @return An iterator which allows entries to be streamed as they're being
* iterated over.
*/
public Iterator<Pair<ByteArray, Versioned<byte[]>>> fetchEntries(int nodeId,
String storeName,
List<Integer> partitionList,
VoldemortFilter filter,
boolean fetchMasterEntries,
long skipRecords) {
return fetchEntries(nodeId,
storeName,
getReplicaToPartitionMap(nodeId, storeName, partitionList),
filter,
fetchMasterEntries,
null,
skipRecords);
}
    /**
     * Legacy interface for fetching entries. Equivalent to the
     * skip-records overload with {@code skipRecords = 0}. See
     * {@link AdminClient#fetchEntries(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)}
     * for more information.
     *
     * @param nodeId Id of the node to fetch from
     * @param storeName Name of the store
     * @param partitionList List of the partitions
     * @param filter Custom filter implementation to filter out entries which
     *        should not be fetched.
     * @param fetchMasterEntries Fetch an entry only if master replica
     * @return An iterator which allows entries to be streamed as they're being
     *         iterated over.
     */
    public Iterator<Pair<ByteArray, Versioned<byte[]>>> fetchEntries(int nodeId,
                                                                     String storeName,
                                                                     List<Integer> partitionList,
                                                                     VoldemortFilter filter,
                                                                     boolean fetchMasterEntries) {
        return fetchEntries(nodeId, storeName, partitionList, filter, fetchMasterEntries, 0);
    }
    /**
     * Fetch key/value tuples belonging to this map of replica type to partition
     * list
     * <p>
     *
     * <b>Streaming API</b> - The server keeps sending the messages as it's
     * iterating over the data. Once iteration has finished, the server sends an
     * "end of stream" marker and flushes its buffer. A response indicating a
     * {@link VoldemortException} may be sent at any time during the process.
     * <br>
     *
     * <p>
     * Entries are being streamed <em>as the iteration happens</em> i.e. the
     * whole result set is <b>not</b> buffered in memory.
     *
     * <p>
     * The pooled socket stays checked out until the returned iterator is
     * exhausted or fails; callers must iterate to completion to return it.
     *
     * @param nodeId Id of the node to fetch from
     * @param storeName Name of the store
     * @param replicaToPartitionList Mapping of replica type to partition list
     * @param filter Custom filter implementation to filter out entries which
     *        should not be fetched.
     * @param fetchMasterEntries Fetch an entry only if master replica
     * @param initialCluster The cluster metadata to use while making the
     *        decision to fetch entries. This is important during rebalancing
     *        where-in we want to fetch keys using an older metadata compared to
     *        the new one.
     * @param skipRecords Number of records to skip
     * @return An iterator which allows entries to be streamed as they're being
     *         iterated over.
     */
    public Iterator<Pair<ByteArray, Versioned<byte[]>>> fetchEntries(int nodeId,
                                                                     String storeName,
                                                                     HashMap<Integer, List<Integer>> replicaToPartitionList,
                                                                     VoldemortFilter filter,
                                                                     boolean fetchMasterEntries,
                                                                     Cluster initialCluster,
                                                                     long skipRecords) {
        Node node = this.getAdminClientCluster().getNodeById(nodeId);
        final SocketDestination destination = new SocketDestination(node.getHost(),
                                                                    node.getAdminPort(),
                                                                    RequestFormatType.ADMIN_PROTOCOL_BUFFERS);
        final SocketAndStreams sands = pool.checkout(destination);
        DataOutputStream outputStream = sands.getOutputStream();
        final DataInputStream inputStream = sands.getInputStream();
        try {
            // Handshake: request entries (fetchValues = true).
            initiateFetchRequest(outputStream,
                                 storeName,
                                 replicaToPartitionList,
                                 filter,
                                 true,
                                 fetchMasterEntries,
                                 initialCluster,
                                 skipRecords);
        } catch(IOException e) {
            // Failed before handing off to the iterator: close and return the
            // socket here, then rethrow.
            close(sands.getSocket());
            pool.checkin(destination, sands);
            throw new VoldemortException(e);
        }
        return new AbstractIterator<Pair<ByteArray, Versioned<byte[]>>>() {

            @Override
            public Pair<ByteArray, Versioned<byte[]>> computeNext() {
                try {
                    // A size of -1 is the server's end-of-stream marker.
                    int size = inputStream.readInt();
                    if(size == -1) {
                        pool.checkin(destination, sands);
                        return endOfData();
                    }
                    VAdminProto.FetchPartitionEntriesResponse response = responseFromStream(inputStream,
                                                                                            size);
                    if(response.hasError()) {
                        // Return the socket before surfacing the server error.
                        pool.checkin(destination, sands);
                        throwException(response.getError());
                    }
                    VAdminProto.PartitionEntry partitionEntry = response.getPartitionEntry();
                    return Pair.create(ProtoUtils.decodeBytes(partitionEntry.getKey()),
                                       ProtoUtils.decodeVersioned(partitionEntry.getVersioned()));
                } catch(IOException e) {
                    close(sands.getSocket());
                    pool.checkin(destination, sands);
                    throw new VoldemortException(e);
                }
            }
        };
    }
/**
* Legacy interface for fetching entries. See
* {@link AdminClient#fetchKeys(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)}
* for more information.
*
* @param nodeId Id of the node to fetch from
* @param storeName Name of the store
* @param partitionList List of the partitions to retrieve
* @param filter Custom filter implementation to filter out entries which
* should not be fetched.
* @param fetchMasterEntries Fetch a key only if master replica
* @param skipRecords Number of keys to skip
* @return An iterator which allows keys to be streamed as they're being
* iterated over.
*/
public Iterator<ByteArray> fetchKeys(int nodeId,
String storeName,
List<Integer> partitionList,
VoldemortFilter filter,
boolean fetchMasterEntries,
long skipRecords) {
return fetchKeys(nodeId,
storeName,
getReplicaToPartitionMap(nodeId, storeName, partitionList),
filter,
fetchMasterEntries,
null,
skipRecords);
}
    /**
     * Legacy interface for fetching keys. Equivalent to the skip-records
     * overload with {@code skipRecords = 0}. See
     * {@link AdminClient#fetchKeys(int, String, HashMap, VoldemortFilter, boolean, Cluster, long)}
     * for more information.
     *
     * @param nodeId Id of the node to fetch from
     * @param storeName Name of the store
     * @param partitionList List of the partitions to retrieve
     * @param filter Custom filter implementation to filter out entries which
     *        should not be fetched.
     * @param fetchMasterEntries Fetch a key only if master replica
     * @return An iterator which allows keys to be streamed as they're being
     *         iterated over.
     */
    public Iterator<ByteArray> fetchKeys(int nodeId,
                                         String storeName,
                                         List<Integer> partitionList,
                                         VoldemortFilter filter,
                                         boolean fetchMasterEntries) {
        return fetchKeys(nodeId, storeName, partitionList, filter, fetchMasterEntries, 0);
    }
    /**
     * Fetch all keys belonging to the map of replica type to partition list.
     * Identical to {@link AdminClient#fetchEntries} but
     * <em>only fetches the keys</em>
     *
     * <p>
     * The pooled socket stays checked out until the returned iterator is
     * exhausted or fails; callers must iterate to completion to return it.
     *
     * @param nodeId The node id from where to fetch the keys
     * @param storeName The store name whose keys we want to retrieve
     * @param replicaToPartitionList Map of replica type to corresponding
     *        partition list
     * @param filter Custom filter
     * @param fetchMasterEntries Fetch a key only if master replica
     * @param initialCluster Cluster to use for selecting a key. If null, use
     *        the default metadata from the metadata store
     * @param skipRecords Number of records to skip [ Used for sampling ]
     * @return Returns an iterator of the keys
     */
    public Iterator<ByteArray> fetchKeys(int nodeId,
                                         String storeName,
                                         HashMap<Integer, List<Integer>> replicaToPartitionList,
                                         VoldemortFilter filter,
                                         boolean fetchMasterEntries,
                                         Cluster initialCluster,
                                         long skipRecords) {
        Node node = this.getAdminClientCluster().getNodeById(nodeId);
        final SocketDestination destination = new SocketDestination(node.getHost(),
                                                                    node.getAdminPort(),
                                                                    RequestFormatType.ADMIN_PROTOCOL_BUFFERS);
        final SocketAndStreams sands = pool.checkout(destination);
        DataOutputStream outputStream = sands.getOutputStream();
        final DataInputStream inputStream = sands.getInputStream();
        try {
            // Handshake: request keys only (fetchValues = false).
            initiateFetchRequest(outputStream,
                                 storeName,
                                 replicaToPartitionList,
                                 filter,
                                 false,
                                 fetchMasterEntries,
                                 initialCluster,
                                 skipRecords);
        } catch(IOException e) {
            // Failed before handing off to the iterator: close and return the
            // socket here, then rethrow.
            close(sands.getSocket());
            pool.checkin(destination, sands);
            throw new VoldemortException(e);
        }
        return new AbstractIterator<ByteArray>() {

            @Override
            public ByteArray computeNext() {
                try {
                    // A size of -1 is the server's end-of-stream marker.
                    int size = inputStream.readInt();
                    if(size == -1) {
                        pool.checkin(destination, sands);
                        return endOfData();
                    }
                    VAdminProto.FetchPartitionEntriesResponse response = responseFromStream(inputStream,
                                                                                            size);
                    if(response.hasError()) {
                        // Return the socket before surfacing the server error.
                        pool.checkin(destination, sands);
                        throwException(response.getError());
                    }
                    return ProtoUtils.decodeBytes(response.getKey());
                } catch(IOException e) {
                    close(sands.getSocket());
                    pool.checkin(destination, sands);
                    throw new VoldemortException(e);
                }
            }
        };
    }
    /**
     * RestoreData from copies on other machines for the given nodeId
     * <p>
     * Recovery mechanism to recover and restore data actively from replicated
     * copies in the cluster. Delegates to the zone-aware overload with no
     * zone preference (-1).
     *
     * @param nodeId Id of the node to restoreData
     * @param parallelTransfers number of transfers
     */
    public void restoreDataFromReplications(int nodeId, int parallelTransfers) {
        restoreDataFromReplications(nodeId, parallelTransfers, -1);
    }
/**
* RestoreData from copies on other machines for the given nodeId
* <p>
* Recovery mechanism to recover and restore data actively from replicated
* copies in the cluster.<br>
*
* @param nodeId Id of the node to restoreData
* @param parallelTransfers number of transfers
* @param zoneId zone from which the nodes are chosen from, -1 means no zone
* preference
* @throws InterruptedException
*/
public void restoreDataFromReplications(int nodeId, int parallelTransfers, int zoneId) {
ExecutorService executors = Executors.newFixedThreadPool(parallelTransfers,
new ThreadFactory() {
public Thread newThread(Runnable r) {
Thread thread = new Thread(r);
thread.setName("restore-data-thread");
return thread;
}
});
try {
List<StoreDefinition> storeDefList = getRemoteStoreDefList(nodeId).getValue();
Cluster cluster = getRemoteCluster(nodeId).getValue();
List<StoreDefinition> writableStores = Lists.newArrayList();
for(StoreDefinition def: storeDefList) {
if(def.isView()) {
logger.info("Ignoring store " + def.getName() + " since it is a view");
} else if(restoreStoreEngineBlackList.contains(def.getType())) {
logger.info("Ignoring store " + def.getName()
+ " since we don't support restoring for " + def.getType()
+ " storage engine");
} else if(def.getReplicationFactor() == 1) {
logger.info("Ignoring store " + def.getName()
+ " since replication factor is set to 1");
} else {
writableStores.add(def);
}
}
for(StoreDefinition def: writableStores) {
restoreStoreFromReplication(nodeId, cluster, def, executors, zoneId);
}
} finally {
executors.shutdown();
try {
executors.awaitTermination(adminClientConfig.getRestoreDataTimeoutSec(),
TimeUnit.SECONDS);
} catch(InterruptedException e) {
logger.error("Interrupted while waiting restore operation to finish.");
}
logger.info("Finished restoring data.");
}
}
    /**
     * For a particular node, finds out all the [replica, partition] tuples it
     * needs to steal in order to be brought back to normal state. Delegates
     * to the zone-aware overload with no zone preference (-1).
     *
     * @param restoringNode The id of the node which needs to be restored
     * @param cluster The cluster definition
     * @param storeDef The store definition to use
     * @return Map of node id to map of replica type and corresponding partition
     *         list
     */
    public Map<Integer, HashMap<Integer, List<Integer>>> getReplicationMapping(int restoringNode,
                                                                               Cluster cluster,
                                                                               StoreDefinition storeDef) {
        return getReplicationMapping(restoringNode, cluster, storeDef, -1);
    }
    /**
     * For a particular node, finds out all the [replica, partition] tuples it
     * needs to steal in order to be brought back to normal state
     *
     * @param restoringNode The id of the node which needs to be restored
     * @param cluster The cluster definition
     * @param storeDef The store definition to use
     * @param zoneId zone from which nodes are chosen, -1 means no zone
     *        preference
     * @return Map of node id to map of replica type and corresponding partition
     *         list
     */
    public Map<Integer, HashMap<Integer, List<Integer>>> getReplicationMapping(int restoringNode,
                                                                               Cluster cluster,
                                                                               StoreDefinition storeDef,
                                                                               int zoneId) {
        Map<Integer, HashMap<Integer, List<Integer>>> returnMap = Maps.newHashMap();
        RoutingStrategy strategy = new RoutingStrategyFactory().updateRoutingStrategy(storeDef,
                                                                                      cluster);
        List<Integer> restoringNodePartition = cluster.getNodeById(restoringNode).getPartitionIds();
        // Go over every partition. As long as one of them belongs to the
        // current node list, find its replica
        for(Node node: cluster.getNodes()) {
            for(int partitionId: node.getPartitionIds()) {
                List<Integer> replicatingPartitions = strategy.getReplicatingPartitionList(partitionId);
                // Keep an untouched copy; removeAll below mutates the list.
                List<Integer> extraCopyReplicatingPartitions = Lists.newArrayList(replicatingPartitions);
                if(replicatingPartitions.size() <= 1) {
                    throw new VoldemortException("Store "
                                                 + storeDef.getName()
                                                 + " cannot be restored from replica because replication factor = 1");
                }
                // removeAll returns true iff this replica chain touches the
                // restoring node's partitions — i.e. it needs a donor.
                if(replicatingPartitions.removeAll(restoringNodePartition)) {
                    if(replicatingPartitions.size() == 0) {
                        throw new VoldemortException("Found a case where-in the overlap of "
                                                     + "the node partition list results in no replicas "
                                                     + "being left in replicating list");
                    }
                    addDonorWithZonePreference(replicatingPartitions,
                                               extraCopyReplicatingPartitions,
                                               returnMap,
                                               zoneId,
                                               cluster,
                                               storeDef);
                }
            }
        }
        return returnMap;
    }
/**
* For each partition that need to be restored, find a donor node that owns
* the partition AND has the same zone ID as requested. -1 means no zone
* preference required when finding a donor node needs to steal in order to
*
* @param remainderPartitions The replicating partitions without the one
* needed by the restore node
* @param originalPartitions The entire replicating partition list
* (including the one needed by the restore node)
* @param donorMap All donor nodes that will be fetched from
* @param zondId The zone from which donor nodes will be chosen from; -1
* means all zones are fine
* @param cluster The cluster metadata
* @param storeDef The store to be restored
* @return
*/
private void addDonorWithZonePreference(List<Integer> remainderPartitions,
List<Integer> originalPartitions,
Map<Integer, HashMap<Integer, List<Integer>>> donorMap,
int zoneId,
Cluster cluster,
StoreDefinition storeDef) {
Map<Integer, Integer> partitionToNodeId = RebalanceUtils.getCurrentPartitionMapping(cluster);
int nodeId = -1;
int replicaType = -1;
int partition = -1;
boolean found = false;
int index = 0;
while(!found && index < remainderPartitions.size()) {
replicaType = originalPartitions.indexOf(remainderPartitions.get(index));
nodeId = partitionToNodeId.get(remainderPartitions.get(index));
if(-1 == zoneId || cluster.getNodeById(nodeId).getZoneId() == zoneId) {
found = true;
} else {
index++;
}
}
if(!found) {
throw new VoldemortException("unable to find a node to fetch partition " + partition
+ " of replica type " + replicaType + " for store "
+ storeDef.getName());
}
partition = originalPartitions.get(0);
HashMap<Integer, List<Integer>> replicaToPartitionList = null;
if(donorMap.containsKey(nodeId)) {
replicaToPartitionList = donorMap.get(nodeId);
} else {
replicaToPartitionList = Maps.newHashMap();
donorMap.put(nodeId, replicaToPartitionList);
}
List<Integer> partitions = null;
if(replicaToPartitionList.containsKey(replicaType)) {
partitions = replicaToPartitionList.get(replicaType);
} else {
partitions = Lists.newArrayList();
replicaToPartitionList.put(replicaType, partitions);
}
partitions.add(partition);
}
/**
* For a particular store and node, runs the replication job. This works
* only for read-write stores
*
* @param restoringNodeId The node which we want to restore
* @param cluster The cluster metadata
* @param storeDef The definition of the store which we want to restore
* @param executorService An executor to allow us to run the replication job
*/
private void restoreStoreFromReplication(final int restoringNodeId,
final Cluster cluster,
final StoreDefinition storeDef,
final ExecutorService executorService,
final int zoneId) {
logger.info("Restoring data for store " + storeDef.getName() + " on node "
+ restoringNodeId);
Map<Integer, HashMap<Integer, List<Integer>>> restoreMapping = getReplicationMapping(restoringNodeId,
cluster,
storeDef,
zoneId);
// migrate partition
for(final Entry<Integer, HashMap<Integer, List<Integer>>> replicationEntry: restoreMapping.entrySet()) {
final int donorNodeId = replicationEntry.getKey();
executorService.submit(new Runnable() {
public void run() {
try {
logger.info("Restoring data for store " + storeDef.getName() + " at node "
+ restoringNodeId + " from node " + replicationEntry.getKey()
+ " partitions:" + replicationEntry.getValue());
int migrateAsyncId = migratePartitions(donorNodeId,
restoringNodeId,
storeDef.getName(),
replicationEntry.getValue(),
null,
null,
false);
waitForCompletion(restoringNodeId,
migrateAsyncId,
adminClientConfig.getRestoreDataTimeoutSec(),
TimeUnit.SECONDS);
logger.info("Restoring data for store:" + storeDef.getName()
+ " from node " + donorNodeId + " completed.");
} catch(Exception e) {
logger.error("Restore operation for store " + storeDef.getName()
+ "from node " + donorNodeId + " failed.", e);
}
}
});
}
}
/**
* Rebalance a stealer-donor node pair for a set of stores. This is run on
* the donor node.
*
* @param stealInfos List of partition steal information
* @return The request id of the async operation
*/
public int rebalanceNode(List<RebalancePartitionsInfo> stealInfos) {
List<VAdminProto.RebalancePartitionInfoMap> rebalancePartitionInfoMap = ProtoUtils.encodeRebalancePartitionInfoMap(stealInfos);
VAdminProto.InitiateRebalanceNodeOnDonorRequest rebalanceNodeRequest = VAdminProto.InitiateRebalanceNodeOnDonorRequest.newBuilder()
.addAllRebalancePartitionInfo(rebalancePartitionInfoMap)
.build();
VAdminProto.VoldemortAdminRequest adminRequest = VAdminProto.VoldemortAdminRequest.newBuilder()
.setType(VAdminProto.AdminRequestType.INITIATE_REBALANCE_NODE_ON_DONOR)
.setInitiateRebalanceNodeOnDonor(rebalanceNodeRequest)
.build();
VAdminProto.AsyncOperationStatusResponse.Builder response = sendAndReceive(stealInfos.get(0)
.getDonorId(),
adminRequest,
VAdminProto.AsyncOperationStatusResponse.newBuilder());
if(response.hasError())