Permalink
Browse files

merged with batch-performance repository

  • Loading branch information...
jexp committed Oct 31, 2012
2 parents 06b9a53 + 17ca9d5 commit 4964fb56a868b21ee0ba3cdebe61e746e4860f08
Showing with 4,950 additions and 259 deletions.
  1. +3 −1 .gitignore
  2. +2 −2 batch.properties
  3. +27 −6 pom.xml
  4. +163 −0 readme.md
  5. +116 −0 src/main/java/org/neo4j/batchimport/DisruptorBatchInserter.java
  6. +28 −246 src/main/java/org/neo4j/batchimport/Importer.java
  7. +21 −0 src/main/java/org/neo4j/batchimport/NodeStructFactory.java
  8. +39 −0 src/main/java/org/neo4j/batchimport/StdOutReport.java
  9. +85 −0 src/main/java/org/neo4j/batchimport/Utils.java
  10. +38 −0 src/main/java/org/neo4j/batchimport/collections/ConcurrentIntReverseRelationshipMap.java
  11. +40 −0 src/main/java/org/neo4j/batchimport/collections/ConcurrentLongReverseRelationshipMap.java
  12. +23 −0 src/main/java/org/neo4j/batchimport/collections/PrimitiveIntReverseRelationshipMap.java
  13. +10 −0 src/main/java/org/neo4j/batchimport/collections/ReverseRelationshipMap.java
  14. +86 −0 src/main/java/org/neo4j/batchimport/handlers/NodeWriteFileHandler.java
  15. +44 −0 src/main/java/org/neo4j/batchimport/handlers/NodeWriteRecordHandler.java
  16. +55 −0 src/main/java/org/neo4j/batchimport/handlers/PropertyEncodingHandler.java
  17. +62 −0 src/main/java/org/neo4j/batchimport/handlers/PropertyRecordCreatorHandler.java
  18. +21 −0 src/main/java/org/neo4j/batchimport/handlers/PropertyRecordHighIdHandler.java
  19. +52 −0 src/main/java/org/neo4j/batchimport/handlers/PropertyWriteRecordHandler.java
  20. +146 −0 src/main/java/org/neo4j/batchimport/handlers/RelationshipFileWriter.java
  21. +82 −0 src/main/java/org/neo4j/batchimport/handlers/RelationshipIdHandler.java
  22. +75 −0 src/main/java/org/neo4j/batchimport/handlers/RelationshipRecordWriter.java
  23. +79 −0 src/main/java/org/neo4j/batchimport/handlers/RelationshipWriteHandler.java
  24. +20 −0 src/main/java/org/neo4j/batchimport/handlers/RelationshipWriter.java
  25. +16 −0 src/main/java/org/neo4j/batchimport/importer/RelType.java
  26. +85 −0 src/main/java/org/neo4j/batchimport/importer/RowData.java
  27. +69 −0 src/main/java/org/neo4j/batchimport/importer/Type.java
  28. +65 −0 src/main/java/org/neo4j/batchimport/structs/NodeStruct.java
  29. +27 −0 src/main/java/org/neo4j/batchimport/structs/Property.java
  30. +34 −0 src/main/java/org/neo4j/batchimport/structs/PropertyHolder.java
  31. +34 −0 src/main/java/org/neo4j/batchimport/structs/Relationship.java
  32. +735 −0 src/main/java/org/neo4j/kernel/impl/nioneo/store/CommonAbstractStore.java
  33. +223 −0 src/main/java/org/neo4j/kernel/impl/nioneo/store/PropertyBlock.java
  34. +201 −0 src/main/java/org/neo4j/kernel/impl/nioneo/store/PropertyRecord.java
  35. +741 −0 src/main/java/org/neo4j/kernel/impl/nioneo/store/PropertyStore.java
  36. +1,013 −0 src/main/java/org/neo4j/unsafe/batchinsert/BatchInserterImpl.java
  37. +28 −0 src/main/resources/log4j.properties
  38. +2 −2 src/test/java/DataTest.java
  39. +1 −1 src/test/java/TestDataGenerator.java
  40. +141 −0 src/test/java/org/neo4j/batchimport/DisruptorTest.java
  41. +1 −1 src/test/java/org/neo4j/batchimport/ImporterTest.java
  42. +48 −0 src/test/java/org/neo4j/batchimport/RelTest.java
  43. +169 −0 src/test/java/org/neo4j/batchimport/TestImporter.java
View
@@ -1,9 +1,11 @@
.project
.shell_history
-.idea
*.ipr
*.iws
*.iml
+.idea
target
*.csv
+.DS_Store
+.settings
View
@@ -1,9 +1,9 @@
-dump_configuration=true
+dump_configuration=false
cache_type=none
use_memory_mapped_buffers=true
neostore.propertystore.db.index.keys.mapped_memory=5M
neostore.propertystore.db.index.mapped_memory=5M
neostore.nodestore.db.mapped_memory=200M
neostore.relationshipstore.db.mapped_memory=1000M
neostore.propertystore.db.mapped_memory=1000M
-neostore.propertystore.db.strings.mapped_memory=200M
+neostore.propertystore.db.strings.mapped_memory=200M
View
33 pom.xml
@@ -2,9 +2,11 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.neo4j</groupId>
<artifactId>batch-import</artifactId>
- <version>0.1-SNAPSHOT</version>
- <name>Simple Batch Importer</name>
-
+ <version>1.9-SNAPSHOT</version>
+ <name>Neo4j Batch Importer</name>
+ <properties>
+ <neo4j.version>1.9-SNAPSHOT</neo4j.version>
+ </properties>
<repositories>
<repository>
<id>Neo4j Snapshots</id>
@@ -13,12 +15,22 @@
</repositories>
<dependencies>
+ <dependency>
+ <groupId>edu.ucla.sspace</groupId>
+ <artifactId>sspace</artifactId>
+ <version>2.0.3</version>
+ </dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ <version>1.2.17</version>
+ </dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
@@ -28,14 +40,23 @@
<dependency>
<groupId>org.neo4j</groupId>
<artifactId>neo4j-kernel</artifactId>
- <version>1.8-SNAPSHOT</version>
+ <version>${neo4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.neo4j</groupId>
+ <artifactId>neo4j-enterprise</artifactId>
+ <version>${neo4j.version}</version>
</dependency>
<dependency>
<groupId>org.neo4j</groupId>
<artifactId>neo4j-lucene-index</artifactId>
- <version>1.8-SNAPSHOT</version>
+ <version>${neo4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.disruptor</groupId>
+ <artifactId>disruptor</artifactId>
+ <version>2.10.3</version>
</dependency>
-
</dependencies>
<build>
<plugins>
View
163 readme.md
@@ -0,0 +1,163 @@
+# Neo4j (CSV) Batch Importer
+
+You provide one tab separated csv file for nodes and one for relationships (optionally more for indexes)
+
+Example data for the files is a small social network
+
+## File format
+
+* Property names in first row.
+* The row number corresponds to the node-id (node 0 is the reference node)
+* Property values not listed will not be set on the nodes or relationships.
+* Optionally property fields can have a type (defaults to String) indicated with name:type where type is one of (int, long, float, double, boolean, byte, short, char, string). The string value is then converted to that type. Conversion failure will result in abort of the import operation.
+
+## Examples
+
+### nodes.csv
+
+ name age works_on
+ Michael 37 neo4j
+ Selina 14
+ Rana 6
+ Selma 4
+
+### rels.csv
+
+ start end type since counter:int
+ 1 2 FATHER_OF 1998-07-10 1
+ 1 3 FATHER_OF 2007-09-15 2
+ 1 4 FATHER_OF 2008-05-03 3
+ 3 4 SISTER_OF 2008-05-03 5
+ 2 3 SISTER_OF 2007-09-15 7
+
+
+## Execution
+
+ java -server -Xmx4G -jar ../batch-import/target/batch-import-jar-with-dependencies.jar neo4j/data/graph.db nodes.csv rels.csv
+
+
+ ynagzet:batchimport mh$ rm -rf target/db
+ ynagzet:batchimport mh$ mvn clean compile assembly:single
+ [INFO] Scanning for projects...
+ [INFO] ------------------------------------------------------------------------
+ [INFO] Building Simple Batch Importer
+ [INFO] task-segment: [clean, compile, assembly:single]
+ [INFO] ------------------------------------------------------------------------
+ ...
+ [INFO] Building jar: /Users/mh/java/neo/batchimport/target/batch-import-jar-with-dependencies.jar
+ [INFO] ------------------------------------------------------------------------
+ [INFO] BUILD SUCCESSFUL
+ [INFO] ------------------------------------------------------------------------
+ ynagzet:batchimport mh$ java -server -Xmx4G -jar target/batch-import-jar-with-dependencies.jar target/db nodes.csv rels.csv
+ Physical mem: 16384MB, Heap size: 3640MB
+ use_memory_mapped_buffers=false
+ neostore.propertystore.db.index.keys.mapped_memory=5M
+ neostore.propertystore.db.strings.mapped_memory=100M
+ neostore.propertystore.db.arrays.mapped_memory=215M
+ neo_store=/Users/mh/java/neo/batchimport/target/db/neostore
+ neostore.relationshipstore.db.mapped_memory=1000M
+ neostore.propertystore.db.index.mapped_memory=5M
+ neostore.propertystore.db.mapped_memory=1000M
+ dump_configuration=true
+ cache_type=none
+ neostore.nodestore.db.mapped_memory=200M
+ ...........................................................................
+ Importing 7500000 Nodes took 17 seconds
+ ....................................................................................................35818 ms
+ ....................................................................................................39343 ms
+ ....................................................................................................41788 ms
+ ....................................................................................................48897 ms
+ ............
+ Importing 41246740 Relationships took 170 seconds
+ 212 seconds
+ ynagzet:batchimport mh$ du -sh target/db/
+ 3,2G target/db/
+
+
+## Indexing
+
+Optionally you can add nodes and relationships to indexes.
+
+Add four arguments per index to the command line:
+
+To create a full text node index called users using nodes_index.csv:
+
+ node_index users fulltext nodes_index.csv
+
+To create an exact relationship index called worked using rels_index.csv:
+
+ rel_index worked exact rels_index.csv
+
+Example command line:
+
+ java -server -Xmx4G -jar ../batch-import/target/batch-import-jar-with-dependencies.jar neo4j/data/graph.db nodes.csv rels.csv node_index users fulltext nodes_index.csv rel_index worked exact rels_index.csv
+
+## Examples
+
+### nodes_index.csv
+
+ id name language
+ 1 Victor Richards West Frisian
+ 2 Virginia Shaw Korean
+ 3 Lois Simpson Belarusian
+ 4 Randy Bishop Hiri Motu
+ 5 Lori Mendoza Tok Pisin
+
+### rels_index.csv
+
+ id property1 property2
+ 0 cwqbnxrv rpyqdwhk
+ 1 qthnrret tzjmmhta
+ 2 dtztaqpy pbmcdqyc
+
+
+
+# Parallel Batch inserter with Neo4j
+
+Uses the [LMAX Disruptor](http://lmax-exchange.github.com/disruptor/) to parallelize operations during batch-insertion.
+
+## The 6 operations are:
+
+1. property encoding
+2. property-record creation
+3. relationship-id creation and forward handling of reverse relationship chains
+4. writing node-records
+5. writing relationship-records
+6. writing property-records
+
+## Dependencies:
+
+ (1)<--(2)<--(6)
+ (2)<--(5)-->(3)
+ (2)<--(4)-->(3)
+
+It uses the above dependency setup of disruptor handlers to execute the different concerns in parallel. A ringbuffer of about 2^18 elements is used and a heap size of 5-20G, MMIO configuration within the heap limits.
+
+## Execution:
+
+ MAVEN_OPTS="-Xmx5G -Xms5G -server -d64 -XX:NewRatio=5" mvn clean test-compile exec:java -Dexec.mainClass=org.neo4j.batchimport.DisruptorTest -Dexec.classpathScope=test
+
+## current limitations, constraints:
+
+* only up to 2bn relationships (due to an int based multi-map)
+* have to know max # of rels per node, properties per node and relationship
+* relationships have to be pre-sorted by min(start,end)
+
+## measurements
+
+We successfully imported 2bn nodes (2 properties) and 20bn relationships (1 property) in 11 hours on an EC2 high-IO instance,
+with 35 ECU, 60GB RAM, 2TB SSD writing up to 200MB/s, resulting in a store of 1.4 TB. That makes around 500k elements per second.
+
+## future improvements:
+
+* implement batch-importer CSV "API" on top of this
+* stripe writes across store-files (e.g. stripe the relationship-record file over 10 handlers, according to the number of CPUs)
+* parallelize writing to dynamic string and arraystore too
+* change relationship-record updates for backwards pointers to run in a separate handler that is
+ RandomAccessFile-based (or nio2) and just writes the 2 int values directly at file-pos
+* add a csv analyser / sorter
+* add support & parallelize index addition
+* good support for index based lookup for relationship construction (kv-store, better in-memory structure, e.g. a collection of long[])
+* use id-compression internally to save memory in structs (write a CompressedLongArray)
+* reuse PropertyBlock, PropertyRecords, RelationshipRecords, NodeRecords, probably subclass them and override getId() etc. or copy the code
+ from the Store's to work with interfaces
@@ -0,0 +1,116 @@
+package org.neo4j.batchimport;
+
+import com.lmax.disruptor.RingBuffer;
+import com.lmax.disruptor.SingleThreadedClaimStrategy;
+import com.lmax.disruptor.YieldingWaitStrategy;
+import com.lmax.disruptor.dsl.Disruptor;
+import org.apache.log4j.Logger;
+import org.neo4j.batchimport.handlers.*;
+import org.neo4j.batchimport.structs.NodeStruct;
+import org.neo4j.kernel.impl.nioneo.store.NeoStore;
+import org.neo4j.unsafe.batchinsert.BatchInserterImpl;
+import org.neo4j.unsafe.batchinsert.BatchInserters;
+
+import java.util.Arrays;
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+/**
+ * Drives a batch insertion through an LMAX Disruptor pipeline of {@link NodeStruct} events.
+ * <p>
+ * Expected call order: {@link #init()}, {@link #run()}, {@link #shutdown()}, and optionally
+ * {@link #report()}. The handlers are chained in three stages: property encoding, then
+ * property-record creation and relationship-id handling, then the node / relationship /
+ * property writers.
+ *
+ * @author mh
+ * @since 27.10.12
+ */
+public class DisruptorBatchInserter {
+
+    private static final Logger log = Logger.getLogger(DisruptorBatchInserter.class);
+
+    /** Number of slots in the ring buffer. */
+    private static final int RING_SIZE = 1 << 18;
+
+    /**
+     * Handlers registered in {@link #init()} in addition to the property-encoding handlers:
+     * property-record creator, relationship-id handler, node writer, relationship writer and
+     * property writer.
+     */
+    private static final int FIXED_HANDLER_COUNT = 5;
+
+    private Disruptor<NodeStruct> incomingEventDisruptor;
+    private final String storeDir;
+    private BatchInserterImpl inserter;
+    private ExecutorService executor;
+    private PropertyEncodingHandler[] propertyMappingHandlers;
+    private RelationshipIdHandler relationshipIdHandler;
+    private NodeWriteRecordHandler nodeWriter;
+    private PropertyWriteRecordHandler propertyWriter;
+    private RelationshipWriteHandler relationshipWriter;
+    private PropertyRecordCreatorHandler propertyRecordCreatorHandler;
+    private final Map<String,String> config;
+    private final long nodesToCreate;
+    private final NodeStructFactory nodeStructFactory;
+
+    /**
+     * @param storeDir          directory handed to {@link BatchInserters#inserter}
+     * @param config            configuration handed to {@link BatchInserters#inserter}
+     * @param nodesToCreate     number of node ids (0 .. nodesToCreate-1) that {@link #run()} publishes
+     * @param nodeStructFactory creates the ring-buffer entries and fills them per node id
+     */
+    public DisruptorBatchInserter(String storeDir, final Map<String, String> config, int nodesToCreate, final NodeStructFactory nodeStructFactory) {
+        this.storeDir = storeDir;
+        this.config = config;
+        this.nodesToCreate = nodesToCreate;
+        this.nodeStructFactory = nodeStructFactory;
+    }
+
+    /**
+     * Opens the batch inserter, sets the node store's high id to {@code nodesToCreate + 1},
+     * creates the handlers and wires them into the disruptor.
+     */
+    void init() {
+        inserter = (BatchInserterImpl) BatchInserters.inserter(storeDir, config);
+        nodeStructFactory.init(inserter);
+        NeoStore neoStore = inserter.getNeoStore();
+        neoStore.getNodeStore().setHighId(nodesToCreate + 1);
+
+        // Handlers are created before the executor so the pool can be sized to fit them.
+        createHandlers(neoStore, nodeStructFactory);
+
+        // The disruptor runs every handler as a long-lived task on the executor. A fixed pool
+        // with fewer threads than handlers would leave the surplus handlers queued forever and
+        // stall the pipeline, so never go below one thread per handler.
+        int handlerCount = propertyMappingHandlers.length + FIXED_HANDLER_COUNT;
+        int threads = Math.max(Runtime.getRuntime().availableProcessors(), handlerCount);
+        executor = Executors.newFixedThreadPool(threads);
+
+        incomingEventDisruptor = new Disruptor<NodeStruct>(nodeStructFactory, executor, new SingleThreadedClaimStrategy(RING_SIZE), new YieldingWaitStrategy());
+
+        incomingEventDisruptor.
+                handleEventsWith(propertyMappingHandlers).
+                then(propertyRecordCreatorHandler, relationshipIdHandler).
+                then(nodeWriter, relationshipWriter, propertyWriter);
+    }
+
+    /** Instantiates all pipeline handlers against the stores of the given {@link NeoStore}. */
+    private void createHandlers(NeoStore neoStore, NodeStructFactory nodeStructFactory) {
+        propertyMappingHandlers = PropertyEncodingHandler.createHandlers(inserter);
+
+        propertyRecordCreatorHandler = new PropertyRecordCreatorHandler();
+        relationshipIdHandler = new RelationshipIdHandler(nodeStructFactory.getMaxRelsPerNode());
+
+        //nodeWriter = new NodeFileWriteHandler(new File(nodeStore.getStorageFileName()));
+        nodeWriter = new NodeWriteRecordHandler(neoStore.getNodeStore());
+        propertyWriter = new PropertyWriteRecordHandler(neoStore.getPropertyStore());
+        relationshipWriter = new RelationshipWriteHandler(new RelationshipRecordWriter(neoStore.getRelationshipStore()));
+        //relationshipWriter = new RelationshipWriteHandler(new RelationshipFileWriter(new File(neoStore.getRelationshipStore().getStorageFileName())));
+    }
+
+    /**
+     * Starts the disruptor and publishes one {@link NodeStruct} per node id. Progress is logged
+     * roughly every 1% of {@code nodesToCreate}, together with the time since the previous log line.
+     */
+    void run() {
+        RingBuffer<NodeStruct> ringBuffer = incomingEventDisruptor.start();
+        // nodesToCreate / 100 is 0 for fewer than 100 nodes; clamp it so the modulo below
+        // cannot divide by zero.
+        final long reportInterval = Math.max(1L, nodesToCreate / 100);
+        long time = System.currentTimeMillis();
+        for (long nodeId = 0; nodeId < nodesToCreate; nodeId++) {
+            long sequence = ringBuffer.next();
+            NodeStruct nodeStruct = ringBuffer.get(sequence).init();
+
+            nodeStructFactory.fillStruct(nodeId,nodeStruct);
+
+            if (nodeId % reportInterval == 0) {
+                log.info(nodeId + " " + (System.currentTimeMillis()-time)+" ms.");
+                time = System.currentTimeMillis();
+            }
+            ringBuffer.publish(sequence);
+        }
+    }
+
+    /**
+     * Shuts down the disruptor and its executor, closes the three writers and finally shuts
+     * down the batch inserter.
+     */
+    void shutdown() {
+        incomingEventDisruptor.shutdown();
+        executor.shutdown();
+
+        nodeWriter.close();
+        propertyWriter.close();
+        relationshipWriter.close();
+
+        inserter.shutdown();
+    }
+
+    /** Logs the string representation of every handler. */
+    void report() {
+        log.info("mapped " + Arrays.deepToString(propertyMappingHandlers));
+
+        log.info("relIds " + relationshipIdHandler);
+
+        log.info("wrote nodes " + nodeWriter);
+        log.info("wrote rels " + relationshipWriter);
+        log.info("wrote props " + propertyWriter);
+    }
+}
Oops, something went wrong.

0 comments on commit 4964fb5

Please sign in to comment.