Permalink
Browse files

batch-importer for csv files into neo4j datastore

  • Loading branch information...
0 parents commit 3af588aa4250a8ee551fb69d0dbb3c06b46e81c2 @jexp committed Jan 13, 2012
Showing with 332 additions and 0 deletions.
  1. +8 −0 .gitignore
  2. +8 −0 batch.properties
  3. +49 −0 pom.xml
  4. +62 −0 readme.txt
  5. +168 −0 src/main/java/org/neo4j/batchimport/Importer.java
  6. +37 −0 src/test/java/TestDataGenerator.java
@@ -0,0 +1,8 @@
+.shell_history
+.idea
+*.ipr
+*.iws
+*.iml
+target
+*.csv
+
@@ -0,0 +1,8 @@
+dump_configuration=true
+cache_type=none
+neostore.propertystore.db.index.keys.mapped_memory=5M
+neostore.propertystore.db.index.mapped_memory=5M
+neostore.nodestore.db.mapped_memory=200M
+neostore.relationshipstore.db.mapped_memory=1000M
+neostore.propertystore.db.mapped_memory=1000M
+neostore.propertystore.db.strings.mapped_memory=100M
49 pom.xml
@@ -0,0 +1,49 @@
+<project>
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>org.neo4j</groupId>
+ <artifactId>batch-import</artifactId>
+ <version>0.1-SNAPSHOT</version>
+ <name>Simple Batch Importer</name>
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.neo4j</groupId>
+ <artifactId>neo4j-kernel</artifactId>
+ <version>1.6.M02</version>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.1</version>
+ <configuration>
+ <source>1.6</source>
+ <target>1.6</target>
+ </configuration>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <finalName>
+ batch-import
+ </finalName>
+ <archive>
+ <manifest>
+ <mainClass>org.neo4j.batchimport.Importer</mainClass>
+ </manifest>
+ </archive>
+ <descriptorRefs>
+ <descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+</project>
@@ -0,0 +1,62 @@
+Tab separated input files.
+format nodes.csv
+
+propname1 propname2
+value1 value2
+
+format rels.csv
+
+start end type propname1 propname2
+0 1 OWNS value1 value2
+1 2 IS_A value1 value2
+
+
+ynagzet:batchimport mh$ rm -rf target/db
+ynagzet:batchimport mh$ mvn clean compile assembly:single
+[INFO] Scanning for projects...
+[INFO] ------------------------------------------------------------------------
+[INFO] Building Simple Batch Importer
+[INFO] task-segment: [clean, compile, assembly:single]
+[INFO] ------------------------------------------------------------------------
+[INFO] [clean:clean {execution: default-clean}]
+[INFO] Deleting directory /Users/mh/java/neo/batchimport/target
+[INFO] [resources:resources {execution: default-resources}]
+[WARNING] Using platform encoding (MacRoman actually) to copy filtered resources, i.e. build is platform dependent!
+[INFO] skip non existing resourceDirectory /Users/mh/java/neo/batchimport/src/main/resources
+[INFO] [compiler:compile {execution: default-compile}]
+[WARNING] File encoding has not been set, using platform encoding MacRoman, i.e. build is platform dependent!
+[INFO] Compiling 1 source file to /Users/mh/java/neo/batchimport/target/classes
+[INFO] [assembly:single {execution: default-cli}]
+[INFO] Processing DependencySet (output=)
+[INFO] Building jar: /Users/mh/java/neo/batchimport/target/batch-import-jar-with-dependencies.jar
+[INFO] ------------------------------------------------------------------------
+[INFO] BUILD SUCCESSFUL
+[INFO] ------------------------------------------------------------------------
+[INFO] Total time: 2 seconds
+[INFO] Finished at: Fri Jan 13 04:06:06 CET 2012
+[INFO] Final Memory: 17M/81M
+[INFO] ------------------------------------------------------------------------
+ynagzet:batchimport mh$ java -server -Xmx4G -jar target/batch-import-jar-with-dependencies.jar target/db nodes.csv rels.csv
+Physical mem: 16384MB, Heap size: 3640MB
+use_memory_mapped_buffers=false
+neostore.propertystore.db.index.keys.mapped_memory=5M
+neostore.propertystore.db.strings.mapped_memory=100M
+neostore.propertystore.db.arrays.mapped_memory=215M
+neo_store=/Users/mh/java/neo/batchimport/target/db/neostore
+neostore.relationshipstore.db.mapped_memory=1000M
+neostore.propertystore.db.index.mapped_memory=5M
+neostore.propertystore.db.mapped_memory=1000M
+dump_configuration=true
+cache_type=none
+neostore.nodestore.db.mapped_memory=200M
+...........................................................................
+Importing 7500000 Nodes took 17 seconds
+....................................................................................................35818 ms
+....................................................................................................39343 ms
+....................................................................................................41788 ms
+....................................................................................................48897 ms
+............
+Importing 41246740 Relationships took 170 seconds
+212 seconds
+ynagzet:batchimport mh$ du -sh target/db/
+3,2G target/db/
@@ -0,0 +1,168 @@
+package org.neo4j.batchimport;
+
+import org.neo4j.graphdb.RelationshipType;
+import org.neo4j.kernel.impl.batchinsert.BatchInserterImpl;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Map;
+
+import static org.neo4j.helpers.collection.MapUtil.map;
+import static org.neo4j.helpers.collection.MapUtil.stringMap;
+
+public class Importer {
+ private static Report report;
+ private BatchInserterImpl db;
+
+ public Importer(File graphDb) {
+ final Map<String, String> config = getConfig();
+ db = new BatchInserterImpl(graphDb.getAbsolutePath(), config);
+ report = new Report(10 * 1000 * 1000, 100);
+ }
+
+ private Map<String, String> getConfig() {
+ if (new File("batch.properties").exists()) {
+ return BatchInserterImpl.loadProperties("batch.properties");
+ } else {
+ return stringMap(
+ "dump_configuration", "true",
+ "cache_type", "none",
+ "neostore.propertystore.db.index.keys.mapped_memory", "5M",
+ "neostore.propertystore.db.index.mapped_memory", "5M",
+ "neostore.nodestore.db.mapped_memory", "50M",
+ "neostore.relationshipstore.db.mapped_memory", "250M",
+ "neostore.propertystore.db.mapped_memory", "200M",
+ "neostore.propertystore.db.strings.mapped_memory", "100M");
+ }
+ }
+
+ public static void main(String[] args) throws IOException {
+ if (args.length < 3) {
+ System.err.println("Usage java -jar batchimport.jar data/dir nodes.csv relationships.csv");
+ }
+ File graphDb = new File(args[0]);
+ File nodesFile = new File(args[1]);
+ File relationshipsFile = new File(args[2]);
+ if (!graphDb.exists()) graphDb.mkdirs();
+ Importer importBatch = new Importer(graphDb);
+ try {
+ if (nodesFile.exists()) importBatch.importNodes(nodesFile);
+ if (relationshipsFile.exists()) importBatch.importRelationships(relationshipsFile);
+ } finally {
+ importBatch.finish();
+ }
+ }
+
+ private void finish() {
+ db.shutdown();
+ report.finish();
+ }
+
+ static class Data {
+ private final Object[] data;
+ private final int offset;
+ private final String delim;
+
+ public Data(String header, String delim, int offset) {
+ this.offset = offset;
+ this.delim = delim;
+ String[] fields = header.split(delim);
+ data = new Object[(fields.length - offset) * 2];
+ for (int i = 0; i < fields.length - offset; i++) {
+ data[i * 2] = fields[i + offset];
+ }
+ }
+
+ public Object[] update(String line, Object... header) {
+ final String[] values = line.split(delim);
+ if (header.length > 0) {
+ System.arraycopy(values, 0, header, 0, header.length);
+ }
+ for (int i = 0; i < values.length - offset; i++) {
+ data[i * 2 + 1] = values[i + offset];
+ }
+ return data;
+ }
+
+ }
+
+ static class Report {
+ private final long batch;
+ private final long dots;
+ private long count;
+ private long total = System.currentTimeMillis(), time, batchTime;
+
+ public Report(long batch, int dots) {
+ this.batch = batch;
+ this.dots = batch / dots;
+ }
+
+ public void reset() {
+ count = 0;
+ batchTime = time = System.currentTimeMillis();
+ }
+
+ public void finish() {
+ System.out.println((System.currentTimeMillis() - total) / 1000 + " seconds ");
+ }
+
+ public void dots() {
+ if ((++count % dots) != 0) return;
+ System.out.print(".");
+ if ((count % batch) != 0) return;
+ long now = System.currentTimeMillis();
+ System.out.println((now - batchTime) + " ms for "+batch);
+ batchTime = now;
+ }
+
+ public void finishImport(String type) {
+ System.out.println("\nImporting " + count + " " + type + " took " + (System.currentTimeMillis() - time) / 1000 + " seconds ");
+ }
+ }
+
+ private void importNodes(File file) throws IOException {
+ BufferedReader bf = new BufferedReader(new FileReader(file));
+ final Data data = new Data(bf.readLine(), "\t", 0);
+ String line;
+ report.reset();
+ while ((line = bf.readLine()) != null) {
+ db.createNode(map(data.update(line)));
+ report.dots();
+ }
+ report.finishImport("Nodes");
+ }
+
+ private void importRelationships(File file) throws IOException {
+ BufferedReader bf = new BufferedReader(new FileReader(file));
+ final Data data = new Data(bf.readLine(), "\t", 3);
+ Object[] rel = new Object[3];
+ final Type type = new Type();
+ String line;
+ report.reset();
+ while ((line = bf.readLine()) != null) {
+ final Map<String, Object> properties = map(data.update(line, rel));
+ db.createRelationship(id(rel[0]), id(rel[1]), type.update(rel[2]), properties);
+ report.dots();
+ }
+ report.finishImport("Relationships");
+ }
+
+ static class Type implements RelationshipType {
+ String name;
+
+ public Type update(Object value) {
+ this.name = value.toString();
+ return this;
+ }
+
+ public String name() {
+ return name;
+ }
+ }
+
+ private long id(Object id) {
+ return Long.parseLong(id.toString());
+ }
+}
@@ -0,0 +1,37 @@
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Random;
+
+/**
+ * @author mh
+ * @since 13.01.12
+ */
+public class TestDataGenerator {
+
+ private static int NODES = 75 * 1000 * 100; // * 1000;
+ private static final int RELS_PER_NODE = 10;
+ private static final String[] TYPES = {"ONE","TWO","THREE","FOUR","FIVE","SIX","SEVEN","EIGHT","NINE","TEN"};
+
+ public static void main(String...args) throws IOException {
+ Random rnd = new Random();
+ long relCount=0, time = System.currentTimeMillis();
+ BufferedWriter nodeFile = new BufferedWriter(new FileWriter("nodes.csv"));
+ nodeFile.write("Node\tRels\tProperty\n");
+ BufferedWriter relFile = new BufferedWriter(new FileWriter("rels.csv"));
+ relFile.write("Start\tEnde\tType\tProperty\n");
+ for (int node = 0; node < NODES; node++) {
+ final int rels = rnd.nextInt(RELS_PER_NODE);
+ nodeFile.write(node+"\t"+rels+"\tTEST\n");
+ for (int rel = rels; rel >= 0; rel--) {
+ relCount++;
+ final int node1 = rnd.nextInt(NODES);
+ final int node2 = rnd.nextInt(NODES);
+ relFile.write(node1 + "\t" + node2 + "\t" + TYPES[rel] + "\t" + "Property"+ "\n");
+ }
+ }
+ nodeFile.close();
+ relFile.close();
+ System.out.println("Creating "+NODES+" and "+relCount+" Relationships took "+((System.currentTimeMillis()-time)/1000)+" seconds.");
+ }
+}

0 comments on commit 3af588a

Please sign in to comment.