Skip to content
This repository

Feature Request: Allow for Indexing #3

Merged
merged 1 commit into from almost 2 years ago

3 participants

Max De Marzi Michael Hunger Peter
Max De Marzi

Allowing users to specify csv files for fulltext and exact indexing of nodes and relationships.

Michael Hunger
Owner
jexp commented July 02, 2012

Max, could you squash your commits into one? Run git rebase -i ccd110e, then "p"ick the first commit and "s"quash the others into it so that a single commit remains.

Max De Marzi

Done.

Michael Hunger
Owner
jexp commented July 03, 2012

Thanks,

What do you think about my suggestion of skipping these additional index files and rather using the header for this? Seems less involved?

Max De Marzi

I think doing it this way we capture the more general case and avoid problems with:

  • wanting a field to be indexed in two indexes
  • indexes that use values not included in the nodes (I could see this for aliases, or i18n)
  • easier to update the importer as you add a file without messing with your nodes.csv
Michael Hunger jexp merged commit 69c442d into from July 03, 2012
Michael Hunger jexp closed this July 03, 2012
Michael Hunger
Owner
jexp commented July 03, 2012

I think those are rather the 10% use cases so I'm still not convinced that they are worth the effort :)

I think I'll implement the other solution for the 90% case

Peter

Two side notes: always close writers/streams, and do it in a finally clause; also specify the encoding for the streams.

Max De Marzi

I'm a Ruby guy, so I did what I could... >8-]

Sounds like somebody (cough Peter cough) is volunteering a pull-request?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Showing 1 unique commit by 1 author.

Jul 03, 2012
Max De Marzi updating libraries and adding Lucene dependency in order to
implement node and relationship indexing.
0e47380
This page is out of date. Refresh to see the latest.
1  .gitignore
... ...
@@ -1,3 +1,4 @@
  1
+.project
1 2
 .shell_history
2 3
 .idea
3 4
 *.ipr
8  pom.xml
@@ -14,8 +14,14 @@
14 14
         <dependency>
15 15
             <groupId>org.neo4j</groupId>
16 16
             <artifactId>neo4j-kernel</artifactId>
17  
-            <version>1.6.M02</version>
  17
+            <version>1.8.M05</version>
18 18
         </dependency>
  19
+        <dependency>
  20
+            <groupId>org.neo4j</groupId>
  21
+            <artifactId>neo4j-lucene-index</artifactId>
  22
+            <version>1.8.M05</version>
  23
+        </dependency>
  24
+
19 25
     </dependencies>
20 26
     <build>
21 27
         <plugins>
31  readme.txt
@@ -57,4 +57,33 @@ Importing 7500000 Nodes took 17 seconds
57 57
 Importing 41246740 Relationships took 170 seconds 
58 58
 212 seconds 
59 59
 ynagzet:batchimport mh$ du -sh target/db/
60  
-3,2G	target/db/
  60
+3,2G	target/db/
  61
+
  62
+Optionally you can add nodes and relationships to indexes.
  63
+
  64
+Add four arguments per index to the command line:
  65
+
  66
+To create a full text node index called users using nodes_index.csv:
  67
+node_index users fulltext nodes_index.csv 
  68
+
  69
+To create an exact relationship index called worked using rels_index.csv:
  70
+rel_index worked exact rels_index.csv
  71
+
  72
+Example command line:
  73
+java -server -Xmx4G -jar ../batch-import/target/batch-import-jar-with-dependencies.jar neo4j/data/graph.db nodes.csv rels.csv node_index users fulltext nodes_index.csv rel_index worked exact rels_index.csv
  74
+
  75
+nodes_index.csv
  76
+
  77
+id	name	language
  78
+1	Victor Richards	West Frisian
  79
+2	Virginia Shaw	Korean
  80
+3	Lois Simpson	Belarusian
  81
+4	Randy Bishop	Hiri Motu
  82
+5	Lori Mendoza	Tok Pisin
  83
+
  84
+rels_index.csv
  85
+
  86
+id	property1	property2
  87
+0	cwqbnxrv	rpyqdwhk
  88
+1	qthnrret	tzjmmhta
  89
+2	dtztaqpy	pbmcdqyc
128  src/main/java/org/neo4j/batchimport/Importer.java
... ...
@@ -1,41 +1,59 @@
1 1
 package org.neo4j.batchimport;
2 2
 
3 3
 import org.neo4j.graphdb.RelationshipType;
4  
-import org.neo4j.kernel.impl.batchinsert.BatchInserterImpl;
  4
+import org.neo4j.unsafe.batchinsert.BatchInserter;
  5
+import org.neo4j.unsafe.batchinsert.BatchInserters;
  6
+import org.neo4j.unsafe.batchinsert.BatchInserterImpl;
  7
+import org.neo4j.unsafe.batchinsert.BatchInserterIndexProvider;
  8
+import org.neo4j.unsafe.batchinsert.BatchInserterIndex;
  9
+import org.neo4j.unsafe.batchinsert.LuceneBatchInserterIndexProvider;
5 10
 
6 11
 import java.io.BufferedReader;
7 12
 import java.io.File;
8 13
 import java.io.FileReader;
  14
+import java.io.FileWriter;
9 15
 import java.io.IOException;
  16
+import java.util.HashMap;
10 17
 import java.util.Map;
  18
+import org.neo4j.helpers.collection.MapUtil;
11 19
 
12 20
 import static org.neo4j.helpers.collection.MapUtil.map;
13 21
 import static org.neo4j.helpers.collection.MapUtil.stringMap;
  22
+import static org.neo4j.index.impl.lucene.LuceneIndexImplementation.EXACT_CONFIG;
14 23
 
15 24
 public class Importer {
16 25
     private static Report report;
17  
-    private BatchInserterImpl db;
18  
-
  26
+    private BatchInserter db;
  27
+    private BatchInserterIndexProvider lucene;
  28
+    
19 29
     public Importer(File graphDb) {
20  
-        final Map<String, String> config = getConfig();
21  
-        db = new BatchInserterImpl(graphDb.getAbsolutePath(), config);
22  
-        report = new Report(10 * 1000 * 1000, 100);
23  
-    }
24  
-
25  
-    private Map<String, String> getConfig() {
26  
-        if (new File("batch.properties").exists()) {
27  
-            return BatchInserterImpl.loadProperties("batch.properties");
28  
-        } else {
29  
-            return stringMap(
30  
-                    "dump_configuration", "true",
31  
-                    "cache_type", "none",
32  
-                    "neostore.propertystore.db.index.keys.mapped_memory", "5M",
33  
-                    "neostore.propertystore.db.index.mapped_memory", "5M",
34  
-                    "neostore.nodestore.db.mapped_memory", "50M",
35  
-                    "neostore.relationshipstore.db.mapped_memory", "250M",
36  
-                    "neostore.propertystore.db.mapped_memory", "200M",
37  
-                    "neostore.propertystore.db.strings.mapped_memory", "100M");
  30
+    	Map<String, String> config = new HashMap<String, String>();
  31
+    	try {
  32
+	        if (new File("batch.properties").exists()) {
  33
+	        	System.out.println("Using Existing Configuration File");
  34
+	        } else {
  35
+		        System.out.println("Writing Configuration File to batch.properties");
  36
+				FileWriter fw = new FileWriter( "batch.properties" );
  37
+		        fw.append( "neostore.nodestore.db.mapped_memory=100M\n"
  38
+		                 + "neostore.relationshipstore.db.mapped_memory=1G\n"
  39
+		                 + "neostore.propertystore.db.mapped_memory=250M\n"
  40
+		                 + "neostore.propertystore.db.strings.mapped_memory=100M\n"
  41
+		                 + "neostore.propertystore.db.arrays.mapped_memory=0M\n"
  42
+		                 + "neostore.propertystore.db.index.keys.mapped_memory=15M\n"
  43
+		                 + "neostore.propertystore.db.index.mapped_memory=15M" );
  44
+		        fw.close();
  45
+	        }
  46
+
  47
+        config = MapUtil.load( new File(
  48
+                "batch.properties" ) );
  49
+
  50
+        } catch (Exception e) {
  51
+    		System.out.println(e.getMessage());
38 52
         }
  53
+                
  54
+        db = BatchInserters.inserter(graphDb.getAbsolutePath(), config);
  55
+        lucene = new LuceneBatchInserterIndexProvider(db);
  56
+        report = new Report(10 * 1000 * 1000, 100);
39 57
     }
40 58
 
41 59
     public static void main(String[] args) throws IOException {
@@ -45,17 +63,33 @@ public static void main(String[] args) throws IOException {
45 63
         File graphDb = new File(args[0]);
46 64
         File nodesFile = new File(args[1]);
47 65
         File relationshipsFile = new File(args[2]);
  66
+        File indexFile = new File("");
  67
+        String indexName = "";
  68
+        String indexType = "";
  69
+        
48 70
         if (!graphDb.exists()) graphDb.mkdirs();
49 71
         Importer importBatch = new Importer(graphDb);
50 72
         try {
51 73
             if (nodesFile.exists()) importBatch.importNodes(nodesFile);
52  
-            if (relationshipsFile.exists()) importBatch.importRelationships(relationshipsFile);
53  
-        } finally {
  74
+            if (relationshipsFile.exists()) importBatch.importRelationships(relationshipsFile);         
  75
+			for (int i = 3; i < args.length; i = i + 4) {
  76
+				indexName = args[i+1];
  77
+				indexType = args[i+2];
  78
+				indexFile = new File(args[i + 3]);
  79
+				if (args[i].equals("node_index")) {
  80
+					if (indexFile.exists()) importBatch.importNodeIndexes(indexFile, indexName, indexType);
  81
+				} else {
  82
+					if (indexFile.exists()) importBatch.importRelationshipIndexes(indexFile, indexName, indexType);
  83
+				}
  84
+			
  85
+			}
  86
+		} finally {
54 87
             importBatch.finish();
55 88
         }
56 89
     }
57 90
 
58 91
     private void finish() {
  92
+        lucene.shutdown();
59 93
         db.shutdown();
60 94
         report.finish();
61 95
     }
@@ -149,6 +183,54 @@ private void importRelationships(File file) throws IOException {
149 183
         report.finishImport("Relationships");
150 184
     }
151 185
 
  186
+    private void importNodeIndexes(File file, String indexName, String indexType) throws IOException {
  187
+    	BatchInserterIndex index;
  188
+    	if (indexType.equals("fulltext")) {
  189
+    		index = lucene.nodeIndex( indexName, stringMap( "type", "fulltext" ) );
  190
+    	} else {
  191
+    		index = lucene.nodeIndex( indexName, EXACT_CONFIG );
  192
+    	}
  193
+        
  194
+        BufferedReader bf = new BufferedReader(new FileReader(file));
  195
+        
  196
+        final Data data = new Data(bf.readLine(), "\t", 1);
  197
+        Object[] node = new Object[1];
  198
+        String line;
  199
+        report.reset();
  200
+        while ((line = bf.readLine()) != null) {        
  201
+            final Map<String, Object> properties = map(data.update(line, node));
  202
+            index.add(id(node[0]), properties);
  203
+            report.dots();
  204
+        }
  205
+                
  206
+        report.finishImport("Nodes into " + indexName + " Index");
  207
+    }
  208
+
  209
+    private void importRelationshipIndexes(File file, String indexName, String indexType) throws IOException {
  210
+    	BatchInserterIndex index;
  211
+    	if (indexType.equals("fulltext")) {
  212
+    		index = lucene.relationshipIndex( indexName, stringMap( "type", "fulltext" ) );
  213
+    	} else {
  214
+    		index = lucene.relationshipIndex( indexName, EXACT_CONFIG );
  215
+    	}
  216
+
  217
+        BufferedReader bf = new BufferedReader(new FileReader(file));
  218
+        
  219
+        final Data data = new Data(bf.readLine(), "\t", 1);
  220
+        Object[] rel = new Object[1];
  221
+        String line;
  222
+        report.reset();
  223
+        while ((line = bf.readLine()) != null) {        
  224
+            final Map<String, Object> properties = map(data.update(line, rel));
  225
+            index.add(id(rel[0]), properties);
  226
+            report.dots();
  227
+        }
  228
+                
  229
+        report.finishImport("Relationships into " + indexName + " Index");
  230
+
  231
+    }
  232
+
  233
+
152 234
     static class Type implements RelationshipType {
153 235
         String name;
154 236
 
Commit_comment_tip

Tip: You can add notes to lines in a file. Hover to the left of a line to make a note

Something went wrong with that request. Please try again.