Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
added reconstruct functionality
  • Loading branch information
javasoze committed Jun 1, 2013
1 parent 8f1afa1 commit c71734a
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 1 deletion.
2 changes: 2 additions & 0 deletions src/main/java/com/senseidb/clue/ClueContext.java
Expand Up @@ -22,6 +22,7 @@
import com.senseidb.clue.commands.NormsCommand;
import com.senseidb.clue.commands.PostingsCommand;
import com.senseidb.clue.commands.ReadonlyCommand;
import com.senseidb.clue.commands.ReconstructCommand;
import com.senseidb.clue.commands.SearchCommand;
import com.senseidb.clue.commands.StoredFieldCommand;
import com.senseidb.clue.commands.TermVectorCommand;
Expand Down Expand Up @@ -63,6 +64,7 @@ public ClueContext(Directory directory, IndexReaderFactory readerFactory,
new NormsCommand(this);
new TermVectorCommand(this);
new StoredFieldCommand(this);
new ReconstructCommand(this);
}

public void registerCommand(ClueCommand cmd){
Expand Down
145 changes: 145 additions & 0 deletions src/main/java/com/senseidb/clue/commands/ReconstructCommand.java
@@ -0,0 +1,145 @@
package com.senseidb.clue.commands;

import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

import com.senseidb.clue.ClueContext;

public class ReconstructCommand extends ClueCommand {

public ReconstructCommand(ClueContext ctx) {
super(ctx);
}

@Override
public String getName() {
return "reconstruct";
}

@Override
public String help() {
return "reconstructs an indexed field for a document";
}

public String reconstructWithPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{
TreeMap<Integer,List<String>> docTextMap = new TreeMap<Integer,List<String>>();
BytesRef text;
DocsAndPositionsEnum dpe = null;
while ((text = te.next()) != null) {
dpe = te.docsAndPositions(liveDocs, dpe);
int iterDoc;
while ((iterDoc = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (iterDoc == docid) {
int freq = dpe.freq();
for (int i = 0; i < freq; ++i) {
int pos = dpe.nextPosition();
List<String> textList = docTextMap.get(pos);
if (textList == null) {
textList = new ArrayList<String>();
docTextMap.put(pos, textList);
}
textList.add(text.utf8ToString());
}
}
}
}
StringBuffer buf = new StringBuffer();
for (Entry<Integer, List<String>> entry : docTextMap.entrySet()) {
Integer pos = entry.getKey();
List<String> terms = entry.getValue();
for (String term : terms) {
buf.append(term+"("+pos+") ");
}
}
return buf.toString();
}

public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{
List<String> textList = new ArrayList<String>();
BytesRef text;
DocsEnum dpe = null;
while ((text = te.next()) != null) {
dpe = te.docs(liveDocs, dpe);
int iterDoc;
while ((iterDoc = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (iterDoc == docid) {
textList.add(text.utf8ToString());
}
}
}
StringBuffer buf = new StringBuffer();
for (String s : textList) {
buf.append(s+" ");
}
return buf.toString();
}


@Override
public void execute(String[] args, PrintStream out) throws Exception {
if (args.length != 2) {
out.println("usage: field doc");
return;
}

String field = args[0];

int doc = Integer.parseInt(args[1]);

IndexReader reader = ctx.getIndexReader();
List<AtomicReaderContext> leaves = reader.leaves();

boolean found = false;


for (AtomicReaderContext ctx : leaves) {
AtomicReader atomicReader = ctx.reader();
FieldInfo finfo = atomicReader.getFieldInfos().fieldInfo(field);
if (finfo == null) continue;

if (!finfo.isIndexed()) {
out.println(field+" is not an indexed field");
return;
}
int docID = doc - ctx.docBase;
if (docID >= 0) {
Terms terms = atomicReader.terms(field);
boolean hasPositions = terms.hasPositions();

TermsEnum te = terms.iterator(null);
if (hasPositions) {
out.println(reconstructWithPositions(te, docID, atomicReader.getLiveDocs()));
}
else {
out.println(reconstructNoPositions(te, docID, atomicReader.getLiveDocs()));
}
found = true;
break;
}
}

if (!found) {
out.println(doc + " not found");
return;
}


}

}
Expand Up @@ -33,7 +33,7 @@ public String help() {
@Override
public void execute(String[] args, PrintStream out) throws Exception {
if (args.length != 2) {
out.println("usage: field doc1,doc2...");
out.println("usage: field doc");
return;
}

Expand Down

0 comments on commit c71734a

Please sign in to comment.