Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
added reconstruct functionality
- Loading branch information
Showing
3 changed files
with
148 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
145 changes: 145 additions & 0 deletions
145
src/main/java/com/senseidb/clue/commands/ReconstructCommand.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
package com.senseidb.clue.commands; | ||
|
||
import java.io.IOException; | ||
import java.io.PrintStream; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Map.Entry; | ||
import java.util.TreeMap; | ||
|
||
import org.apache.lucene.index.AtomicReader; | ||
import org.apache.lucene.index.AtomicReaderContext; | ||
import org.apache.lucene.index.DocsAndPositionsEnum; | ||
import org.apache.lucene.index.DocsEnum; | ||
import org.apache.lucene.index.FieldInfo; | ||
import org.apache.lucene.index.IndexReader; | ||
import org.apache.lucene.index.Terms; | ||
import org.apache.lucene.index.TermsEnum; | ||
import org.apache.lucene.search.DocIdSetIterator; | ||
import org.apache.lucene.util.Bits; | ||
import org.apache.lucene.util.BytesRef; | ||
|
||
import com.senseidb.clue.ClueContext; | ||
|
||
public class ReconstructCommand extends ClueCommand { | ||
|
||
public ReconstructCommand(ClueContext ctx) { | ||
super(ctx); | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return "reconstruct"; | ||
} | ||
|
||
@Override | ||
public String help() { | ||
return "reconstructs an indexed field for a document"; | ||
} | ||
|
||
public String reconstructWithPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{ | ||
TreeMap<Integer,List<String>> docTextMap = new TreeMap<Integer,List<String>>(); | ||
BytesRef text; | ||
DocsAndPositionsEnum dpe = null; | ||
while ((text = te.next()) != null) { | ||
dpe = te.docsAndPositions(liveDocs, dpe); | ||
int iterDoc; | ||
while ((iterDoc = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { | ||
if (iterDoc == docid) { | ||
int freq = dpe.freq(); | ||
for (int i = 0; i < freq; ++i) { | ||
int pos = dpe.nextPosition(); | ||
List<String> textList = docTextMap.get(pos); | ||
if (textList == null) { | ||
textList = new ArrayList<String>(); | ||
docTextMap.put(pos, textList); | ||
} | ||
textList.add(text.utf8ToString()); | ||
} | ||
} | ||
} | ||
} | ||
StringBuffer buf = new StringBuffer(); | ||
for (Entry<Integer, List<String>> entry : docTextMap.entrySet()) { | ||
Integer pos = entry.getKey(); | ||
List<String> terms = entry.getValue(); | ||
for (String term : terms) { | ||
buf.append(term+"("+pos+") "); | ||
} | ||
} | ||
return buf.toString(); | ||
} | ||
|
||
public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException{ | ||
List<String> textList = new ArrayList<String>(); | ||
BytesRef text; | ||
DocsEnum dpe = null; | ||
while ((text = te.next()) != null) { | ||
dpe = te.docs(liveDocs, dpe); | ||
int iterDoc; | ||
while ((iterDoc = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { | ||
if (iterDoc == docid) { | ||
textList.add(text.utf8ToString()); | ||
} | ||
} | ||
} | ||
StringBuffer buf = new StringBuffer(); | ||
for (String s : textList) { | ||
buf.append(s+" "); | ||
} | ||
return buf.toString(); | ||
} | ||
|
||
|
||
@Override | ||
public void execute(String[] args, PrintStream out) throws Exception { | ||
if (args.length != 2) { | ||
out.println("usage: field doc"); | ||
return; | ||
} | ||
|
||
String field = args[0]; | ||
|
||
int doc = Integer.parseInt(args[1]); | ||
|
||
IndexReader reader = ctx.getIndexReader(); | ||
List<AtomicReaderContext> leaves = reader.leaves(); | ||
|
||
boolean found = false; | ||
|
||
|
||
for (AtomicReaderContext ctx : leaves) { | ||
AtomicReader atomicReader = ctx.reader(); | ||
FieldInfo finfo = atomicReader.getFieldInfos().fieldInfo(field); | ||
if (finfo == null) continue; | ||
|
||
if (!finfo.isIndexed()) { | ||
out.println(field+" is not an indexed field"); | ||
return; | ||
} | ||
int docID = doc - ctx.docBase; | ||
if (docID >= 0) { | ||
Terms terms = atomicReader.terms(field); | ||
boolean hasPositions = terms.hasPositions(); | ||
|
||
TermsEnum te = terms.iterator(null); | ||
if (hasPositions) { | ||
out.println(reconstructWithPositions(te, docID, atomicReader.getLiveDocs())); | ||
} | ||
else { | ||
out.println(reconstructNoPositions(te, docID, atomicReader.getLiveDocs())); | ||
} | ||
found = true; | ||
break; | ||
} | ||
} | ||
|
||
if (!found) { | ||
out.println(doc + " not found"); | ||
return; | ||
} | ||
|
||
|
||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters