/
ReconstructCommand.java
145 lines (122 loc) · 4.03 KB
/
ReconstructCommand.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
package com.senseidb.clue.commands;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import com.senseidb.clue.ClueContext;
/**
 * Clue command that reconstructs the indexed content of a field for a single
 * document by walking every term of the field and collecting the terms (and
 * positions, when indexed) whose postings contain the target document.
 *
 * Usage: {@code reconstruct <field> <doc>}
 */
public class ReconstructCommand extends ClueCommand {

  public ReconstructCommand(ClueContext ctx) {
    super(ctx);
  }

  @Override
  public String getName() {
    return "reconstruct";
  }

  @Override
  public String help() {
    return "reconstructs an indexed field for a document";
  }

  /**
   * Rebuilds the field text for {@code docid} using term positions, so the
   * output reflects original token order.
   *
   * @param te       terms enumeration for the field
   * @param docid    segment-local document id
   * @param liveDocs live-doc bits for the segment; may be {@code null}
   * @return terms ordered by position, each rendered as {@code term(pos) }
   * @throws IOException on index access failure
   */
  public String reconstructWithPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException {
    // position -> terms at that position; TreeMap keeps positions sorted
    TreeMap<Integer, List<String>> docTextMap = new TreeMap<Integer, List<String>>();
    BytesRef text;
    DocsAndPositionsEnum dpe = null;
    while ((text = te.next()) != null) {
      dpe = te.docsAndPositions(liveDocs, dpe);
      if (dpe == null) {
        // term carries no positions data; nothing to reconstruct for it
        continue;
      }
      int iterDoc;
      while ((iterDoc = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        if (iterDoc == docid) {
          int freq = dpe.freq();
          for (int i = 0; i < freq; ++i) {
            int pos = dpe.nextPosition();
            List<String> textList = docTextMap.get(pos);
            if (textList == null) {
              textList = new ArrayList<String>();
              docTextMap.put(pos, textList);
            }
            textList.add(text.utf8ToString());
          }
          break; // postings are doc-ordered; no later doc can equal docid
        }
      }
    }
    StringBuilder buf = new StringBuilder();
    for (Entry<Integer, List<String>> entry : docTextMap.entrySet()) {
      Integer pos = entry.getKey();
      for (String term : entry.getValue()) {
        buf.append(term).append('(').append(pos).append(") ");
      }
    }
    return buf.toString();
  }

  /**
   * Rebuilds the field content for {@code docid} without positions: terms are
   * emitted in term (lexicographic) order, not original token order.
   *
   * @param te       terms enumeration for the field
   * @param docid    segment-local document id
   * @param liveDocs live-doc bits for the segment; may be {@code null}
   * @return space-separated terms occurring in the document
   * @throws IOException on index access failure
   */
  public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException {
    List<String> textList = new ArrayList<String>();
    BytesRef text;
    DocsEnum dpe = null;
    while ((text = te.next()) != null) {
      dpe = te.docs(liveDocs, dpe);
      int iterDoc;
      while ((iterDoc = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        if (iterDoc == docid) {
          textList.add(text.utf8ToString());
          break; // postings are doc-ordered; stop scanning this term
        }
      }
    }
    StringBuilder buf = new StringBuilder();
    for (String s : textList) {
      buf.append(s).append(' ');
    }
    return buf.toString();
  }

  @Override
  public void execute(String[] args, PrintStream out) throws Exception {
    if (args.length != 2) {
      out.println("usage: field doc");
      return;
    }
    String field = args[0];
    int doc = Integer.parseInt(args[1]);
    IndexReader reader = ctx.getIndexReader();
    List<AtomicReaderContext> leaves = reader.leaves();
    boolean found = false;
    // "leaf" rather than "ctx" — the original name shadowed the inherited ctx field
    for (AtomicReaderContext leaf : leaves) {
      AtomicReader atomicReader = leaf.reader();
      FieldInfo finfo = atomicReader.getFieldInfos().fieldInfo(field);
      if (finfo == null) continue;
      if (!finfo.isIndexed()) {
        out.println(field + " is not an indexed field");
        return;
      }
      int docID = doc - leaf.docBase;
      // BUGFIX: require docID < maxDoc() as well; with only "docID >= 0" the
      // first segment claims every doc id at or above its docBase, so docs in
      // later segments were reconstructed against the wrong segment.
      if (docID >= 0 && docID < atomicReader.maxDoc()) {
        Terms terms = atomicReader.terms(field);
        if (terms == null) {
          // field is indexed but this segment holds no terms for it
          continue;
        }
        boolean hasPositions = terms.hasPositions();
        TermsEnum te = terms.iterator(null);
        if (hasPositions) {
          out.println(reconstructWithPositions(te, docID, atomicReader.getLiveDocs()));
        }
        else {
          out.println(reconstructNoPositions(te, docID, atomicReader.getLiveDocs()));
        }
        found = true;
        break;
      }
    }
    if (!found) {
      out.println(doc + " not found");
      return;
    }
  }
}