-
Notifications
You must be signed in to change notification settings - Fork 0
/
TrecDocIterator.py
executable file
·60 lines (49 loc) · 1.24 KB
/
TrecDocIterator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
import cStringIO
import re
from org.apache.lucene.document import Document, Field, StringField, TextField
class TrecDocIterator:
    """Iterate over the ``<DOC>`` ... ``</DOC>`` records of a text file.

    The class follows the Java ``Iterator`` shape (``hasNext`` / ``next`` /
    ``remove``).  Each call to ``next`` consumes one record from the file and
    returns it as a Lucene ``Document`` with up to two fields:

    * ``docno``    -- the token following a ``<DOCNO>`` tag (stored).
    * ``contents`` -- the raw text of the record, tags included (not stored).

    End of file is only detected when a read returns nothing, so the last
    call to ``next`` before ``hasNext`` turns false returns a ``Document``
    with no fields; callers should check for the ``contents`` field.
    """

    # Compiled once for all instances; captures the identifier after <DOCNO>.
    _DOCNO_TAG = re.compile(r"<DOCNO>\s*(\S+)\s*<")

    # Class-level defaults only; every instance gets its own values in
    # __init__ so that several iterators can be used at the same time.
    fl = None        # open file handle the records are read from
    at_eof = False   # becomes True once readline() has returned ""

    def __init__(self, fileObj):
        """Open the file named by ``fileObj.toString()`` for reading.

        ``fileObj`` is any object whose ``toString()`` returns a file path.
        """
        path = fileObj.toString()
        self.fl = open(path, 'r')
        print("Reading " + path)
        self.at_eof = False

    def hasNext(self):
        """Return True until a read has hit the end of the file."""
        return not self.at_eof

    def next(self):
        """Read the next record and return it as a ``Document``.

        Lines before the next ``<DOC>`` line are skipped.  The returned
        document is empty (no fields) when no further record exists.  An
        ``IOError`` while reading is printed, and whatever was added to the
        document up to that point is returned.
        """
        doc = Document()
        if self.at_eof:
            # The handle was closed when EOF was reached; nothing to read.
            return doc

        lines = []
        in_doc = False
        try:
            while True:
                line = self.fl.readline()
                if line == "":
                    # readline() returns "" only at end of file.
                    self.at_eof = True
                    self.fl.close()
                    break
                if not in_doc:
                    if not line.startswith("<DOC>"):
                        continue
                    in_doc = True
                if line.startswith("</DOC>"):
                    lines.append(line)
                    break
                # search(), not match(): the tag need not start the line
                # (e.g. leading whitespace, or "<DOC><DOCNO>..." on one line).
                found = self._DOCNO_TAG.search(line)
                if found:
                    doc.add(StringField("docno", found.group(1),
                                        Field.Store.YES))
                lines.append(line)

            value = "".join(lines)
            if value:
                doc.add(TextField("contents", value, Field.Store.NO))
        except IOError as err:
            print(err)
        return doc

    def remove(self):
        """Do nothing; present only to complete the iterator interface."""
        pass