-
Notifications
You must be signed in to change notification settings - Fork 0
/
conllutil.py
123 lines (101 loc) · 3.9 KB
/
conllutil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from nltk.parse.dependencygraph import DependencyGraph
from os import path
import re, sys
# TODO this class could actually implement a generator (for the trees)
class CoNLLData(object):
def __init__(self, root_dir, file_list, tok_to_ix_map, min_len=0, max_len=sys.maxsize, lazy_loading=False, word_transform=lambda s: s):
self._sequences = []
punctuation = re.compile('\.|,|\?|!|:|;')
for fname in file_list:
with open(path.join(root_dir, fname)) as f:
lines = f.readlines()
sequence = []
roots = 0
drop_it = False
for line in lines:
line = line.strip()
if line:
tpl = line.split('\t')[:8]
if tpl[-1] == 'S':
roots += 1
if word_transform(tpl[1]) in tok_to_ix_map:
sequence.append([tpl[0]]+[word_transform(tpl[1])]+tpl[2:])
elif punctuation.match(tpl[0][0]): drop_it = True
else:
if not drop_it and roots == 1 and len(sequence) >= min_len and len(sequence) <= max_len:
self._sequences.append(sequence)
sequence = []
roots = 0
drop_it = False
self._fix_indices()
self._wordseqs = None if lazy_loading else [[tup[1] for tup in seq] for seq in self._sequences]
self._posseqs = None if lazy_loading else [[tup[4] for tup in seq] for seq in self._sequences]
if lazy_loading:
self._word_transform = word_transform
self._tok2ix = tok_to_ix_map
self.reset()
self.reset_sentences()
def wordsequences(self):
return self._wordseqs if self._wordseqs else [[self._word_transform(tup[1]) for tup in seq] for seq in self._sequences]
def sequences(self):
return self._sequences
def possequences(self):
return self._posseqs if self._posseqs else [[tup[4] for tup in seq] for seq in self._sequences]
def iter_posseqs(self):
for seq in self._sequences:
yield [tup[4] for tup in seq]
def _trees(self):
for seq in self._sequences:
gs = ''
for nd in seq:
gs += '\t'.join(nd+['_','_']) + '\n'
try:
yield DependencyGraph(gs, top_relation_label='S', cell_separator='\t')
except UserWarning:
yield None
def _sentences(self):
s_ix = -1
for seq in self._sequences:
gs = ''
s_ix += 1
pos_seq = []
ix_seq = []
for nd in seq:
gs += '\t'.join(nd+['_','_']) + '\n'
ix_seq.append(self._tok2ix[nd[1]])
pos_seq.append(nd[4])
try:
graph = DependencyGraph(gs, top_relation_label='S', cell_separator='\t')
yield s_ix, ix_seq, pos_seq, graph
except UserWarning:
continue
def tree(self):
try:
return next(self._tree)
except StopIteration:
return
def sentence(self):
try:
return next(self._sentence)
except StopIteration:
return
def trees(self):
self.reset()
return self._trees()
def reset(self):
self._tree = self._trees()
def reset_sentences(self):
self._sentence = self._sentences()
def _fix_indices(self):
for seq in self._sequences:
expected_id = 1
for elem in seq:
if elem[0] != str(expected_id):
self._fix_index(seq, elem[0], str(expected_id))
expected_id += 1
def _fix_index(self, seq, i_old, i_new):
for elem in seq:
if elem[0] == i_old:
elem[0] = i_new
if elem[-2] == i_old:
elem[-2] = i_new