/
dict_to_4lang.py
197 lines (172 loc) · 6.69 KB
/
dict_to_4lang.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
from __future__ import with_statement
from collections import defaultdict
import json
import logging
import os
import sys
import threading
import time
import traceback
from collins_parser import CollinsParser
from corenlp_wrapper import CoreNLPWrapper
from dep_to_4lang import DepTo4lang
from eksz_parser import EkszParser
from entry_preprocessor import EntryPreprocessor
from lexicon import Lexicon
from longman_parser import LongmanParser
from magyarlanc_wrapper import Magyarlanc
from nszt_parser import NSzTParser
from utils import batches, ensure_dir, get_cfg
from wiktionary_parser import WiktParser
assert Lexicon # silence pyflakes (Lexicon must be imported for cPickle)
ONE_BY_ONE = False # run threads after one another (to avoid memory issues)
class DictTo4lang():
    """Build 4lang definition graphs from a monolingual dictionary.

    Parses the configured dictionary file into raw entries, preprocesses
    and dependency-parses each definition (CoreNLP for English,
    magyarlanc for Hungarian), and collects the unified entries in
    self.dictionary, which can be dumped to / reloaded from JSON.
    """

    def __init__(self, cfg):
        # cfg is a ConfigParser-style object; output/tmp/graph
        # directories are created eagerly so later writes cannot fail
        # on a missing path.
        self.dictionary = {}
        self.cfg = cfg
        self.output_fn = self.cfg.get('dict', 'output_file')
        ensure_dir(os.path.dirname(self.output_fn))
        self.tmp_dir = self.cfg.get('data', 'tmp_dir')
        ensure_dir(self.tmp_dir)
        self.graph_dir = self.cfg.get('machine', 'graph_dir')
        ensure_dir(self.graph_dir)
        self.get_parser_and_lang()
        self.machine_wrapper = None

    def get_parser_and_lang(self):
        """Set self.parser and self.lang from the configured input type.

        Raises:
            Exception: if the input type is not a known format.
        """
        input_type = self.cfg.get('dict', 'input_type')
        logging.info('input type: {0}'.format(input_type))
        if input_type == 'wiktionary':
            self.parser = WiktParser()
            self.lang = 'eng'
        elif input_type == 'longman':
            self.parser = LongmanParser()
            self.lang = 'eng'
        elif input_type == 'collins':
            self.parser = CollinsParser()
            self.lang = 'eng'
        elif input_type == 'eksz':
            self.parser = EkszParser()
            self.lang = 'hun'
        elif input_type == 'nszt':
            self.parser = NSzTParser()
            self.lang = 'hun'
        else:
            raise Exception('unknown input format: {0}'.format(input_type))

    def parse_dict(self):
        """Parse the configured input file into self.raw_dict.

        Entries sharing a headword are unified into one raw entry;
        entries without senses are skipped.
        """
        input_file = self.cfg.get('dict', 'input_file')
        self.raw_dict = defaultdict(dict)
        for entry in self.parser.parse_file(input_file):
            if 'senses' not in entry or entry['senses'] == []:
                continue  # TODO: decide how to handle sense-less entries
            self.unify(self.raw_dict[entry['hw']], entry)

    def unify(self, entry1, entry2):
        """Merge entry2 into entry1 in place.

        An empty entry1 is overwritten with entry2's fields; otherwise
        the two must share a headword and their sense lists are
        concatenated.

        Raises:
            Exception: if the two entries have different headwords.
        """
        if entry1 == {}:
            entry1.update(entry2)
        elif entry1['hw'] != entry2['hw']:
            raise Exception(
                "cannot unify entries with different headwords: " +
                "{0} vs. {1}".format(entry1['hw'], entry2['hw']))
        else:
            entry1['senses'] += entry2['senses']

    def process_entries(self, words):
        """Preprocess and dependency-parse the entries for the given
        headwords, then merge them into self.dictionary.

        Runs on a worker thread (see process_entries_thread).
        """
        entry_preprocessor = EntryPreprocessor(self.cfg)
        # list comprehension instead of map() so entries is a real list
        # on Python 3 as well (map() would be a one-shot iterator there)
        entries = [entry_preprocessor.preprocess_entry(self.raw_dict[word])
                   for word in words]

        if self.lang == 'eng':
            corenlp_wrapper = CoreNLPWrapper(self.cfg)
            entries = corenlp_wrapper.parse_entries(entries)
        elif self.lang == 'hun':
            magyarlanc_wrapper = Magyarlanc(self.cfg)
            entries = magyarlanc_wrapper.parse_entries(entries)
        else:
            # was a bare Python 2 print statement; route the error
            # through logging like the rest of the module
            logging.error('incorrect lang')

        for entry in entries:
            if entry['to_filter']:
                continue
            word = entry['hw']
            # NOTE(review): this loop only inspects senses and has no
            # effect as written (the inner continue is a no-op at the
            # end of the body) — kept for fidelity; confirm intent.
            for sense in entry['senses']:
                definition = sense['definition']
                if definition is None:
                    continue

            if word in self.dictionary:
                logging.warning(
                    "entries with identical headwords:\n{0}\n{1}".format(
                        entry, self.dictionary[word]))
                self.unify(self.dictionary[word], entry)
            else:
                self.dictionary[word] = entry

    def process_entries_thread(self, i, words):
        """Thread target: run process_entries and record success or
        failure in self.thread_states[i] instead of letting the
        exception escape the thread."""
        try:
            self.process_entries(words)
        except Exception:
            # was a bare except:; narrowed so KeyboardInterrupt and
            # SystemExit are no longer swallowed
            self.thread_states[i] = False
            traceback.print_exc()
        else:
            self.thread_states[i] = True

    def run(self, no_threads=1):
        """Parse the dictionary, then process all entries on up to
        no_threads worker threads (sequentially if ONE_BY_ONE is set),
        blocking until every batch has finished.

        Raises:
            Exception: if any worker batch failed.
        """
        logging.info('parsing xml...')
        self.parse_dict()
        # floor division: identical on Python 2, keeps this an int on
        # Python 3
        entries_per_thread = (len(self.raw_dict) // no_threads) + 1
        self.thread_states = {}
        # may turn out to be less than "no_threads" with small input
        started_threads = 0
        if ONE_BY_ONE:
            logging.warning('running threads one by one!')
        # list() so batches() can slice the keys on Python 3 as well
        for i, batch in enumerate(batches(list(self.raw_dict.keys()),
                                          entries_per_thread)):
            if ONE_BY_ONE:
                logging.warning('running batch #{0}'.format(i))
                self.process_entries_thread(i, batch)
            else:
                t = threading.Thread(
                    target=self.process_entries_thread, args=(i, batch))
                t.start()
            started_threads += 1
        logging.info("started {0} threads".format(started_threads))
        # poll until every started batch has reported a state
        while True:
            if len(self.thread_states) < started_threads:
                time.sleep(1)
                continue
            elif all(self.thread_states.values()):
                # bug fix: used to report no_threads, which overstates
                # the count when fewer batches than threads were started
                logging.info(
                    "{0} threads finished successfully".format(
                        started_threads))
                break
            else:
                raise Exception("some threads failed")

    def read_dict(self):
        """Reload a previously dumped dictionary from output_file."""
        logging.info(
            'loading dict_to_4lang intermediate state from {0}'.format(
                self.output_fn))
        with open(self.output_fn, 'r') as dict_file:
            self.dictionary = json.load(dict_file)
        logging.info('done!')

    def print_dict(self, stream=None):
        """Dump self.dictionary as JSON to stream, or to output_file
        when no stream is given."""
        if stream is None:
            with open(self.output_fn, 'w') as out:
                json.dump(self.dictionary, out)
        else:
            json.dump(self.dictionary, stream)
def main():
    """Command-line entry point.

    Usage: dict_to_4lang.py [cfg_file] [no_threads]

    Builds the 4lang dictionary, dumps it to JSON, then builds, saves
    and prints the definition graphs.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : " +
        "%(module)s (%(lineno)s) - %(levelname)s - %(message)s")

    args = sys.argv
    cfg_file = args[1] if len(args) > 1 else None
    no_threads = int(args[2]) if len(args) > 2 else 1
    cfg = get_cfg(cfg_file)

    converter = DictTo4lang(cfg)
    converter.run(no_threads)
    converter.print_dict()

    graph_builder = DepTo4lang(cfg)
    graph_builder.dep_to_4lang()
    graph_builder.save_machines()
    graph_builder.print_graphs()


if __name__ == '__main__':
    main()