-
Notifications
You must be signed in to change notification settings - Fork 3
/
APT.py
471 lines (376 loc) · 16.6 KB
/
APT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
# -*- coding: UTF-8 -*-
#####################################################################
# APT.py
#
# This program calculates the accuracy of pronoun translation.
#
# Copyright (c) 20xx Idiap Research Institute, http://www.idiap.ch/
# Written by Lesly Miculich <Lesly.Miculich@idiap.ch>
#
# This file is part of APT.
#
# APT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3 as
# published by the Free Software Foundation.
#
# APT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with APT. If not, see <http://www.gnu.org/licenses/>.
#
####################################################################
import sys, os
import ConfigParser
import codecs
import numpy
import improve_alignment
# Separators and formatting constants used throughout the script.
sep_nl = "\n"   # line separator for generated output
sep_tab = "\t"  # column separator for output tables
sep_esp = " "   # token separator in sentence and alignment files
sep_lin = "-"   # separator inside alignment pairs ("src-tgt"); also the
                # placeholder printed for unaligned words
encoding = "utf-8"  # encoding for all text files opened with codecs
# When True, the "similar" dictionary rows carry an explicit probability
# column (source, ref, target, prob); when False they are plain word groups.
case_2_probabilities = False
other_category = "OTHER"  # label for target words outside the pronoun list
none_category = "NONE"    # label appended when no target pronoun list is given
# When True, aligned word sets must match exactly; when False any
# non-empty intersection counts as a match.
full_match = False
def normalize_word(word, list_target_words, dict_equal):
    """Map *word* to its canonical index in list_target_words.

    Words grouped together in dict_equal are collapsed onto the index of
    their group's first word; otherwise the word's own index is used.

    Returns the integer index, or None when the word is unknown.
    NOTE: 0 is a valid result, so callers must test `is not None`,
    not plain truthiness.
    """
    # Equivalence groups take precedence over a direct lookup.
    # (Original iterated with enumerate but never used the index.)
    for group in dict_equal:
        if word in group:
            return list_target_words.index(group[0])
    if word in list_target_words:
        return list_target_words.index(word)
    return None
def normalize_dict_similar(list_target_words, dict_equal, dict_similar):
    """Rewrite the "similar" dictionary in terms of canonical word indices.

    With case_2_probabilities the rows are (source, ref, target, prob) and
    ref/target become single indices; otherwise each row is a group of
    words that becomes a group of indices (kept only if >1 survive).
    """
    new_dict_similar = []
    if case_2_probabilities:
        for s, r, t, p in dict_similar:
            r = normalize_word(r, list_target_words, dict_equal)
            t = normalize_word(t, list_target_words, dict_equal)
            # BUG FIX: compare against None explicitly — index 0 is falsy
            # but valid, so the original `if r and t` silently dropped rows
            # whose ref or target normalized to the first target word.
            if r is not None and t is not None:
                new_dict_similar.append([s, r, t, float(p)])
    else:
        for d in dict_similar:
            # NOTE(review): words only reachable through dict_equal (not
            # directly present in list_target_words) are filtered out here,
            # unlike in the probability branch — confirm this is intended.
            d = [normalize_word(w, list_target_words, dict_equal) for w in d if w in list_target_words]
            if len(d) > 1:
                new_dict_similar.append(d)
    return new_dict_similar
def normalize_dict_equal(list_target_words, dict_equal):
    """Filter the equivalence dictionary against the target vocabulary.

    Words absent from list_target_words are dropped from every group, and
    only groups that still have at least two members are kept.
    """
    filtered_groups = []
    for group in dict_equal:
        kept = []
        for candidate in group:
            if candidate in list_target_words:
                kept.append(candidate)
        if len(kept) > 1:
            filtered_groups.append(kept)
    return filtered_groups
def similar(word_source, axis_ref, axis_target, dict_similar):
    # Decide whether the reference and target translations of word_source
    # count as "similar" according to dict_similar.
    # Returns (probability, matched_ref_axis, matched_target_axis) on a
    # match, or (0., None, None) otherwise.
    if len(dict_similar):
        if case_2_probabilities:
            # Rows are (source word, ref index, target index, probability);
            # after normalize_dict_similar, r and t are single indices.
            for s, r, t, p in dict_similar:
                r, t = set([r]), set([t])
                if word_source == s:
                    if full_match:
                        # Exact set equality required on both sides.
                        if r == axis_ref and t == axis_target:
                            return p, r, t
                    else:
                        # Any overlap with the row's ref/target suffices.
                        if (r & axis_ref) and (t & axis_target):
                            return p, r, t
        else:
            # Rows are plain groups of indices; similarity holds when both
            # axes fall inside (full_match) or intersect (otherwise) a group.
            for d in dict_similar:
                d = set(d)
                if full_match:
                    if axis_ref <= d and axis_target <= d :
                        return 1., axis_ref , axis_target
                else:
                    # Return only the overlapping parts as the matched axes.
                    if axis_ref & d and axis_target & d:
                        return 1., axis_ref & d, axis_target & d
    return 0., None, None
def equal(axis_ref, axis_target, count_other, index_other):
    """Return the set of target-word indices on which the reference and
    target translations agree (a falsy value when they do not).

    count_other -- when False, agreement on the OTHER category is discounted
    index_other -- singleton set holding the index of OTHER, or None
    """
    if full_match:
        if axis_ref == axis_target:
            # Accept the exact match unless OTHER matches are discounted
            # and OTHER appears in the matched set.
            # BUG FIX: the original tested `not index_other in axis_ref`,
            # which raises TypeError (a set is unhashable and therefore
            # cannot be a member of another set); a disjointness test is
            # the intended semantics.
            if count_other or not index_other or index_other.isdisjoint(axis_ref):
                return axis_ref
            else:
                return set([])
    else:
        if not count_other and index_other:
            # `-` binds tighter than `&`, so this removes OTHER from the
            # target side before intersecting — equivalent to removing it
            # from the intersection itself.
            return axis_ref & axis_target - index_other
        else:
            return axis_ref & axis_target
def get_case(l_source, l_ref, l_target, list_target_words, dict_equal, dict_similar, count_other, index_other):
    # Classify one pronoun occurrence into an evaluation case:
    #   1 = reference and target agree        4 = no aligned target words
    #   2 = similar per dict_similar          5 = no reference, target exists
    #   3 = different                         6 = aligned words on neither side
    # Returns (case, probability, (ref_axis, target_axis)) where the axes
    # give the confusion-matrix coordinates to increment.
    word_source = l_source[2]
    prob = 1.
    # -1 indexes the last matrix row/column (presumably the NONE / catch-all
    # category — TODO confirm) when one side has no aligned words.
    axis_ref = [-1]
    axis_target = [-1]
    if l_ref:
        # NOTE(review): normalize_word may return None for unknown words,
        # which would break matrix indexing downstream; callers appear to
        # guarantee the words are in-vocabulary or OTHER — confirm.
        axis_ref = set([normalize_word(w, list_target_words, dict_equal) for w in l_ref[2]])
    if l_target:
        axis_target = set([normalize_word(w, list_target_words, dict_equal) for w in l_target[2]])
    if not l_ref:
        case = 5 if l_target else 6
    elif not l_target:
        case = 4
    else:
        # Exact agreement first, then dictionary-based similarity.
        axis_equal = equal(axis_ref, axis_target, count_other, index_other)
        if axis_equal:
            case, axis_ref, axis_target = 1, axis_equal, axis_equal
        else:
            prob_2, axis_sim_1, axis_sim_2 = similar(word_source, axis_ref, axis_target, dict_similar)
            if prob_2:
                case, prob, axis_ref, axis_target = 2, prob_2, axis_sim_1, axis_sim_2
            else: case = 3
    return case, prob, (axis_ref, axis_target)
def score_words(l_source_words, l_ref_words, l_target_words, l_cases, l_weights, list_target_words, dict_equal, dict_similar, count_multiword, count_other):
    # Compute the APT score: classify every pronoun occurrence, accumulate
    # per-case (possibly fractional, via similarity probabilities) counts,
    # and build a reference-vs-target confusion matrix over the canonical
    # target vocabulary.  Note l_target_words here is the list of aligned
    # words per occurrence, NOT the vocabulary (that is list_target_words).
    # Returns (score, case counts, confusion matrix, case per occurrence).
    cases = numpy.zeros(len(l_cases))
    weights = numpy.array(l_weights)
    matrix = numpy.zeros((len(list_target_words), len(list_target_words)), dtype=int)
    list_cases = [0]*len(l_source_words)
    dict_equal = normalize_dict_equal(list_target_words, dict_equal)
    dict_similar = normalize_dict_similar(list_target_words, dict_equal, dict_similar)
    index_other = set([list_target_words.index(other_category)]) if other_category in list_target_words else None
    for c, (l_source, l_ref, l_target) in enumerate(zip(l_source_words, l_ref_words, l_target_words)):
        case, prob, pos_matrix = get_case(l_source, l_ref, l_target, list_target_words, dict_equal, dict_similar, count_other, index_other)
        if case in l_cases:
            cases[l_cases.index(case)] += prob
            # for/else idiom: when count_multiword is False the inner break
            # skips the `else: continue`, so the outer `break` also fires —
            # only the first (i, j) pair is counted for this occurrence.
            for i in pos_matrix[0]:
                for j in pos_matrix[1]:
                    matrix[i,j] += 1
                    if not count_multiword: break
                else: continue
                break
        list_cases[c] = case
    # NOTE(review): raises ZeroDivisionError when no occurrence falls into
    # l_cases — confirm inputs always produce at least one counted case.
    score = sum(cases * weights)/sum(cases)
    return score, cases, matrix, list_cases
def get_words_from_position(sentences, l_sentences, l_positions, list_words, sep = None):
    """Collect, for each occurrence, the words found at its aligned positions.

    sentences   -- path of the (reference or target) sentence file
    l_sentences -- numpy array with the sentence number of each occurrence
    l_positions -- aligned word positions for each occurrence
    list_words  -- target pronoun list; when non-empty, each aligned word is
                   reduced to its first sub-word present in the list, or the
                   occurrence is labelled OTHER when none matches
    sep         -- sub-word separator for the target language

    Returns (per-occurrence [positions, words, normalized-words] triples,
    set of all words seen at aligned positions).
    """
    with codecs.open(sentences, encoding=encoding) as f:
        # Independent list per occurrence.  The original used
        # [[]]*len(l_sentences), which aliases ONE shared list; that was
        # harmless only because matched entries are reassigned wholesale,
        # but it is a latent mutation bug — fixed defensively.
        l_words = [[] for _ in l_sentences]
        l_all_words = set()
        for s, line in enumerate(f):
            if s in l_sentences:
                line = line.strip().lower().split(sep_esp)
                # Several occurrences may live in the same sentence.
                for i in numpy.where(l_sentences == s)[0]:
                    pos = l_positions[i]
                    if not pos: continue
                    word = [line[p] for p in pos]
                    l_words[i] = [pos, word, []]
                    if list_words:
                        # Keep the first sub-word of each aligned word that
                        # is a known target pronoun.
                        for w in word:
                            for sub_word in w.split(sep):
                                if sub_word in list_words:
                                    l_words[i][2].append(sub_word)
                                    break
                        if not l_words[i][2]: l_words[i][2] = [other_category]
                    else:
                        l_words[i][2] = word
                    l_all_words.update(word)
        return l_words, l_all_words
def get_aligned_positions(l_sentences, l_words, alignment):
    """Extract, per occurrence, the target positions its source word aligns to.

    l_sentences -- numpy array with the sentence number of each occurrence
    l_words     -- per-occurrence [position, word, sub_word] triples
    alignment   -- path of a GIZA-style alignment file of "src-tgt" pairs

    Returns a list of sorted target-position lists, one per occurrence
    (empty when the word is unaligned).
    """
    with open(alignment) as f:
        # Independent list per occurrence.  The original's [[]]*n aliases a
        # single shared list — harmless here only because matched entries
        # are reassigned wholesale; fixed defensively.
        l_positions = [[] for _ in l_sentences]
        for s, line in enumerate(f):
            if s in l_sentences:
                line = line.strip().split(sep_esp)
                for i in numpy.where(l_sentences == s)[0]:
                    # Keep the target side of every pair whose source side
                    # equals this occurrence's source position.
                    pos = [int(e.split(sep_lin)[1]) for e in line if e.startswith(str(l_words[i][0]) + sep_lin)]
                    l_positions[i] = sorted(pos)
        return l_positions
def get_words_from_list(sentences, list_words, sep_source = None, input_type = "word"):
    """Locate the source pronoun occurrences to evaluate.

    With input_type "word", every sentence of the file *sentences* is
    scanned for words whose sub-words (split on sep_source) appear in
    list_words; a word matching several sub-words yields several entries.
    With input_type "possition", *list_words* is instead a file of explicit
    "sentence position" pairs.

    Returns (numpy array of sentence numbers, list of
    [position, word, sub_word] triples).
    """
    with codecs.open(sentences, encoding=encoding) as f:
        all_lines = [line.strip() for line in f]
    found_sentences = []
    found_words = []
    if input_type == "word":
        for idx, raw in enumerate(all_lines):
            tokens = raw.strip().lower().split(sep_esp)
            for pos, token in enumerate(tokens):
                for piece in token.split(sep_source):
                    if piece in list_words:
                        found_sentences.append(idx)
                        found_words.append([pos, token, piece])
    elif input_type == "possition":  # spelling kept: it is the config value
        with codecs.open(list_words, encoding=encoding) as f_w:
            for line in f_w:
                n_sentence, possition = line.split()
                n_sentence = int(n_sentence.strip())
                possition = int(possition.strip())
                tokens = all_lines[n_sentence].lower().split(sep_esp)
                found_sentences.append(n_sentence)
                found_words.append([possition, tokens[possition], tokens[possition]])
    return numpy.array(found_sentences), found_words
def get_list_from_file(file_name, sep = None):
    """Read a lowercased word list from *file_name*, skipping blank lines.

    When *sep* is given, spaces are removed from each line and the line is
    split on *sep*, yielding a list of word groups instead of single words.
    """
    with codecs.open(file_name, encoding=encoding) as f:
        entries = [ln.strip().lower() for ln in f if ln.strip() != ""]
    if sep:
        return [entry.replace(" ", "").split(sep) for entry in entries]
    return entries
def update_list_target(list_target_words, vocabulary_ref, vocabulary_target):
    """Finish building the target vocabulary.

    A user-supplied pronoun list gets the OTHER category appended (in
    place); otherwise the vocabulary is everything seen in the reference
    and target translations plus the NONE category.
    """
    if not list_target_words:
        list_target_words = list(vocabulary_ref.union(vocabulary_target))
        list_target_words.append(none_category)
    else:
        list_target_words.append(other_category)
    return list_target_words
def print_output_detail(l_sentences, l_source_words, l_ref_words, l_target_words, list_cases, out):
    # Write one tab-separated line per evaluated pronoun occurrence to
    # <out>.detail: sentence number, source position/word, aligned reference
    # and target positions/words ("-" when unaligned), and the case number.
    with codecs.open(out + ".detail", "w", encoding=encoding) as f:
        f.write(sep_tab.join(["SENT.", "POS. SOURCE", "SOURCE", "POS. REF.", "REF.", "POS. TARGET", "TARGET", "CASE"]) + sep_nl)
        for sentence, source, ref, target, case in zip(l_sentences, l_source_words, l_ref_words, l_target_words, list_cases):
            # NOTE(review): in the position columns the `if/else` falls
            # INSIDE the join() call, so the unaligned placeholder becomes
            # sep_esp.join(sep_lin) — which equals sep_lin only because the
            # placeholder is one character long.  Works, but brittle.
            f.write(sep_tab.join([ \
                str(sentence), str(source[0]), source[2], \
                sep_esp.join([str(r) for r in ref[0]] if ref else sep_lin), sep_esp.join(ref[2]) if ref else sep_lin, \
                sep_esp.join([str(t) for t in target[0]] if target else sep_lin), sep_esp.join(target[2]) if target else sep_lin, \
                str(case) \
                ]) + sep_nl)
def add(matrix, maxtix_row, matrix_col, matrix_dia, list_target_words, word, n = None):
    """Append a summary row and column (labelled *word*) to *matrix*.

    The new bottom row holds the column sums of maxtix_row, the new right
    column holds the row sums of matrix_col, and the corner cell holds the
    grand total of matrix_dia.  When *n* is given, the sums and labels are
    truncated to their first n entries.  (Parameter names — typo included —
    are kept unchanged for caller compatibility.)
    """
    extra_row = maxtix_row.sum(axis=0)
    extra_col = matrix_col.sum(axis=1)
    if n:
        extra_row = extra_row[:n]
        extra_col = extra_col[:n]
        list_target_words = list_target_words[:n]
    # Corner value: total of the leftover diagonal block.
    extra_col = numpy.hstack((extra_col, matrix_dia.sum()))
    with_row = numpy.vstack((matrix, extra_row))
    result = numpy.hstack((with_row, extra_col.reshape(-1, 1)))
    labels = numpy.append(list_target_words, word)
    return result, labels
def print_output(score, cases, matrix, l_cases, weihgts, list_target_words, out_file, max_len_matrix):
    # Write the overall score, the evaluated cases and weights, the per-case
    # counts and the reference-vs-target confusion matrix to <out_file>.score.
    # (The misspelled parameter name `weihgts` is kept for compatibility.)
    out = "Score: %0.4f\n" % score
    out += "Cases: " + ",".join([str(c) for c in l_cases]) + sep_nl
    out += "Weights: " + ",".join([str(w) for w in weihgts]) + sep_nl
    out += "Findings per case: " + ",".join(["%0.0f" % c for c in cases]) + sep_nl
    out += "Total findings: %0.0f\n" % sum(cases)
    # Reorder rows and columns by decreasing diagonal value so the most
    # frequent matches come first.
    order = numpy.argsort(matrix.diagonal()).tolist()
    order.reverse()
    matrix = (matrix[order])[:,order]
    list_target_words = numpy.array(list_target_words)[order]
    if len(list_target_words) > max_len_matrix:
        # Collapse everything beyond max_len_matrix into one "..." row/col.
        matrix, list_target_words = add(matrix[:max_len_matrix, :max_len_matrix], \
            matrix[max_len_matrix:], matrix[:,max_len_matrix:], \
            matrix[max_len_matrix:,max_len_matrix:], list_target_words, \
            "...", max_len_matrix)
    # Append a totals row and column labelled "-sum-".
    matrix, list_target_words = add(matrix, matrix, matrix, matrix, list_target_words, "-sum-")
    out += sep_tab + sep_tab.join(list_target_words) + sep_nl
    for label, row in zip(list_target_words, matrix):
        out += label + sep_tab + sep_tab.join([str(r) for r in row]) + sep_nl
    with codecs.open(out_file + ".score", "w", encoding=encoding) as f:
        f.write(out)
def aligment(sr_align, st_aling, s, r, t):
    """Return the source-reference and source-target alignment file paths,
    generating them with GIZA when the configured paths are blank.

    NOTE(review): this helper is never called in this file and `run_giza`
    is not defined here — presumably provided elsewhere; confirm before use.

    BUG FIX: the original body tested and assigned the undefined name
    `st_align` (typo for the parameter `st_aling`), raising NameError on the
    blank-path branch, and then returned the ORIGINAL st_aling regardless.
    The (misspelled) public parameter names are kept for compatibility.
    """
    if sr_align.strip() == "":
        sr_align = run_giza(s, r)
    if st_aling.strip() == "":
        st_aling = run_giza(s, t)
    return sr_align, st_aling
def main(argv):
try:
print "*** Reading input\n"
config_file = argv[0]
open(config_file, 'r')
config = ConfigParser.RawConfigParser()
config.read(config_file)
lang_source = config.get("lang", "source")
lang_target = config.get("lang", "target")
sep_source = config.get("lang", "source_word_separator")
if not sep_source or sep_source.strip() == "":
sep_source = None
sep_target = config.get("lang", "target_word_separator")
if not sep_target or sep_target.strip() == "":
sep_target = None
source = config.get("files", "source")
if not os.path.isfile(source):
raise NameError("Source file does not exist")
reference = config.get("files", "reference")
if not os.path.isfile(reference):
raise NameError("Reference file does not exist")
target = config.get("files", "target")
if not os.path.isfile(target):
raise NameError("Target file does not exist")
align_sr = config.get("files", "alignment_source_reference")
if not os.path.isfile(align_sr):
raise NameError("Source-reference alignment file does not exist")
align_st = config.get("files", "alignment_source_target")
if not os.path.isfile(align_st):
raise NameError("Target-reference alignment file does not exist")
input_type = config.get("files", "input_type")
list_source_words = config.get("files", "list_source_pronouns")
if not os.path.isfile(list_source_words):
print "Not list of source pronouns"
list_source_words = []
elif input_type == "word":
list_source_words = get_list_from_file(list_source_words)
list_target_words = config.get("files", "list_target_pronouns")
if not os.path.isfile(list_target_words):
print "Not list of target pronouns"
list_target_words = []
else:
list_target_words = get_list_from_file(list_target_words)
dict_equal= config.get("dictionary", "equal")
if not os.path.isfile(dict_equal):
print "Not dictionary of equal pronouns"
dict_equal = []
else:
dict_equal = get_list_from_file(dict_equal, sep = ",")
dict_similar= config.get("dictionary", "similar")
if not os.path.isfile(dict_similar):
print "Not dictionary of similar pronouns"
dict_similar = []
else:
dict_similar = get_list_from_file(dict_similar, sep = ",")
dict_source_words = config.get("dictionary", "source_pronouns")
if not os.path.isfile(dict_source_words):
print "Not dictionary of source pronouns"
dict_source_words = []
else:
dict_source_words = get_list_from_file(dict_source_words)
dict_target_words = config.get("dictionary", "target_pronouns")
if not os.path.isfile(dict_target_words):
print "Not dictionary of target pronouns"
dict_target_words = []
else:
dict_target_words = get_list_from_file(dict_target_words)
l_cases = config.get("cases", "cases_to_use")
if l_cases:
l_cases = [int(c.strip()) for c in l_cases.split(",") if int(c.strip()) in range(1,7)]
else:
raise NameError("List of cases to use is empty")
weights = config.get("cases", "weigths_per_case")
if weights:
weights = [float(c.strip()) for c in weights.split(",")]
else:
raise NameError("List of weights is empty")
count_other = config.getboolean("cases", "count_OTHER_as_equal")
out = config.get("output", "output_file")
count_multiword = config.getboolean("output", "counting_multiword_in_matrix")
max_len_matrix = config.getint("output", "max_length_matrix")
print "*** End Reading input\n"
except IndexError:
print 'Missing argument. Use:\n\t python APT.py <configuration file>'
sys.exit(2)
except IOError:
print 'File does not exits. Use:\n\t python APT.py <configuration file>'
sys.exit(2)
except NameError as e:
print "Input error: " + e.message
sys.exit(2)
except:
print("Unexpected error:", sys.exc_info()[0])
raise
l_sentences, l_source_words = get_words_from_list(source, list_source_words, sep_source, input_type)
l_ref_pos = get_aligned_positions(l_sentences, l_source_words, align_sr)
l_ref_words, l_ref_vocabulary = get_words_from_position(reference, l_sentences, l_ref_pos, list_target_words, sep_target )
l_target_pos = get_aligned_positions(l_sentences, l_source_words, align_st)
l_target_words, l_target_vocabulary = get_words_from_position(target, l_sentences, l_target_pos, list_target_words, sep_target )
print "*** Improving alignment\n"
print "For reference:"
improve_alignment.improve_alignment(l_sentences, l_source_words, l_ref_words, source, reference, align_sr, dict_source_words, dict_target_words)
print "For target:"
improve_alignment.improve_alignment(l_sentences, l_source_words, l_target_words, source, target, align_st, dict_source_words, dict_target_words)
print "*** End improving alignment\n"
print "*** Calculating score\n"
list_target_words = update_list_target(list_target_words, l_ref_vocabulary, l_target_vocabulary)
score, cases, matrix, list_cases = score_words(l_source_words, l_ref_words, l_target_words, l_cases, weights, list_target_words, dict_equal, dict_similar, count_multiword, count_other)
print_output_detail(l_sentences, l_source_words, l_ref_words, l_target_words, list_cases, out)
print_output(score, cases, matrix, l_cases, weights, list_target_words, out, max_len_matrix)
print "Score: %0.4f" % score
print "More details in output files"
print "*** End calculating score\n"
# Run as a script: the single command-line argument is the path of the
# configuration file.
if __name__ == "__main__":
    main(sys.argv[1:])