compute_sentence_scores.py
# Copyright 2020 Ke Li

""" This script computes sentence scores in a batch computation mode with a
    PyTorch-trained neural LM.
    It is called by steps/pytorchnn/lmrescore_{nbest, lattice}_pytorchnn.sh
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import argparse
from collections import defaultdict

import torch
import torch.nn as nn

def load_sents(path):
    r"""Read word sentences that represent hypotheses of utterances.

    Assume the input file format is "utterance-id word-sequence" in each line:
        en_4156-A_030185-030248-1 oh yeah
        en_4156-A_030470-030672-1 well i'm going to have mine and two more classes
        en_4156-A_030470-030672-2 well i'm gonna have mine and two more classes
        ...

    Args:
        path (str): A file of word sentences in the above format.

    Returns:
        The sentences represented by a map from a string (utterance-id) to
        a list of strings (hypotheses).
    """

    sents = defaultdict()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            try:
                key, hyp = line.split(' ', 1)
            except ValueError:
                key = line
                hyp = ' '
            key = key.rsplit('-', 1)[0]
            if key not in sents:
                sents[key] = [hyp]
            else:
                sents[key].append(hyp)
    return sents
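
# A minimal usage sketch of load_sents(), reusing the utterance-ids from the
# docstring example ('hyps.txt' is a hypothetical path):
#
#     sents = load_sents('hyps.txt')
#     # sents['en_4156-A_030470-030672'] ==
#     #     ["well i'm going to have mine and two more classes",
#     #      "well i'm gonna have mine and two more classes"]
#
# The trailing hypothesis index ("-1", "-2") is stripped from each key by
# key.rsplit('-', 1)[0], so all hypotheses of one utterance are grouped under
# a single utterance-id.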

def read_vocab(path):
    r"""Read vocabulary.

    Args:
        path (str): A file with a word and its integer index per line.

    Returns:
        A vocabulary represented by a map from string to int (starting from 0).
    """

    word2idx = {}
    idx2word = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.split()
            assert len(word) == 2
            word = word[0]
            if word not in word2idx:
                idx2word.append(word)
                word2idx[word] = len(idx2word) - 1
    return word2idx
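
# A small illustrative example (the word list is hypothetical): given a
# vocabulary file containing
#     <s> 0
#     hello 1
#     there 2
# read_vocab() returns {'<s>': 0, 'hello': 1, 'there': 2}. Note that the
# integer indices in the file are not used directly; words are renumbered
# consecutively from 0 in order of first appearance.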

def get_input_and_target(args, hyps, vocab):
    r"""Convert hypotheses to lists of integers, with input and target separately.

    Args:
        args: Command line arguments; args.sent_boundary and args.oov are used.
        hyps: A list of hypotheses (strings), each with words separated by
            spaces, e.g. ['hello there'].
        vocab: A map from string to int, e.g. {'<s>': 0, 'hello': 1,
            'there': 2, 'apple': 3, ...}

    Returns:
        A tuple (data, target, seq_lens). For the hypothesis 'hello there'
        with the vocabulary above, the integerized input sequence is [0, 1, 2]
        and the integerized output (target) sequence is [1, 2, 0], since the
        input sequence has '<s>' at the beginning and the output sequence has
        '<s>' at the end. Words that are not in the vocabulary are mapped to a
        special oov symbol, which is expected to be in the vocabulary. The
        sequences are zero-padded to the longest hypothesis in the batch;
        data has dimension [seq_len, batch_size], target is flattened, and
        seq_lens holds the unpadded length of each sequence.
    """

    batch_size = len(hyps)
    assert batch_size > 0
    # Preprocess input and target sequences.
    inputs, outputs = [], []
    for hyp in hyps:
        input_string = args.sent_boundary + ' ' + hyp
        output_string = hyp + ' ' + args.sent_boundary
        input_ids, output_ids = [], []
        for word in input_string.split():
            try:
                input_ids.append(vocab[word])
            except KeyError:
                input_ids.append(vocab[args.oov])
        for word in output_string.split():
            try:
                output_ids.append(vocab[word])
            except KeyError:
                output_ids.append(vocab[args.oov])
        inputs.append(input_ids)
        outputs.append(output_ids)
    batch_lens = [len(seq) for seq in inputs]
    seq_lens = torch.LongTensor(batch_lens)
    max_len = max(batch_lens)
    # Zero padding for input and target sequences.
    data = torch.LongTensor(batch_size, max_len).zero_()
    target = torch.LongTensor(batch_size, max_len).zero_()
    for idx, seq_len in enumerate(batch_lens):
        data[idx, :seq_len] = torch.LongTensor(inputs[idx])
        target[idx, :seq_len] = torch.LongTensor(outputs[idx])
    data = data.t().contiguous()
    target = target.t().contiguous().view(-1)
    return data, target, seq_lens
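
# A worked sketch of the padding, under the toy vocabulary from the docstring
# ({'<s>': 0, 'hello': 1, 'there': 2}): for hyps = ['hello there', 'hello'],
# the integerized inputs are [[0, 1, 2], [0, 1]] and the returned values are
#
#     data      == [[0, 0],          # shape [max_len, batch_size] = [3, 2]
#                   [1, 1],
#                   [2, 0]]          # last 0 in column 1 is padding
#     target    == [1, 1, 2, 0, 0, 0]    # flattened after transpose
#     seq_lens  == [3, 2]
#
# Padded positions are filled with zeros; their losses are computed but later
# discarded by slicing with seq_lens in compute_scores().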

def compute_sentence_score(model, criterion, ntokens, data, target,
                           model_type='LSTM', hidden=None):
    r"""Compute neural language model scores of hypotheses of an utterance.

    Args:
        model: A neural language model.
        criterion: Training criterion of a neural language model, e.g.
            cross entropy. It must be constructed with reduction='none' so
            that per-word losses are kept.
        ntokens: Vocabulary size.
        data: Integerized input sentences (hypotheses).
        target: Integerized target sentences for data.
        model_type: Model type, e.g. LSTM or Transformer or others.
        hidden: Initial hidden state for a recurrent model (optional).

    Returns:
        The scores (negative log-likelihood) of words in input hypotheses.
        If the model is recurrent, the function has an extra output: the last
        hidden state from the best hypothesis of the utterance.
    """

    with torch.no_grad():
        if model_type == 'Transformer':
            output = model(data)
        else:
            output, _ = model(data, hidden)
            # Run a forward pass of the model on the best path of the current
            # utterance to get the last hidden state, which initializes the
            # hidden state for the next utterance.
            h = hidden[0][:, 0, :].unsqueeze(1)
            c = hidden[1][:, 0, :].unsqueeze(1)
            _, hidden = model(data[:, 0].unsqueeze(1), (h, c))
        loss = criterion(output.view(-1, ntokens), target)
        loss = torch.reshape(loss, data.size())
        loss = loss.t()  # [batch_size, length]
        sent_scores = loss.numpy()
    if model_type == 'Transformer':
        return sent_scores
    return sent_scores, hidden
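
# Shape sketch (illustrative numbers): for a batch of 2 hypotheses padded to
# length 3, criterion(...) with reduction='none' returns a flat vector of 6
# per-word negative log-likelihoods; after the reshape and transpose above,
# sent_scores is a [2, 3] array whose row i holds the word scores of
# hypothesis i. Summing row i over its true length gives the total NLL of
# that hypothesis under the neural LM.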

def compute_scores(args, sents, model, criterion, ntokens, vocab,
                   model_type='LSTM'):
    r"""Compute neural language model scores of hypotheses for all utterances.

    Args:
        args: Command line arguments.
        sents: Hypotheses for all utterances represented by a map from
            a string (utterance-id) to a list of strings.
        model: A neural language model.
        criterion: Training criterion of the neural language model, e.g.
            cross entropy.
        ntokens: Vocabulary size.
        vocab: A map from string to int.
        model_type: Model type, e.g. LSTM or Transformer or others.

    Returns:
        The hypotheses and corresponding neural language model scores for all
        utterances.
    """

    # Turn on evaluation mode, which disables dropout.
    model.eval()
    sents_and_scores = defaultdict()
    for idx, key in enumerate(sents.keys()):
        batch_size = len(sents[key])
        # Dimension of input data is [seq_len, batch_size].
        data, targets, seq_lens = get_input_and_target(args, sents[key], vocab)
        if model_type != 'Transformer':
            if idx == 0:
                hidden = model.init_hidden(batch_size)
            else:
                # For RNN-based LMs, initialize the current hidden states
                # with those from the best hypothesis of the previous
                # utterance. This hidden-state carry-over trick achieves
                # modest WER reductions compared with zero initialization.
                # Both h and c have dimension [nlayers, 1, hidden_dim].
                h = hidden[0][:, 0, :].unsqueeze(1)
                c = hidden[1][:, 0, :].unsqueeze(1)
                hidden = (h.repeat(1, batch_size, 1),
                          c.repeat(1, batch_size, 1))
        if model_type == 'Transformer':
            scores = compute_sentence_score(model, criterion, ntokens, data,
                                            targets, model_type)
        else:
            scores, hidden = compute_sentence_score(model, criterion, ntokens,
                                                    data, targets, model_type,
                                                    hidden)
        # Use a separate loop variable so the utterance index above is not
        # shadowed.
        for i, hyp in enumerate(sents[key]):
            if key in sents_and_scores:
                sents_and_scores[key].append((hyp, scores[i][:seq_lens[i]]))
            else:
                sents_and_scores[key] = [(hyp, scores[i][:seq_lens[i]])]
    return sents_and_scores
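
# Note on the hidden-state carry-over above: taking column 0 of h and c
# assumes the first hypothesis listed for each utterance is the best path,
# which is presumably how the n-best lists produced by the calling rescoring
# script are ordered.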

def write_scores(sents_and_scores, path):
    r"""Write out neural language model scores for all hypotheses in the
    following format:
        en_4156-A_030185-030248-1 2.7702 1.9545 0.9442
        en_4156-A_030470-030672-1 3.6918 3.7159 4.1794 0.1375 2.3944 9.3834 4.5469 7.0772 3.6172 7.2183 2.1540
        en_4156-A_030470-030672-2 3.6918 3.7159 4.5248 2.3689 8.9368 4.2876 7.0702 3.0812 7.5044 2.2388
        ...

    Args:
        sents_and_scores: The hypotheses and scores represented by a map from
            a string (utterance-id) to a list of (hypothesis, scores) pairs.
        path (str): An output file of scores in the above format.
    """

    with open(path, 'w', encoding='utf-8') as f:
        for key in sents_and_scores.keys():
            for idx, (_, score_list) in enumerate(sents_and_scores[key], 1):
                current_key = '-'.join([key, str(idx)])
                f.write('{} '.format(current_key))
                for score in score_list:
                    f.write('{0:.4f} '.format(score))
                f.write('\n')
    print("Write neural LM scores to %s" % path)

def main():
    parser = argparse.ArgumentParser(description="Compute word scores of "
                                     "hypotheses for each utterance in parallel "
                                     "with a PyTorch-trained neural language model.")
    parser.add_argument('--infile', type=str, required=True,
                        help="Word hypotheses generated from a lattice.")
    parser.add_argument('--outfile', type=str, required=True,
                        help="Output file with neural language model scores "
                        "for input word hypotheses.")
    parser.add_argument('--vocabulary', type=str, required=True,
                        help="Vocabulary used for neural language model training.")
    parser.add_argument('--model-path', type=str, required=True,
                        help="Path to a pretrained neural language model.")
    parser.add_argument('--model', type=str, default='LSTM',
                        help="Network type. Can be RNN, LSTM or Transformer.")
    parser.add_argument('--emsize', type=int, default=200,
                        help="Size of word embeddings.")
    parser.add_argument('--nhid', type=int, default=200,
                        help="Number of hidden units per layer.")
    parser.add_argument('--nlayers', type=int, default=2,
                        help="Number of layers.")
    parser.add_argument('--nhead', type=int, default=2,
                        help="Number of heads in a Transformer model.")
    parser.add_argument('--oov', type=str, default='<unk>',
                        help="Out-of-vocabulary word.")
    parser.add_argument('--sent-boundary', type=str, default='<s>',
                        help="Sentence boundary symbol.")
    args = parser.parse_args()

    assert os.path.exists(args.infile), "Path for input word sequences does not exist."
    assert os.path.exists(args.vocabulary), "Vocabulary path does not exist."
    assert os.path.exists(args.model_path), "Model path does not exist."

    print("Load vocabulary.")
    vocab = read_vocab(args.vocabulary)
    ntokens = len(vocab)

    print("Load model and criterion.")
    # Import the local model definitions (RNNModel, TransformerModel).
    import model
    if args.model == 'Transformer':
        model = model.TransformerModel(ntokens, args.emsize, args.nhead,
                                       args.nhid, args.nlayers,
                                       activation="gelu", tie_weights=True)
    else:
        model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                               args.nlayers, tie_weights=True)
    with open(args.model_path, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location=lambda storage, loc: storage))
        if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
            model.rnn.flatten_parameters()
    criterion = nn.CrossEntropyLoss(reduction='none')

    print("Load input word hypotheses.")
    sents = load_sents(args.infile)

    print("Compute word scores with a %s model." % args.model)
    sents_and_scores = compute_scores(args, sents, model, criterion, ntokens,
                                      vocab, model_type=args.model)

    print("Write out word scores.")
    write_scores(sents_and_scores, args.outfile)


if __name__ == '__main__':
    main()
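
# Example invocation (all paths and hyperparameter values below are
# illustrative; in practice this script is driven by
# steps/pytorchnn/lmrescore_{nbest,lattice}_pytorchnn.sh):
#
#   python compute_sentence_scores.py \
#       --infile data/nbest/text \
#       --outfile data/nbest/lmwt.nn \
#       --vocabulary data/pytorchnn/words.txt \
#       --model-path exp/pytorchnn/model.pt \
#       --model LSTM --emsize 650 --nhid 650 --nlayers 2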