-
Notifications
You must be signed in to change notification settings - Fork 5.3k
/
prepare_stm.pl
executable file
·311 lines (289 loc) · 12.9 KB
/
prepare_stm.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#!/usr/bin/env perl
use Getopt::Long;
use Encode;
################################################################################
#
# Script to prepare a NIST .stm file for scoring ASR output. Based on the files
# that are naturally created for Kaldi acoustic training:
#
# - data/segments: contains segmentID, recordingID, start-time & end-time
#
# - data/wav.scp: contains recordingID & waveform-name (or sph2pipe command)
#
# - data/utt2spk: contains segmentID % speakerID
#
# - data/text: contains segment ID and transcription
#
# The .stm file has lines of the form
#
# waveform-name channel speakerID start-time end-time [<attr>] transcription
#
# Clearly, most of the information needed for creating the STM file is present
# in the four Kaldi files mentioned above, except channel --- its value will be
# obtained from the sph2pipe command if present, or will default to "1" --- and
# <attributes> from a separate demographics.tsv file. (A feature to add later?)
#
# Note: Some text filtering is done by this script, such as removing non-speech
# tokens from the transcription, e.g. <cough>, <breath>, etc.
$fragMarkers = ""; # If given by the user, they are stripped from words
# But two types of tokens are retained as is, if present.
#
$Hesitation = "<hes>"; # which captures hesitations, filled pauses, etc.
$OOV_symbol = "<unk>"; # which our system outputs occasionally.
#
# Note: The .stm file must be sorted by filename and channel in ASCII order and
# by the start=time in numerical order. NIST recommends the unix command
# "sort +0 -1 +1 -2 +3nb -4"
#
# This script will also produce an auxilliary file named reco2file_and_channel
# which is used by Kaldi scripts to produce output in .ctm format for scoring.
# So any channel ID assigned here will be consistent between ref and output.
#
# If the training text is Viterbi-aligned to the speech to obtain time marks,
# it should be straightforward to modify this script to produce a .ctm file:
#
# waveform-file channel start-time duration word
#
# which lists the transcriptions with word-level time marks.
#
# Note: A .ctm file must be sorted via "sort +0 -1 +1 -2 +2nb -3"
#
################################################################################
GetOptions("fragmentMarkers=s" => \$fragMarkers, "hesitationToken=s" => \$Hesitation,"oovToken=s" => \$OOV_symbol);
if ($#ARGV == 0) {
$inDir = $ARGV[0];
print STDERR ("$0: Making stm file from information in $inDir\n");
print STDERR ("\tRemoving [$fragMarkers]+ from ends of tokens\n") if ($fragMarkers);
print STDERR ("\tPreserving hesitation tokens $Hesitation\n") unless ($Hesitation eq "<hes>");
print STDERR ("\tUsing $OOV_symbol as the OOV symbol\n") unless ($OOV_symbol eq "<unk>");
} else {
print STDERR ("Usage: $0 [--options] DataDir\n");
print STDERR ("\t--fragmentMarkers <chars> Strip these from ends of each token (default: none)\n");
print STDERR ("\t--hesitationToken <foo> Preserve <foo> when deleting non-speech tokens (default: <hes>)\n");
print STDERR ("\t--oovToken <bar> Use <bar> to replace hard-coded OOVs (default: <unk>)\n");
exit(1);
}
$segmentsFile = "$inDir/segments";
$scpFile = "$inDir/wav.scp";
$utt2spkFile = "$inDir/utt2spk";
$textFile = "$inDir/text";
$stmFile = "$inDir/stm";
$charStmFile = "$inDir/char.stm";
$reco2ctmFile = "$inDir/reco2file_and_channel";
################################################################################
# Read the segmentIDs, file-IDs, start- and end-times from the segments file
################################################################################
die "Current version of script requires a segments file" unless (-e $segmentsFile);
open(SEGMENTS, $segmentsFile)
|| die "Unable to read segments file $segmentsFile";
$numSegments = 0;
while ($line=<SEGMENTS>) {
@tokens = split(/\s+/, $line);
unless ($#tokens == 3) {
print STDERR ("$0: Couldn't parse line $. in $segmentsFile\n\t$line\n");
next;
}
$segmentID = shift @tokens;
if (exists $fileID{$segmentID}) {
print STDERR ("$0: Skipping duplicate segment ID $segmentID in $segmentsFile\n");
next;
}
$fileID{$segmentID} = shift @tokens;
$startTime{$segmentID} = shift @tokens;
$endTime{$segmentID} = shift @tokens;
++$numSegments;
}
close(SEGMENTS);
print STDERR ("$0: Read info about $numSegments segment IDs from $segmentsFile\n");
################################################################################
# Read the waveform filenames from the wav.scp file. (Parse sph2pipe command.)
################################################################################
open(SCP, $scpFile)
|| die "Unable to open scp file $scpFile\n";
$numRecordings = 0;
while ($line=<SCP>) {
chomp;
if ($line =~ m:^\s*(\S+)\s+(.+)$:) {
$recordingID = $1;
$waveformFile = $2;
} else {
print STDERR ("$0: Couldn't parse line $. in $scpFile\n\t$line\n");
next;
}
if (exists $waveform{$recordingID}) {
print STDERR ("$0: Skipping duplicate recording ID $recordingID in $scpFile\n");
# BUG ALERT: This check may need to be turned off for multi-channel recordings,
# since the same recording may appear with with different channels?
next;
}
if ($waveformFile =~ m:^\S+$:) {
# This is a single filename, no shp2pipe or gunzip for reading waveforms
$waveform{$recordingID} = $waveformFile;
} elsif (($waveformFile =~ m:(sph2pipe|gunzip|gzip|cat|zcat)\s+:) &&
($waveformFile =~ m:\s+(\S+)\s*\|$:)) {
# HACK ALERT: the filename is *assumed* to be at the END of the command
$waveform{$recordingID} = $1;
$channel{$recordingID} = $1 if ($waveformFile =~ m:sph2pipe\s+.*\-c\s+(\S+)\s+.+:);
} elsif (($waveformFile =~ m:(sox)\s+:) &&
($waveformFile =~ m:\s+(\S+)\s*\|$:)) {
# HACK ALERT: the first element that does ends with '.wav' is assumed to
# be the original filename
@elems=split(/\s+/, $waveformFile);
foreach $elem (@elems) {
if ($elem =~ m/.*\.wav/) {
$filename=$elem;
last;
}
}
die ("$0: Couldn't parse waveform filename on line $. in $scpFile\n\t$line\n") if not defined $filename;
die ("$0: Filename $filename does not exist: in $scpFile\n\t$line\n") unless (-e $filename);
$waveform{$recordingID} = $filename;
#$channel{$recordingID} = $filename;
} else {
print STDERR ("$0: Couldn't parse waveform filename on line $. in $scpFile\n\t$line\n");
next;
}
$waveform{$recordingID} =~ s:.+/::; # remove path prefix
$waveform{$recordingID} =~ s:\.(sph|wav)\s*$::; # remove file extension
$channel{$recordingID} = 1 # Default
unless (exists $channel{$recordingID});
++$numRecordings;
}
close(SCP);
print STDERR ("$0: Read filenames for $numRecordings recording IDs from $scpFile\n");
################################################################################
# Read speaker information from the utt2spk file
################################################################################
open(UTT2SPK, $utt2spkFile)
|| die "Unable to read utt2spk file $utt2spkFile";
$numSegments = 0;
while ($line=<UTT2SPK>) {
@tokens = split(/\s+/, $line);
if (! ($#tokens == 1)) {
print STDERR ("$0: Couldn't parse line $. in $utt2spkFile\n\t$line\n");
next;
}
$segmentID = shift @tokens;
if (exists $speakerID{$segmentID}) {
print STDERR ("$0: Skipping duplicate segment ID $segmentID in $utt2spkFile\n");
next;
}
$speakerID{$segmentID} = shift @tokens;
++$numSegments;
}
close(UTT2SPK);
print STDERR ("$0: Read speaker IDs for $numSegments segments from $utt2spkFile\n");
################################################################################
# Read the transcriptions from the text file
################################################################################
open(TEXT, $textFile)
|| die "Unable to read text file $textFile";
$numSegments = $numWords = 0;
while ($line=<TEXT>) {
chomp;
if ($line =~ m:^(\S+)\s+(.+)$:) {
$segmentID = $1;
$text = $2;
} else {
print STDERR ("$0: Couldn't parse line $. in $textFile\n\t$line\n");
next;
}
if (exists $transcription{$segmentID}) {
print STDERR ("$0: Skipping duplicate segment ID $segmentID in $segmentsFile\n");
next;
}
$transcription{$segmentID} = "";
@tokens = split(/\s+/, $text);
# This is where one could filter the transcription as necessary.
# E.g. remove noise tokens, mark non-scoring segments, etc.
# HACK ALERT: Current version does this is an ad hoc manner!
while ($w = shift(@tokens)) {
# Substitute OOV tokens specific to the Babel data
$w = $OOV_symbol if ($w eq "(())");
# Remove fragMarkers, if provided, from either end of the word
$w =~ s:(^[$fragMarkers]|[$fragMarkers]$)::g if ($fragMarkers);
# Omit non-speech symbols such as <cough>, <breath>, etc.
$w =~ s:^<[^>]+>$:: unless (($w eq $OOV_symbol) || ($w eq $Hesitation));
next if ($w eq "");
$transcription{$segmentID} .= " $w";
$numWords++;
}
$transcription{$segmentID} =~ s:^\s+::; # Remove leading white space
$transcription{$segmentID} =~ s:\s+$::; # Remove training white space
$transcription{$segmentID} =~ s:\s+: :g; # Normalize remaining white space
# Transcriptions containing no words, or only OOVs and hesitations are not scored
$transcription{$segmentID} = "IGNORE_TIME_SEGMENT_IN_SCORING"
if (($transcription{$segmentID} eq "") ||
($transcription{$segmentID} =~ m:^(($OOV_symbol|$Hesitation)\s*)+$:));
++$numSegments;
}
close(TEXT);
print STDERR ("$0: Read transcriptions for $numSegments segments ($numWords words) from $textFile\n");
################################################################################
# Write the transcriptions in stm format to a file named stm
################################################################################
print STDERR ("$0: Overwriting existing stm file $stmFile\n")
if (-s $stmFile);
open(STM, "| sort +0 -1 +1 -2 +3nb -4 > $stmFile")
|| die "Unable to write to stm file $stmFile";
$numSegments = 0;
foreach $segmentID (sort keys %fileID) {
if (exists $waveform{$fileID{$segmentID}}) {
printf STM ("%s %s %s %.2f %.2f",
$waveform{$fileID{$segmentID}},
$channel{$fileID{$segmentID}},
$speakerID{$segmentID},
$startTime{$segmentID},
$endTime{$segmentID});
printf STM (" <%s>", $attributes{$segmentID}) if (exists $attributes{$segmentID});
printf STM (" %s\n", $transcription{$segmentID});
++$numSegments;
} else {
print STDERR ("$0: No waveform found for segment $segmentID, file $fileID{$segmentID}\n");
}
}
close(STM);
print STDERR ("$0: Wrote reference transcriptions for $numSegments segments to $stmFile\n");
################################################################################
# Write a character-separated stm file as well, for CER computation
################################################################################
print STDERR ("$0: Overwriting existing stm file $charStmFile\n")
if (-s $charStmFile);
open(STM, "$stmFile")
|| die "Unable to read back stm file $stmFile";
binmode STM,":encoding(utf8)";
open(CHARSTM, "> $charStmFile")
|| die "Unable to write to char.stm file $charStmFile";
binmode CHARSTM,":encoding(utf8)";
while ($line=<STM>) {
@tokens = split(/\s+/, $line);
# The first 5 tokens are filename, channel, speaker, start- and end-time
for ($n=0; $n<5; $n++) {
$w = shift @tokens;
print CHARSTM ("$w ");
}
# CER is used only for some scripts, e.g. CJK. So only non-ASCII characters
# in the remaining tokens should be split into individual tokens.
$w = join (" ", @tokens);
$w =~ s:([^\x00-\x7F])(?=[^\x00-\x7F]):$1 :g; # split adjacent non-ASCII chars
print CHARSTM ("$w\n");
}
close(CHARSTM);
close(STM);
print STDERR ("$0: Wrote char.stm file $charStmFile\n");
################################################################################
# Write the reco2file_and_channel file for use by Kaldi scripts
################################################################################
print STDERR ("$0: Overwriting existing reco2file_and_channel file $reco2ctmFile\n")
if (-s $reco2ctmFile);
open(RECO2CTM, "| sort > $reco2ctmFile")
|| die "Unable to write to reco2file_and_channel file $reco2ctmFile";
$numRecordings = 0;
foreach $recordingID (sort keys %waveform) {
printf RECO2CTM ("%s %s %s\n", $recordingID, $waveform{$recordingID}, $channel{$recordingID});
++$numRecordings;
}
close(RECO2CTM);
print STDERR ("$0: Wrote file_and_channel info for $numRecordings recordings to $reco2ctmFile\n");
print STDERR ("$0: Done!\n");
exit(0);