Skip to content

Commit

Permalink
[scripts] Modify split_data.sh to split data evenly when utt2dur exis…
Browse files Browse the repository at this point in the history
…ts (#3653)
  • Loading branch information
RuABraun authored and danpovey committed Oct 15, 2019
1 parent 3492d6e commit f8bf717
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 12 deletions.
8 changes: 0 additions & 8 deletions egs/wsj/s5/utils/filter_scps.pl
Expand Up @@ -90,7 +90,6 @@

# Some variables that we set to produce a warning.
$warn_uncovered = 0;
$warn_multiply_covered = 0;

for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
$idlist_n = $idlist;
Expand Down Expand Up @@ -133,9 +132,6 @@
$warn_uncovered = 1;
} else {
@jobs = @{$id2jobs{$id}}; # this dereferences the array reference.
if (@jobs > 1) {
$warn_multiply_covered = 1;
}
foreach $job_id (@jobs) {
if (!defined $job2output{$job_id}) {
die "Likely code error";
Expand Down Expand Up @@ -164,7 +160,3 @@
# Report once, on stderr, that at least one input line was not written to any
# output; stays silent unless warnings are enabled through $print_warnings.
print STDERR "filter_scps.pl: warning: some input lines did not get output\n"
  if $warn_uncovered && $print_warnings;
if ($warn_multiply_covered && $print_warnings) {
print STDERR "filter_scps.pl: warning: some input lines were output to multiple files [OK if splitting per utt] " .
join(" ", @ARGV) . "\n";
}
12 changes: 11 additions & 1 deletion egs/wsj/s5/utils/split_data.sh
Expand Up @@ -67,6 +67,11 @@ if [ -f $data/text ] && [ $nu -ne $nt ]; then
echo "** use utils/fix_data_dir.sh to fix this."
fi

# The line count of spk2utt is taken as the number of speakers. A per-speaker
# split cannot produce more pieces than that, so bail out early with a hint
# instead of failing later.
ns=$(cat $data/spk2utt | wc -l)
if [ $numsplit -gt $ns ] && [ $split_per_spk = "true" ]; then
  echo "You should reduce the number of jobs ($numsplit) as there are not enough speakers ($ns)."
  exit 1
fi

if $split_per_spk; then
utt2spk_opt="--utt2spk=$data/utt2spk"
Expand All @@ -76,6 +81,11 @@ else
utt="utt"
fi

# Optional --utt2dur argument for the split_scp.pl call: left empty unless the
# data directory contains an utt2dur file.
utt2dur_opt=""
if test -f $data/utt2dur; then
  utt2dur_opt="--utt2dur=$data/utt2dur"
fi

s1=$data/split${numsplit}${utt}/1
if [ ! -d $s1 ]; then
need_to_split=true
Expand Down Expand Up @@ -108,7 +118,7 @@ fi
which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM

utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
utils/split_scp.pl $utt2spk_opt $utt2dur_opt $data/utt2spk $utt2spks || exit 1

for n in `seq $numsplit`; do
dsn=$data/split${numsplit}${utt}/$n
Expand Down
119 changes: 116 additions & 3 deletions egs/wsj/s5/utils/split_scp.pl
Expand Up @@ -47,6 +47,7 @@
$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";
$utt2dur_file = "";
$one_based = 0;

for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
Expand All @@ -59,6 +60,12 @@
$utt2spk_file=$1;
shift;
}

if ($ARGV[0] =~ "--utt2dur=(.+)") {
$utt2dur_file=$1;
shift;
}

if ($ARGV[0] eq '--one-based') {
$one_based = 1;
shift @ARGV;
Expand All @@ -69,15 +76,16 @@
$job_id - $one_based >= $num_jobs)) {
die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
($one_based ? " --one-based" : "") . "'\n"

}

$one_based
and $job_id--;

if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
die
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] [--utt2dur=<utt2dur_file>] in.scp out1.scp out2.scp ...
or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] [--utt2dur=<utt2dur_file>] in.scp [out.scp]
... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
}

Expand All @@ -95,8 +103,113 @@
}
}
}
if ($utt2spk_file ne "" && $utt2dur_file ne "" ) { # --utt2spk and --utt2dur
# Load the utterance-to-speaker map into %utt2spk. Every line must consist of
# exactly two whitespace-separated fields: <utterance-id> <speaker-id>.
# Three-argument open (the form the plain --utt2spk branch of this script
# uses) keeps the file name from being parsed for mode characters or having
# surrounding whitespace stripped.
open(U, '<', $utt2spk_file) || die "Failed to open utt2spk file $utt2spk_file";
while(<U>) {
  @A = split;
  @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
  ($u,$s) = @A;
  $utt2spk{$u} = $s;
}
# Release the handle explicitly instead of relying on the next open(U, ...).
close(U);
# Load per-utterance durations into %utt2dur and accumulate their total in
# $dursum. Every line must consist of exactly two whitespace-separated fields:
# <utterance-id> <duration>.
$dursum = 0.0;
# Three-argument open so the file name is taken literally.
open(U, '<', $utt2dur_file) || die "Failed to open utt2dur file $utt2dur_file";
while(<U>) {
  @A = split;
  # The message names the utt2dur file (it used to say "utt2spk", which sent
  # users looking at the wrong file).
  @A == 2 || die "Bad line $_ in utt2dur file $utt2dur_file";
  ($u,$d) = @A;
  $utt2dur{$u} = $d;
  $dursum += $d;
}
close(U);
# Read the input scp and group it by speaker:
#   @spkrs      speakers in the order they are first seen
#   %spk_count  number of scp lines per speaker
#   %spk_data   the raw scp lines per speaker
#   %spk2utt    the utterance ids per speaker
# Every utterance must have an entry in the utt2spk map.
open(I, "<$inscp") || die "Opening input scp file $inscp";
@spkrs = ();
while(<I>) {
  @A = split;
  @A > 0 || die "Empty or space-only line in scp file $inscp";
  $u = $A[0];
  $s = $utt2spk{$u};
  defined $s || die "No such utterance $u in utt2spk file $utt2spk_file";
  unless (defined $spk_count{$s}) {
    # First line seen for this speaker.
    push @spkrs, $s;
    $spk_count{$s} = 0;
    $spk_data{$s} = [];  # ref to new empty array.
  }
  $spk2utt{$s} = [] unless defined $spk2utt{$s};
  $spk_count{$s} += 1;
  push @{$spk_data{$s}}, $_;
  push @{$spk2utt{$s}}, $u;
}

# Assign whole speakers to output splits so that each split receives roughly
# the same total duration.
#
# On entry $dursum holds the sum of all durations read from the utt2dur file;
# it is reused below as the running duration of the split being filled.
$numspks = @spkrs;   # number of speakers.
$numscps = @OUTPUTS; # number of output files.
if ($numspks < $numscps) {
  die "Refusing to split data because number of speakers $numspks is less " .
      "than the number of output .scp files $numscps";
}
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
  $scparray[$scpidx] = []; # [] is array reference.
}
$splitdur = $dursum / $numscps;  # target duration per split.
$dursum = 0.0;
$scpidx = 0;
for my $spk (sort (keys %spk2utt)) {
  $scpcount[$scpidx] += $spk_count{$spk};
  push @{$scparray[$scpidx]}, $spk;
  for my $utt (@{$spk2utt{$spk}}) {
    $dur = $utt2dur{$utt};
    $dursum += $dur;
  }
  # Move on once the current split has reached its target, but never past the
  # last output index: a speaker filed under index $numscps or higher would
  # never be written out, silently dropping its lines (possible when trailing
  # speakers have zero or missing durations, or through rounding of the sums).
  if ( $dursum >= $splitdur && $scpidx + 1 < $numscps ) {
    $scpidx += 1;
    $dursum = 0.0;
  }
}

# The duration-driven pass above may stop before every output index has been
# used: all utterances of a speaker go into the same split, so a strongly
# unbalanced speaker can use up more than one split's share of the duration.
# When that happens, move one speaker from an earlier split into each split
# from $scpidx upwards.
if ( $scpidx + 1 < $numscps || @{$scparray[$scpidx]} == 0 ) {
# $scpdone: highest split index that holds speakers at this point.
$scpdone = $scpidx;
if ( @{$scparray[$scpidx]} == 0 ) {
$scpdone -= 1;
}
# Note that the loop starts at $scpidx itself, so when that split already
# holds speakers it still receives one more.
for(; $scpidx < $numscps; $scpidx++) {
# Donor search: the first split before $scpdone that holds more than one
# speaker; if there is none, $i ends up equal to $scpdone and that split
# is the donor.
$i = 0;
for(; $i < $scpdone; $i++) {
$numspk = @{$scparray[$i]};
if ($numspk > 1) {
last;
}
}
# Take the donor's last-listed speaker and keep the per-split line counts
# (checked again when the files are written) in step with the move.
$spk = pop @{$scparray[$i]};
$scpcount[$i] -= $spk_count{$spk};

push @{$scparray[$scpidx]}, $spk;
$scpcount[$scpidx] += $spk_count{$spk};
}
}

# Write every split to its output file. Within a file, speakers appear in
# sorted order and each speaker's scp lines are emitted as they were read.
# An empty split is reported on stderr and recorded in $error; a line count
# that disagrees with the bookkeeping in @scpcount is treated as a code error.
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
  $scpfn = $OUTPUTS[$scpidx];
  open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
  $count = 0;
  if(@{$scparray[$scpidx]} > 0) {
    for $spk ( sort @{$scparray[$scpidx]} ) {
      print F @{$spk_data{$spk}};
      $count += $spk_count{$spk};
    }
    $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
  } else {
    print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
    $error = 1;
  }
  close(F);
}
} elsif ($utt2spk_file ne "") { # We have the --utt2spk option...

if ($utt2spk_file ne "") { # We have the --utt2spk option...
open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
while(<$u_fh>) {
@A = split;
Expand Down

0 comments on commit f8bf717

Please sign in to comment.