WIP: cued-rnnlm n-best and lattice rescoring #971

Open · wants to merge 17 commits into base: master
36 changes: 27 additions & 9 deletions egs/ami/s5/cmd.sh
@@ -26,13 +26,31 @@ export cuda_cmd="queue.pl --gpu 1 --mem 20G"
#export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4"
#export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00"

if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then
# BUT cluster:
queue="all.q@@blade,all.q@@speech"
gpu_queue="long.q@@gpu"
storage="matylda5"
export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1"
export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5"
export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1"
fi
#<<<<<<< HEAD
# JSALT2015 workshop, cluster AWS-EC2, (setup from Vijay)
export train_cmd="queue.pl -l arch=*64* --mem 1G"
export decode_cmd="queue.pl -l arch=*64* --mem 2G"
export highmem_cmd="queue.pl -l arch=*64* --mem 4G"
export scoring_cmd="queue.pl -l arch=*64*"
export cuda_cmd="queue.pl --gpu 1 -l mem_free=20G,ram_free=20G"
export cuda_mem_cmd="queue.pl --gpu 1 -l mem_free=42G,ram_free=42G"
export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2"

# To run locally, use:
#export train_cmd=run.pl
#export decode_cmd=run.pl
#export highmem_cmd=run.pl
#export cuda_cmd=run.pl

#=======
#>>>>>>> 6c7c0170812a1f7dfb5c09c078787e79ee72333a
#if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then
# # BUT cluster:
# queue="all.q@@blade,all.q@@speech"
# gpu_queue="long.q@@gpu"
# storage="matylda5"
# export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1"
# export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5"
# export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1"
#fi

13 changes: 10 additions & 3 deletions egs/ami/s5/local/nnet3/run_ivector_common.sh
@@ -9,6 +9,7 @@ num_threads_ubm=32
nj=10
use_ihm_ali=false
use_sat_alignments=true
nj=30

. cmd.sh
. ./path.sh
@@ -109,9 +110,15 @@ if [ $stage -le 7 ]; then
fi


if [ $stage -le 8 ]; then
rm -f exp/$mic/nnet3/.error 2>/dev/null
ivectordir=exp/$mic/nnet3/ivectors_train_sp_hires
#<<<<<<< HEAD
if [ $stage -le 6 ]; then
if [ -f exp/$mic/nnet3/.error ]; then rm exp/$mic/nnet3/.error 2>/dev/null; fi
ivectordir=exp/$mic/nnet3/ivectors_${train_set}_hires
#=======
#if [ $stage -le 8 ]; then
# rm -f exp/$mic/nnet3/.error 2>/dev/null
# ivectordir=exp/$mic/nnet3/ivectors_train_sp_hires
#>>>>>>> 6c7c0170812a1f7dfb5c09c078787e79ee72333a
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
39 changes: 39 additions & 0 deletions egs/ami/s5/local/run_cued_rnnlm.sh
@@ -0,0 +1,39 @@
#!/bin/bash

mic=sdm1
crit=vr
n=50
ngram_order=4
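# (A note on the defaults above: mic selects the AMI recording condition, n is the size of
#  the n-best lists, and ngram_order caps the word history used in lattice rescoring.
#  crit is the CUED-RNNLM training criterion; 'vr' presumably stands for variance
#  regularisation, with 'ce' (cross entropy) as the usual alternative.)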

. ./utils/parse_options.sh
. ./cmd.sh
. ./path.sh

set -e

local/train_cued_rnnlms.sh --crit $crit --train-text data/$mic/train/text data/$mic/cued_rnn_$crit

final_lm=ami_fsh.o3g.kn
LM=$final_lm.pr1-7

for decode_set in dev eval; do
dir=exp/$mic/nnet3/tdnn_sp/
decode_dir=${dir}/decode_${decode_set}

# N-best rescoring
steps/rnnlmrescore.sh \
--rnnlm-ver cuedrnnlm \
--N $n --cmd "$decode_cmd --mem 16G" --inv-acwt 10 0.5 \
data/lang_$LM data/$mic/cued_rnn_$crit \
data/$mic/$decode_set ${decode_dir} \
${decode_dir}.rnnlm.$crit.cued.$n-best

# Lattice rescoring
steps/lmrescore_rnnlm_lat.sh \
--cmd "$decode_cmd --mem 16G" \
--rnnlm-ver cuedrnnlm --weight 0.5 --max-ngram-order $ngram_order \
data/lang_$LM data/$mic/cued_rnn_$crit \
data/$mic/${decode_set}_hires ${decode_dir} \
${decode_dir}.rnnlm.$crit.cued.lat.${ngram_order}gram
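  # In both calls above, the bare 0.5 (and --weight 0.5) is the RNNLM interpolation weight,
  # i.e. the original n-gram LM scores and the CUED-RNNLM scores are mixed equally.
  # --max-ngram-order limits how much word history is used to distinguish RNNLM states
  # during lattice rescoring, which keeps the expanded lattice to a manageable size.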

done
38 changes: 38 additions & 0 deletions egs/ami/s5/local/run_rnnlm.sh
@@ -0,0 +1,38 @@
#!/bin/bash

mic=sdm1

. ./utils/parse_options.sh
. ./cmd.sh
. ./path.sh

local/train_rnnlms.sh --train-text data/$mic/train/text data/$mic/mik_rnn

final_lm=ami_fsh.o3g.kn
LM=$final_lm.pr1-7

graph_dir=exp/$mic/tri4a/graph_${LM}


for decode_set in eval dev; do
dir=exp/$mic/nnet3/tdnn_sp/
decode_dir=${dir}/decode_${decode_set}

# N-best rescoring with Tomas Mikolov's version.
( steps/rnnlmrescore.sh \
--rnnlm-ver rnnlm-0.3e \
--N 50 --cmd "$decode_cmd --mem 16G" --inv-acwt 10 0.5 \
data/lang_$LM data/$mic/mik_rnn \
data/$mic/$decode_set ${decode_dir} \
${decode_dir}.rnnlm.mik.50-best || exit 1 ) &

# Lattice rescoring with Tomas Mikolov's version.
( steps/lmrescore_rnnlm_lat.sh \
--weight 0.5 --cmd "$decode_cmd --mem 16G" --max-ngram-order 5 \
data/lang_$LM data/$mic/mik_rnn \
data/$mic/$decode_set ${decode_dir} \
${decode_dir}.rnnlm.mik.lat || exit 1;) &
done
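# Both rescoring passes for each decode set are launched as background subshells; the
# 'wait' below blocks until all of them have finished.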

wait

114 changes: 114 additions & 0 deletions egs/ami/s5/local/train_cued_rnnlms.sh
@@ -0,0 +1,114 @@
#!/bin/bash

train_text=
nwords=10000
hidden=200
cachesize=20
crit=ce

rnnlm_ver=cuedrnnlm

bptt=5

#set -v
set -e

. path.sh
. cmd.sh

. utils/parse_options.sh

if [ $# != 1 ]; then
echo "Usage: $0 [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi

dir=$1
srcdir=data/local/dict

mkdir -p $dir

$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH

cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all

# Get training data with OOV words (w.r.t. our current vocab) replaced with <unk>.
cat $train_text | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=2;i<=NF;i++) if ($i in v) printf $i" ";else printf "<unk> ";print ""}'|sed 's/ $//g' \
| perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
| gzip -c > $dir/all.gz
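# (The awk loop starts at i=2 because field 1 is the utterance-id, which is dropped here;
#  the Perl shuffle randomises the sentence order so that the head/tail split below gives
#  a random held-out set.)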

echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent > $dir/train.in # training data


# The rest will consist of a word-class represented by <RNN_UNK>, that
# maps (with probabilities) to a whole class of words.

# Get unigram counts from our training data, and use this to select word-list
# for RNNLM training; e.g. 10k most frequent words. Rest will go in a class
# that we (manually, at the shell level) assign probabilities for words that
# are in that class. Note: this word-list doesn't need to include </s>; this
# automatically gets added inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.

# get rid of this design -
cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts

total_nwords=`wc -l $dir/unigram.counts | awk '{print $1}'`

#head -$nwords_input $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn.input | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id.input
#head -$nwords_output $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn.output | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id.output
head -$nwords $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id
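# wordlist.rnn holds the $nwords most frequent words; wordlist.rnn.id pairs each of them
# with a 0-based integer id and is passed below as both the input and the output
# shortlist (-inputwlist/-outputwlist) of the CUED-RNNLM.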

tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts

for type in train valid; do
mv $dir/$type.in $dir/$type
done

# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train

# OK we'll train the RNNLM on this data.

echo "Training CUED-RNNLM on GPU"

layer_str=$[$nwords+2]:$hidden:$[$nwords+2]
bptt_delay=0

echo $layer_str > $dir/layer_string
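# layer_str describes the network topology as input:hidden:output layer sizes; the +2 on
# the input/output sizes presumably reserves nodes for the sentence-boundary and
# out-of-shortlist tokens. -fullvocsize passes the total vocabulary size, presumably so
# that out-of-shortlist probability mass can be spread over words outside the shortlist.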
$cuda_mem_cmd $dir/rnnlm.log \
steps/train_cued_rnnlm.sh -train -trainfile $dir/train \
-validfile $dir/valid -minibatch 64 -layers $layer_str \
-bptt $bptt -bptt-delay $bptt_delay -traincrit $crit -lrtune newbob \
-inputwlist $dir/wordlist.rnn.id -outputwlist $dir/wordlist.rnn.id \
-independent 1 -learnrate 1.0 \
-fullvocsize $total_nwords \
-writemodel $dir/rnnlm -randseed 1 -debug 2

touch $dir/unk.probs # dummy file, not used for cued-rnnlm

# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids

utils/rnnlm_compute_scores.sh --rnnlm_ver $rnnlm_ver $dir $dir/tmp.valid $dir/valid.with_ids $dir/valid.scores

nw=`cat $dir/valid.with_ids | sed 's= =\n=g' | wc -l | awk '{print $1}'`
# Note: valid.with_ids includes one utterance-id per sentence, which accounts for the
# </s> at the end of each sentence, so this is the correct count to normalize by.
# We avoid "wc -w" here because it does not handle non-ASCII characters well.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
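# (Here perplexity = exp(total score / nw), which assumes the per-sentence values in
#  valid.scores are costs, i.e. negated log-probabilities in nats.)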
echo Perplexity is $p | tee $dir/perplexity.log

120 changes: 120 additions & 0 deletions egs/ami/s5/local/train_rnnlms.sh
@@ -0,0 +1,120 @@
#!/bin/bash

train_text=
nwords=10000
hidden=200

rand_seed=0
cmd=run.pl
class=120 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=10 # Number of weights that are used for "direct" connections, in millions.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
threads=1 # for RNNLM-HS
bptt=3 # length of BPTT unfolding in RNNLM
bptt_block=20 # length of BPTT unfolding in RNNLM
dict_suffix=
# End configuration section.

. path.sh
. cmd.sh

. utils/parse_options.sh

if [ $# != 1 ]; then
echo "Usage: $0 [options] <dest-dir>"
echo "For options, see top of script file"
exit 1;
fi

dir=$1
srcdir=data/local/dict

mkdir -p $dir

$KALDI_ROOT/tools/extras/check_for_rnnlm.sh "$rnnlm_ver" || exit 1
export PATH=$KALDI_ROOT/tools/$rnnlm_ver:$PATH

cat $srcdir/lexicon.txt | awk '{print $1}' | grep -v -w '!SIL' > $dir/wordlist.all

# Get training data with OOV words (w.r.t. our current vocab) replaced with <UNK>.
cat $train_text | awk -v w=$dir/wordlist.all \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=2;i<=NF;i++) if ($i in v) printf $i" ";else printf "<UNK> ";print ""}'|sed 's/ $//g' \
| gzip -c > $dir/all.gz

echo "Splitting data into train and validation sets."
heldout_sent=10000
gunzip -c $dir/all.gz | head -n $heldout_sent > $dir/valid.in # validation data
gunzip -c $dir/all.gz | tail -n +$heldout_sent | \
perl -e ' use List::Util qw(shuffle); @A=<>; print join("", shuffle(@A)); ' \
> $dir/train.in # training data


# The rest will consist of a word-class represented by <RNN_UNK>, that
# maps (with probabilities) to a whole class of words.

# Get unigram counts from our training data, and use this to select word-list
# for RNNLM training; e.g. 10k most frequent words. Rest will go in a class
# that we (manually, at the shell level) assign probabilities for words that
# are in that class. Note: this word-list doesn't need to include </s>; this
# automatically gets added inside the rnnlm program.
# Note: by concatenating with $dir/wordlist.all, we are doing add-one
# smoothing of the counts.

cat $dir/train.in $dir/wordlist.all | grep -v '</s>' | grep -v '<s>' | \
awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \
sort -nr > $dir/unigram.counts

head -$nwords $dir/unigram.counts | awk '{print $2}' | tee $dir/wordlist.rnn | awk '{print NR-1, $1}' > $dir/wordlist.rnn.id

tail -n +$nwords $dir/unigram.counts > $dir/unk_class.counts

tot=`awk '{x=x+$1} END{print x}' $dir/unk_class.counts`
awk -v tot=$tot '{print $2, ($1*1.0/tot);}' <$dir/unk_class.counts >$dir/unk.probs
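# unk.probs gives each out-of-shortlist word its relative frequency within the <RNN_UNK>
# class; these per-word probabilities are used when the RNNLM assigns mass to that class.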


for type in train valid; do
cat $dir/$type.in | awk -v w=$dir/wordlist.rnn \
'BEGIN{while((getline<w)>0) v[$1]=1;}
{for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf "<RNN_UNK> ";print ""}'|sed 's/ $//g' \
> $dir/$type
done
rm $dir/train.in # no longer needed-- and big.

# Now randomize the order of the training data.
cat $dir/train | awk -v rand_seed=$rand_seed 'BEGIN{srand(rand_seed);} {printf("%f\t%s\n", rand(), $0);}' | \
sort | cut -f 2 > $dir/foo
mv $dir/foo $dir/train

# OK we'll train the RNNLM on this data.

# todo: change 100 to 320.
# using 100 classes as square root of 10k.
echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/100.rnnlm \
# -hidden 100 -rand-seed 1 -debug 2 -class 100 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 1000 -binary >& $dir/rnnlm1.log &

$cmd $dir/rnnlm.log \
$KALDI_ROOT/tools/$rnnlm_ver/rnnlm -threads $threads -independent -train $dir/train -valid $dir/valid \
-rnnlm $dir/rnnlm -hidden $hidden -rand-seed 1 -debug 2 -class $class -bptt $bptt -bptt-block $bptt_block \
-direct-order 4 -direct $direct -binary
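# Options above: -class factorises the output layer into $class frequency-based classes,
# -direct/-direct-order add hash-based maximum-entropy n-gram features (size in millions
# of weights, as noted in the config section), and -independent resets the hidden state at
# sentence boundaries so each sentence is scored independently.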

# make it like a Kaldi table format, with fake utterance-ids.
cat $dir/valid.in | awk '{ printf("uttid-%d ", NR); print; }' > $dir/valid.with_ids

utils/rnnlm_compute_scores.sh --rnnlm_ver $rnnlm_ver $dir $dir/tmp.valid $dir/valid.with_ids \
$dir/valid.scores
nw=`wc -w < $dir/valid.with_ids` # Note: valid.with_ids includes one utterance-id per sentence,
# which accounts for the </s> at the end of each sentence, so this is the correct count to
# normalize by.
p=`awk -v nw=$nw '{x=x+$2} END{print exp(x/nw);}' <$dir/valid.scores`
echo Perplexity is $p | tee $dir/perplexity.log

rm $dir/train $dir/all.gz

# This is a better setup, but takes a long time to train:
#echo "Training RNNLM (note: this uses a lot of memory! Run it on a big machine.)"
#time rnnlm -train $dir/train -valid $dir/valid -rnnlm $dir/320.rnnlm \
# -hidden 320 -rand-seed 1 -debug 2 -class 300 -bptt 2 -bptt-block 20 \
# -direct-order 4 -direct 2000 -binary
3 changes: 2 additions & 1 deletion egs/wsj/s5/local/wsj_train_rnnlms.sh
@@ -18,7 +18,8 @@ nwords=10000 # This is how many words we're putting in the vocab of the RNNLM.
hidden=30
class=200 # Num-classes... should be somewhat larger than sqrt of nwords.
direct=1000 # Number of weights that are used for "direct" connections, in millions.
rnnlm_ver=rnnlm-0.3e # version of RNNLM to use
rnnlm_ver=cuedrnnlm # version of RNNLM to use
rnnlm_ver=rnnlm-0.3e
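# (The second assignment takes effect, so rnnlm-0.3e remains the default; setting
#  rnnlm_ver=cuedrnnlm instead selects the CUED toolkit.)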
threads=1 # for RNNLM-HS
bptt=2 # length of BPTT unfolding in RNNLM
bptt_block=20 # length of BPTT unfolding in RNNLM