Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: add DFSMN related codes and example scripts #2437

Closed
wants to merge 16 commits into from
Closed
114 changes: 114 additions & 0 deletions egs/librispeech/s5/RESULTS
Original file line number Diff line number Diff line change
Expand Up @@ -681,3 +681,117 @@
%WER 14.64 [ 7664 / 52343, 818 ins, 956 del, 5890 sub ] exp/chain/tdnn_6z_sp_smbr/decode_test_other_tgsmall_epoch3/wer_13_0.0
%WER 14.70 [ 7696 / 52343, 835 ins, 945 del, 5916 sub ] exp/chain/tdnn_6z_sp_smbr/decode_test_other_tgsmall_epoch6/wer_13_0.0
%WER 14.75 [ 7722 / 52343, 892 ins, 849 del, 5981 sub ] exp/chain/tdnn_6z_sp_smbr/decode_test_other_tgsmall_epoch9/wer_12_0.0

# Results with nnet DFSMN_S + CE
# local/nnet/run_fsmn_ivector.sh DFSMN_S
# Training on the "cleaned" data
%WER 3.97 [ 2160 / 54402, 264 ins, 233 del, 1663 sub ] exp/tri7b_DFSMN_S/decode_fglarge_dev_clean/wer_10_1.0
%WER 4.10 [ 2230 / 54402, 270 ins, 242 del, 1718 sub ] exp/tri7b_DFSMN_S/decode_tglarge_dev_clean/wer_12_0.5
%WER 5.06 [ 2752 / 54402, 274 ins, 360 del, 2118 sub ] exp/tri7b_DFSMN_S/decode_tgmed_dev_clean/wer_13_0.5
%WER 5.55 [ 3022 / 54402, 335 ins, 347 del, 2340 sub ] exp/tri7b_DFSMN_S/decode_tgsmall_dev_clean/wer_11_0.0

%WER 4.61 [ 2423 / 52576, 313 ins, 288 del, 1822 sub ] exp/tri7b_DFSMN_S/decode_fglarge_test_clean/wer_12_1.0
%WER 4.69 [ 2468 / 52576, 325 ins, 282 del, 1861 sub ] exp/tri7b_DFSMN_S/decode_tglarge_test_clean/wer_13_0.5
%WER 5.69 [ 2992 / 52576, 370 ins, 344 del, 2278 sub ] exp/tri7b_DFSMN_S/decode_tgmed_test_clean/wer_11_0.5
%WER 6.28 [ 3301 / 52576, 366 ins, 428 del, 2507 sub ] exp/tri7b_DFSMN_S/decode_tgsmall_test_clean/wer_12_0.5

%WER 11.71 [ 5968 / 50948, 719 ins, 704 del, 4545 sub ] exp/tri7b_DFSMN_S/decode_fglarge_dev_other/wer_15_0.0
%WER 12.19 [ 6209 / 50948, 700 ins, 866 del, 4643 sub ] exp/tri7b_DFSMN_S/decode_tglarge_dev_other/wer_17_0.0
%WER 14.25 [ 7260 / 50948, 755 ins, 1011 del, 5494 sub ] exp/tri7b_DFSMN_S/decode_tgmed_dev_other/wer_15_0.0
%WER 15.35 [ 7823 / 50948, 741 ins, 1165 del, 5917 sub ] exp/tri7b_DFSMN_S/decode_tgsmall_dev_other/wer_15_0.0

%WER 12.01 [ 6289 / 52343, 649 ins, 930 del, 4710 sub ] exp/tri7b_DFSMN_S/decode_fglarge_test_other/wer_16_0.0
%WER 12.45 [ 6519 / 52343, 688 ins, 929 del, 4902 sub ] exp/tri7b_DFSMN_S/decode_tglarge_test_other/wer_15_0.0
%WER 14.56 [ 7622 / 52343, 715 ins, 1191 del, 5716 sub ] exp/tri7b_DFSMN_S/decode_tgmed_test_other/wer_15_0.0
%WER 15.68 [ 8208 / 52343, 743 ins, 1321 del, 6144 sub ] exp/tri7b_DFSMN_S/decode_tgsmall_test_other/wer_15_0.0

# Results with nnet DFSMN_S + CE + SMBR
# local/nnet/run_fsmn_ivector.sh DFSMN_S
# Training on the "cleaned" data
%WER 3.77 [ 2053 / 54402, 233 ins, 274 del, 1546 sub ] exp/tri7b_DFSMN_S_smbr/decode_fglarge_dev_clean/wer_27_0.5
%WER 3.86 [ 2102 / 54402, 231 ins, 264 del, 1607 sub ] exp/tri7b_DFSMN_S_smbr/decode_tglarge_dev_clean/wer_26_0.5
%WER 4.77 [ 2597 / 54402, 230 ins, 366 del, 2001 sub ] exp/tri7b_DFSMN_S_smbr/decode_tgmed_dev_clean/wer_29_0.5
%WER 5.21 [ 2836 / 54402, 290 ins, 329 del, 2217 sub ] exp/tri7b_DFSMN_S_smbr/decode_tgsmall_dev_clean/wer_26_0.0

%WER 4.26 [ 2239 / 52576, 250 ins, 313 del, 1676 sub ] exp/tri7b_DFSMN_S_smbr/decode_fglarge_test_clean/wer_27_1.0
%WER 4.34 [ 2282 / 52576, 265 ins, 306 del, 1711 sub ] exp/tri7b_DFSMN_S_smbr/decode_tglarge_test_clean/wer_29_0.5
%WER 5.22 [ 2746 / 52576, 319 ins, 315 del, 2112 sub ] exp/tri7b_DFSMN_S_smbr/decode_tgmed_test_clean/wer_28_0.0
%WER 5.81 [ 3055 / 52576, 357 ins, 329 del, 2369 sub ] exp/tri7b_DFSMN_S_smbr/decode_tgsmall_test_clean/wer_25_0.0

%WER 11.77 [ 5996 / 50948, 734 ins, 767 del, 4495 sub ] exp/tri7b_DFSMN_S_smbr/decode_fglarge_dev_other/wer_30_0.0
%WER 12.09 [ 6158 / 50948, 644 ins, 926 del, 4588 sub ] exp/tri7b_DFSMN_S_smbr/decode_tglarge_dev_other/wer_30_0.5
%WER 13.92 [ 7090 / 50948, 763 ins, 1018 del, 5309 sub ] exp/tri7b_DFSMN_S_smbr/decode_tgmed_dev_other/wer_30_0.0
%WER 14.85 [ 7564 / 50948, 749 ins, 1108 del, 5707 sub ] exp/tri7b_DFSMN_S_smbr/decode_tgsmall_dev_other/wer_30_0.0

%WER 11.65 [ 6099 / 52343, 595 ins, 1001 del, 4503 sub ] exp/tri7b_DFSMN_S_smbr/decode_fglarge_test_other/wer_30_0.5
%WER 12.09 [ 6329 / 52343, 686 ins, 907 del, 4736 sub ] exp/tri7b_DFSMN_S_smbr/decode_tglarge_test_other/wer_30_0.0
%WER 14.17 [ 7418 / 52343, 729 ins, 1115 del, 5574 sub ] exp/tri7b_DFSMN_S_smbr/decode_tgmed_test_other/wer_30_0.0
%WER 15.16 [ 7933 / 52343, 759 ins, 1228 del, 5946 sub ] exp/tri7b_DFSMN_S_smbr/decode_tgsmall_test_other/wer_30_0.0

# Results with nnet DFSMN_M + CE
# local/nnet/run_fsmn_ivector.sh DFSMN_M
# Training on the "cleaned" data
%WER 4.04 [ 2200 / 54402, 254 ins, 255 del, 1691 sub ] exp/tri7b_DFSMN_M/decode_fglarge_dev_clean/wer_12_1.0
%WER 4.15 [ 2257 / 54402, 266 ins, 247 del, 1744 sub ] exp/tri7b_DFSMN_M/decode_tglarge_dev_clean/wer_13_0.5
%WER 5.01 [ 2727 / 54402, 304 ins, 308 del, 2115 sub ] exp/tri7b_DFSMN_M/decode_tgmed_dev_clean/wer_12_0.5
%WER 5.54 [ 3014 / 54402, 356 ins, 306 del, 2352 sub ] exp/tri7b_DFSMN_M/decode_tgsmall_dev_clean/wer_11_0.0

%WER 4.50 [ 2367 / 52576, 328 ins, 263 del, 1776 sub ] exp/tri7b_DFSMN_M/decode_fglarge_test_clean/wer_13_0.5
%WER 4.63 [ 2436 / 52576, 328 ins, 279 del, 1829 sub ] exp/tri7b_DFSMN_M/decode_tglarge_test_clean/wer_12_0.5
%WER 5.50 [ 2894 / 52576, 331 ins, 373 del, 2190 sub ] exp/tri7b_DFSMN_M/decode_tgmed_test_clean/wer_13_0.5
%WER 5.94 [ 3124 / 52576, 381 ins, 368 del, 2375 sub ] exp/tri7b_DFSMN_M/decode_tgsmall_test_clean/wer_13_0.0

%WER 11.88 [ 6055 / 50948, 650 ins, 891 del, 4514 sub ] exp/tri7b_DFSMN_M/decode_fglarge_dev_other/wer_17_0.5
%WER 12.24 [ 6236 / 50948, 746 ins, 811 del, 4679 sub ] exp/tri7b_DFSMN_M/decode_tglarge_dev_other/wer_17_0.0
%WER 14.18 [ 7223 / 50948, 728 ins, 1056 del, 5439 sub ] exp/tri7b_DFSMN_M/decode_tgmed_dev_other/wer_17_0.0
%WER 15.17 [ 7731 / 50948, 758 ins, 1139 del, 5834 sub ] exp/tri7b_DFSMN_M/decode_tgsmall_dev_other/wer_16_0.0

%WER 12.23 [ 6404 / 52343, 716 ins, 908 del, 4780 sub ] exp/tri7b_DFSMN_M/decode_fglarge_test_other/wer_18_0.0
%WER 12.61 [ 6598 / 52343, 736 ins, 890 del, 4972 sub ] exp/tri7b_DFSMN_M/decode_tglarge_test_other/wer_16_0.0
%WER 14.55 [ 7614 / 52343, 710 ins, 1195 del, 5709 sub ] exp/tri7b_DFSMN_M/decode_tgmed_test_other/wer_17_0.0
%WER 15.51 [ 8119 / 52343, 736 ins, 1272 del, 6111 sub ] exp/tri7b_DFSMN_M/decode_tgsmall_test_other/wer_16_0.0

# Results with nnet DFSMN_L + CE
# local/nnet/run_fsmn_ivector.sh DFSMN_L
# Training on the "cleaned" data
%WER 3.93 [ 2136 / 54402, 287 ins, 210 del, 1639 sub ] exp/tri7b_DFSMN_L/decode_fglarge_dev_clean/wer_12_0.5
%WER 3.99 [ 2170 / 54402, 279 ins, 225 del, 1666 sub ] exp/tri7b_DFSMN_L/decode_tglarge_dev_clean/wer_12_0.5
%WER 4.78 [ 2598 / 54402, 317 ins, 272 del, 2009 sub ] exp/tri7b_DFSMN_L/decode_tgmed_dev_clean/wer_12_0.0
%WER 5.20 [ 2829 / 54402, 320 ins, 303 del, 2206 sub ] exp/tri7b_DFSMN_L/decode_tgsmall_dev_clean/wer_12_0.0

%WER 4.36 [ 2294 / 52576, 285 ins, 267 del, 1742 sub ] exp/tri7b_DFSMN_L/decode_fglarge_test_clean/wer_14_0.5
%WER 4.45 [ 2342 / 52576, 295 ins, 256 del, 1791 sub ] exp/tri7b_DFSMN_L/decode_tglarge_test_clean/wer_12_0.5
%WER 5.29 [ 2782 / 52576, 320 ins, 326 del, 2136 sub ] exp/tri7b_DFSMN_L/decode_tgmed_test_clean/wer_12_0.5
%WER 5.68 [ 2988 / 52576, 372 ins, 317 del, 2299 sub ] exp/tri7b_DFSMN_L/decode_tgsmall_test_clean/wer_11_0.0

%WER 11.71 [ 5964 / 50948, 584 ins, 881 del, 4499 sub ] exp/tri7b_DFSMN_L/decode_fglarge_dev_other/wer_17_0.5
%WER 12.13 [ 6179 / 50948, 668 ins, 822 del, 4689 sub ] exp/tri7b_DFSMN_L/decode_tglarge_dev_other/wer_17_0.0
%WER 13.86 [ 7063 / 50948, 694 ins, 1051 del, 5318 sub ] exp/tri7b_DFSMN_L/decode_tgmed_dev_other/wer_17_0.0
%WER 14.82 [ 7548 / 50948, 718 ins, 1075 del, 5755 sub ] exp/tri7b_DFSMN_L/decode_tgsmall_dev_other/wer_15_0.0

%WER 12.00 [ 6281 / 52343, 724 ins, 813 del, 4744 sub ] exp/tri7b_DFSMN_L/decode_fglarge_test_other/wer_16_0.0
%WER 12.43 [ 6505 / 52343, 719 ins, 858 del, 4928 sub ] exp/tri7b_DFSMN_L/decode_tglarge_test_other/wer_16_0.0
%WER 13.99 [ 7323 / 52343, 688 ins, 1125 del, 5510 sub ] exp/tri7b_DFSMN_L/decode_tgmed_test_other/wer_17_0.0
%WER 14.90 [ 7797 / 52343, 754 ins, 1106 del, 5937 sub ] exp/tri7b_DFSMN_L/decode_tgsmall_test_other/wer_15_0.0

# Results with nnet DFSMN_L + CE + SMBR
# Training on speed-perturbed and volume-perturbed "cleaned" data
%WER 3.60 [ 1959 / 54402, 243 ins, 171 del, 1545 sub ] exp/DFSMN_L_sp_vp_smbr/decode_fglarge_dev_clean/wer_26_0.0
%WER 3.69 [ 2010 / 54402, 194 ins, 226 del, 1590 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tglarge_dev_clean/wer_25_1.0
%WER 4.40 [ 2394 / 54402, 213 ins, 278 del, 1903 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tgmed_dev_clean/wer_25_0.5
%WER 4.79 [ 2608 / 54402, 261 ins, 267 del, 2080 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tgsmall_dev_clean/wer_19_0.5

%WER 3.96 [ 2083 / 52576, 280 ins, 177 del, 1626 sub ] exp/DFSMN_L_sp_vp_smbr/decode_fglarge_test_clean/wer_23_0.5
%WER 4.09 [ 2152 / 52576, 243 ins, 238 del, 1671 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tglarge_test_clean/wer_25_1.0
%WER 4.73 [ 2486 / 52576, 276 ins, 264 del, 1946 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tgmed_test_clean/wer_25_0.5
%WER 5.10 [ 2682 / 52576, 304 ins, 279 del, 2099 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tgsmall_test_clean/wer_21_0.5

%WER 10.21 [ 5203 / 50948, 531 ins, 650 del, 4022 sub ] exp/DFSMN_L_sp_vp_smbr/decode_fglarge_dev_other/wer_30_0.5
%WER 10.77 [ 5485 / 50948, 554 ins, 697 del, 4234 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tglarge_dev_other/wer_29_0.5
%WER 12.39 [ 6314 / 50948, 534 ins, 891 del, 4889 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tgmed_dev_other/wer_29_0.5
%WER 13.01 [ 6630 / 50948, 608 ins, 885 del, 5137 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tgsmall_dev_other/wer_30_0.0

%WER 10.39 [ 5439 / 52343, 560 ins, 637 del, 4242 sub ] exp/DFSMN_L_sp_vp_smbr/decode_fglarge_test_other/wer_30_0.5
%WER 10.89 [ 5702 / 52343, 580 ins, 696 del, 4426 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tglarge_test_other/wer_30_0.5
%WER 12.49 [ 6540 / 52343, 669 ins, 782 del, 5089 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tgmed_test_other/wer_30_0.0
%WER 13.29 [ 6958 / 52343, 737 ins, 799 del, 5422 sub ] exp/DFSMN_L_sp_vp_smbr/decode_tgsmall_test_other/wer_26_0.0
23 changes: 23 additions & 0 deletions egs/librispeech/s5/local/nnet/DFSMN_L.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<NnetProto>
<AffineTransform> <InputDim> 1020 <OutputDim> 2048 <Xavier> 1
<ParametricRelu> <InputDim> 2048 <OutputDim> 2048
<LinearTransform> <InputDim> 2048 <OutputDim> 512 <Xavier> 1
<Fsmn> <InputDim> 512 <OutputDim> 512 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<AffineTransform> <InputDim> 512 <OutputDim> 2048 <Xavier> 1
<ParametricRelu> <InputDim> 2048 <OutputDim> 2048
<AffineTransform> <InputDim> 2048 <OutputDim> 2048 <Xavier> 1
<ParametricRelu> <InputDim> 2048 <OutputDim> 2048
<LinearTransform> <InputDim> 2048 <OutputDim> 512 <Xavier> 1
<AffineTransform> <InputDim> 512 <OutputDim> 5777 <Xavier> 1
<Softmax> <InputDim> 5777 <OutputDim> 5777
</NnetProto>

19 changes: 19 additions & 0 deletions egs/librispeech/s5/local/nnet/DFSMN_M.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<NnetProto>
<AffineTransform> <InputDim> 1020 <OutputDim> 2048 <MaxNorm> 0.000000 <Xavier> 1
<ParametricRelu> <InputDim> 2048 <OutputDim> 2048
<LinearTransform> <InputDim> 2048 <OutputDim> 512 <ParamStddev> 0.010000 <Xavier> 1
<Fsmn> <InputDim> 512 <OutputDim> 512 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 512 <OutputDim> 512 <HidSize> 2048 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<AffineTransform> <InputDim> 512 <OutputDim> 2048 <Xavier> 1
<ParametricRelu> <InputDim> 2048 <OutputDim> 2048
<AffineTransform> <InputDim> 2048 <OutputDim> 2048 <Xavier> 1
<ParametricRelu> <InputDim> 2048 <OutputDim> 2048
<LinearTransform> <InputDim> 2048 <OutputDim> 512 <Xavier> 1
<AffineTransform> <InputDim> 512 <OutputDim> 5777 <Xavier> 1
<Softmax> <InputDim> 5777 <OutputDim> 5777
</NnetProto>

19 changes: 19 additions & 0 deletions egs/librispeech/s5/local/nnet/DFSMN_S.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<NnetProto>
<AffineTransform> <InputDim> 1020 <OutputDim> 1024 <MaxNorm> 0.000000 <Xavier> 1
<ParametricRelu> <InputDim> 1024 <OutputDim> 1024
<LinearTransform> <InputDim> 1024 <OutputDim> 384 <ParamStddev> 0.010000 <Xavier> 1
<Fsmn> <InputDim> 384 <OutputDim> 384 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 384 <OutputDim> 384 <HidSize> 1024 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 384 <OutputDim> 384 <HidSize> 1024 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 384 <OutputDim> 384 <HidSize> 1024 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 384 <OutputDim> 384 <HidSize> 1024 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<DeepFsmn> <InputDim> 384 <OutputDim> 384 <HidSize> 1024 <LOrder> 20 <ROrder> 20 <LStride> 2 <RStride> 2
<AffineTransform> <InputDim> 384 <OutputDim> 1024 <Xavier> 1
<ParametricRelu> <InputDim> 1024 <OutputDim> 1024
<AffineTransform> <InputDim> 1024 <OutputDim> 1024 <Xavier> 1
<ParametricRelu> <InputDim> 1024 <OutputDim> 1024
<LinearTransform> <InputDim> 1024 <OutputDim> 384 <Xavier> 1
<AffineTransform> <InputDim> 384 <OutputDim> 5777 <Xavier> 1
<Softmax> <InputDim> 5777 <OutputDim> 5777
</NnetProto>

144 changes: 144 additions & 0 deletions egs/librispeech/s5/local/nnet/run_fsmn_ivector.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/env bash
# Recipe driver for the DFSMN nnet1 models: fbank extraction, fMLLR
# alignment, cross-entropy training, decoding, sMBR training and decoding.
#
# Usage: local/nnet/run_fsmn_ivector.sh [--stage <n>] <dnn-model>
#   <dnn-model>  name of a network prototype under local/nnet/ (without the
#                ".proto" suffix), e.g. DFSMN_S, DFSMN_M or DFSMN_L.
#   --stage <n>  first stage to run (default 1); earlier stages are skipped.

# Defaults for command-line options. They must be assigned BEFORE
# utils/parse_options.sh is sourced: that script only accepts --<name> for
# variables that already exist, and an assignment placed after it would
# silently overwrite whatever the user passed.
stage=1

. ./path.sh
. ./cmd.sh

. utils/parse_options.sh || exit 1;

set -e
set -u
set -o pipefail
#########################

# With "set -u" a missing argument would otherwise abort with a bare
# "unbound variable" message; print a usage line instead.
if [ $# -lt 1 ]; then
  echo "Usage: $0 [--stage <n>] <dnn-model>" >&2
  echo "  e.g.: $0 DFSMN_S" >&2
  exit 1
fi

dnn_model=$1

## Make fbank features
# Stage 1: clone every data directory under data_fbank/ and compute
# filterbank features plus CMVN statistics for the clone.
if [ "$stage" -le 1 ]; then
  mkdir -p data_fbank

  for x in train_960_cleaned test_other test_clean dev_other dev_clean; do
    fbankdir=fbank/$x

    # Copy the directory *contents* ("src/." form). A plain
    # "cp -r data/$x data_fbank/$x" creates data_fbank/$x/$x when the target
    # already exists, i.e. whenever this stage is run a second time.
    mkdir -p "data_fbank/$x"
    cp -r "data/$x/." "data_fbank/$x/"

    steps/make_fbank.sh --nj 30 --cmd "$train_cmd" --fbank-config conf/fbank.cfg \
      "data_fbank/$x" "exp/make_fbank/$x" "$fbankdir"
    steps/compute_cmvn_stats.sh "data_fbank/$x" "exp/make_fbank/$x" "$fbankdir"
  done
fi
###############
# Stage 2: fMLLR alignments produced with the exp/tri6b_cleaned system for
# the training set (30 jobs) and for both dev sets (10 jobs each).
if [ $stage -le 2 ]; then
  ali_src=exp/tri6b_cleaned

  steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
    data/train_960_cleaned data/lang $ali_src ${ali_src}_ali_train_960_cleaned

  for dev in dev_clean dev_other; do
    steps/align_fmllr.sh --nj 10 --cmd "$train_cmd" \
      data/$dev data/lang $ali_src ${ali_src}_ali_$dev
  done
fi
##### CE training
# Stage 3: cross-entropy training of the network described by
# local/nnet/<dnn_model>.proto on fbank features with appended i-vectors.
# lrate, dir and data_fbk are set outside the "if" because later stages
# read dir and data_fbk even when this stage is skipped.
lrate=0.00001
dir=exp/tri7b_${dnn_model}
data_fbk=data_fbank
if [ $stage -le 3 ]; then
  proto=local/nnet/${dnn_model}.proto
  # Fail early on a mistyped model name instead of deep inside training.
  if [ ! -f "$proto" ]; then
    echo "$0: network prototype $proto not found" >&2
    exit 1
  fi

  # Merge the online i-vector lists of the training and dev sets into one
  # scp so a single --ivector argument covers all of them.
  # The target directory is not created by any earlier step in this script,
  # so create it here; without it the redirection below fails.
  ivec_root=exp/nnet3_cleaned
  ivec_merged=$ivec_root/ivectors_train_960_dev_hires
  mkdir -p "$ivec_merged"
  cat $ivec_root/ivectors_train_960_cleaned_hires/ivector_online.scp \
    $ivec_root/ivectors_dev_clean_hires/ivector_online.scp \
    $ivec_root/ivectors_dev_other_hires/ivector_online.scp \
    > "$ivec_merged/ivector_online.scp"

  $cuda_cmd $dir/_train_nnet.log \
    steps/nnet/train_faster.sh --learn-rate $lrate --nnet-proto $proto \
    --start_half_lr 5 --momentum 0.9 \
    --train-tool "nnet-train-fsmn-streams" \
    --feat-type plain --splice 1 \
    --cmvn-opts "--norm-means=true --norm-vars=false" --delta_opts "--delta-order=2" \
    --train-tool-opts "--minibatch-size=4096" \
    --ivector scp:exp/nnet3_cleaned/ivectors_train_960_dev_hires/ivector_online.scp \
    --ivector-append-tool "append-ivector-to-feats --online-ivector-period=10" \
    $data_fbk/train_960_cleaned $data_fbk/dev_clean data/lang exp/tri6b_cleaned_ali_train_960_cleaned exp/tri6b_cleaned_ali_dev_clean $dir
fi
#### Decode
# Stage 4: decode all four evaluation sets with the CE-trained model using
# the tgsmall graph, rescore the result with the tgmed, tglarge and fglarge
# language models, then print the best WER for every (set, LM) pair.
acwt=0.08
if [ $stage -le 4 ]; then
  gmm=exp/tri6b_cleaned
  dataset="test_clean dev_clean test_other dev_other"
  for set in $dataset; do
    # NOTE: the scoring option must be spelled "--scoring-opts". A bare
    # "scoring_opts" word is treated as the first positional argument, which
    # stops option parsing and makes the called script reject its arguments.
    steps/nnet/decode.sh --nj 16 --cmd "$decode_cmd" \
      --scoring-opts "--min-lmwt 10 --max-lmwt 30" \
      --config conf/decode.config --acwt $acwt \
      $gmm/graph_tgsmall \
      $data_fbk/$set $dir/decode_tgsmall_${set}

    steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
      $data_fbk/$set $dir/decode_{tgsmall,tgmed}_${set}

    steps/lmrescore_const_arpa.sh \
      --scoring-opts "--min-lmwt 10 --max-lmwt 30" \
      --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
      $data_fbk/$set $dir/decode_{tgsmall,tglarge}_${set}

    steps/lmrescore_const_arpa.sh \
      --scoring-opts "--min-lmwt 10 --max-lmwt 30" \
      --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
      $data_fbk/$set $dir/decode_{tgsmall,fglarge}_${set}
  done

  # Summarise: one best-WER line per evaluation set and language model.
  for set in $dataset; do
    for lm in fglarge tglarge tgmed tgsmall; do
      grep WER $dir/decode_${lm}_${set}*/wer* | ./utils/best_wer.sh
    done
  done
fi

# Stage 5: sequence-discriminative (sMBR) training on top of the CE model.
# Three steps run under the same stage number: realign the training data
# with the CE network, generate denominator lattices, then run two sMBR
# iterations whose result is written to ${dir}_smbr.
nj=32
if [ $stage -le 5 ]; then
  smbr_data=$data_fbk/train_960_cleaned

  steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \
    $smbr_data data/lang $dir ${dir}_ali

  steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" \
    --config conf/decode_dnn.config --acwt $acwt \
    $smbr_data data/lang $dir ${dir}_denlats

  #### do smbr
  steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 2 \
    --learn-rate 0.0000002 --acwt $acwt --do-smbr true \
    $smbr_data data/lang $dir ${dir}_ali ${dir}_denlats ${dir}_smbr
fi

### Decode
# Stage 6: decode all four evaluation sets with the sMBR-trained model using
# the tgsmall graph, rescore the result with the tgmed, tglarge and fglarge
# language models, then print the best WER for every (set, LM) pair.
# dir and acwt are reassigned here, outside the "if", so they take effect
# whether or not earlier stages ran in this invocation.
dir=${dir}_smbr
acwt=0.03
if [ $stage -le 6 ]; then
  gmm=exp/tri6b_cleaned
  dataset="test_clean dev_clean test_other dev_other"
  for set in $dataset; do
    # NOTE: the scoring option must be spelled "--scoring-opts". A bare
    # "scoring_opts" word is treated as the first positional argument, which
    # stops option parsing and makes the called script reject its arguments.
    steps/nnet/decode.sh --nj 16 --cmd "$decode_cmd" \
      --scoring-opts "--min-lmwt 10 --max-lmwt 30" \
      --config conf/decode_dnn.config --acwt $acwt \
      $gmm/graph_tgsmall \
      $data_fbk/$set $dir/decode_tgsmall_${set}

    steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \
      $data_fbk/$set $dir/decode_{tgsmall,tgmed}_${set}

    steps/lmrescore_const_arpa.sh \
      --scoring-opts "--min-lmwt 10 --max-lmwt 30" \
      --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \
      $data_fbk/$set $dir/decode_{tgsmall,tglarge}_${set}

    steps/lmrescore_const_arpa.sh \
      --scoring-opts "--min-lmwt 10 --max-lmwt 30" \
      --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \
      $data_fbk/$set $dir/decode_{tgsmall,fglarge}_${set}
  done

  # Summarise: one best-WER line per evaluation set and language model.
  for set in $dataset; do
    for lm in fglarge tglarge tgmed tgsmall; do
      grep WER $dir/decode_${lm}_${set}*/wer* | ./utils/best_wer.sh
    done
  done

fi

10 changes: 10 additions & 0 deletions egs/librispeech/s5/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -372,5 +372,15 @@ local/chain/run_tdnn.sh # set "--stage 11" if you have already run local/nnet3/r
# ## to train but slightly worse.
# # local/online/run_nnet2.sh


# ## Training FSMN models on the cleaned-up data
# ## Three configurations of DFSMN with different model size: DFSMN_S, DFSMN_M, DFSMN_L
local/nnet/run_fsmn_ivector.sh DFSMN_S
# local/nnet/run_fsmn_ivector.sh DFSMN_M
# local/nnet/run_fsmn_ivector.sh DFSMN_L

# Wait for decodings in the background


# Wait for decodings in the background
wait