From cc2628f085491615f12b5be1797eadbbdd9e51e8 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 1 Mar 2022 16:27:34 +0800 Subject: [PATCH] Update result for full libri + GigaSpeech using transducer_stateless. --- ...r-stateless-librispeech-multi-datasets.yml | 154 ++++++++++++++++++ README.md | 2 +- egs/librispeech/ASR/RESULTS-100hours.md | 2 + egs/librispeech/ASR/RESULTS.md | 82 +++++++++- .../train.py | 31 +++- 5 files changed, 264 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml new file mode 100644 index 0000000000..ccf9028cb1 --- /dev/null +++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml @@ -0,0 +1,154 @@ +# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com) + +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: run-pre-trained-transducer-stateless-multi-datasets-librispeech-960h + +on: + push: + branches: + - master + pull_request: + types: [labeled] + +jobs: + run_pre_trained_transducer_stateless_multi_datasets_librispeech_960h: + if: github.event.label.name == 'ready' || github.event_name == 'push' + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-18.04] + python-version: [3.7, 3.8, 3.9] + torch: ["1.10.0"] + torchaudio: ["0.10.0"] + k2-version: ["1.9.dev20211101"] + + fail-fast: false + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Python dependencies + run: | + python3 -m pip install --upgrade pip pytest + # numpy 1.20.x does not support python 3.6 + pip install numpy==1.19 + pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html + pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/ + + python3 -m pip install git+https://github.com/lhotse-speech/lhotse + python3 -m pip install kaldifeat + # We are in ./icefall and there is a file: requirements.txt in it + pip install -r requirements.txt + + - name: Install graphviz + shell: bash + run: | + python3 -m pip install -qq graphviz + sudo apt-get -qq install graphviz + + - name: Download pre-trained model + shell: bash + run: | + sudo apt-get -qq install git-lfs tree sox + cd egs/librispeech/ASR + mkdir tmp + cd tmp + git lfs install + git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01 + + + cd .. 
+ tree tmp + soxi tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/*.wav + ls -lh tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/*.wav + + - name: Run greedy search decoding (max-sym-per-frame 1) + shell: bash + run: | + export PYTHONPATH=$PWD:$PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless_multi_datasets/pretrained.py \ + --method greedy_search \ + --max-sym-per-frame 1 \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav + + - name: Run greedy search decoding (max-sym-per-frame 2) + shell: bash + run: | + export PYTHONPATH=$PWD:$PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless_multi_datasets/pretrained.py \ + --method greedy_search \ + --max-sym-per-frame 2 \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav + + - name: Run greedy search decoding 
(max-sym-per-frame 3) + shell: bash + run: | + export PYTHONPATH=$PWD:$PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless_multi_datasets/pretrained.py \ + --method greedy_search \ + --max-sym-per-frame 3 \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav + + - name: Run beam search decoding + shell: bash + run: | + export PYTHONPATH=$PWD:$PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless_multi_datasets/pretrained.py \ + --method beam_search \ + --beam-size 4 \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav + + + - name: Run modified beam search decoding + shell: bash + run: | + export PYTHONPATH=$PWD:$PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless_multi_datasets/pretrained.py \ + --method modified_beam_search \ + --beam-size 4 \ + --checkpoint 
./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01/test_wavs/1221-135766-0002.wav diff --git a/README.md b/README.md index aa77b5aa77..ec9d7e69c1 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ The best WER using modified beam search with beam size 4 is: | | test-clean | test-other | |-----|------------|------------| -| WER | 2.67 | 6.57 | +| WER | 2.61 | 6.46 | Note: No auxiliary losses are used in the training and no LMs are used in the decoding. diff --git a/egs/librispeech/ASR/RESULTS-100hours.md b/egs/librispeech/ASR/RESULTS-100hours.md index 40245c917f..2e1bbd6870 100644 --- a/egs/librispeech/ASR/RESULTS-100hours.md +++ b/egs/librispeech/ASR/RESULTS-100hours.md @@ -7,6 +7,8 @@ train-clean-100 subset as training data. ### 2022-02-21 +Using commit `2332ba312d7ce72f08c7bac1e3312f7e3dd722dc`. + | | test-clean | test-other | comment | |-------------------------------------|------------|------------|------------------------------------------| | greedy search (max sym per frame 1) | 6.34 | 16.7 | --epoch 57, --avg 17, --max-duration 100 | diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 45f23e95e9..cc2aebac17 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -52,11 +52,89 @@ avg=15 #### Conformer encoder + embedding decoder -Using commit `a8150021e01d34ecbd6198fe03a57eacf47a16f2`. - Conformer encoder + non-recurrent decoder. 
The decoder contains only an embedding layer and a Conv1d (with kernel size 2). +See + +- [./transducer_stateless](./transducer_stateless) +- [./transducer_stateless_multi_datasets](./transducer_stateless_multi_datasets) + +##### 2022-03-01 + +Using commit `fill in it after merging`. + +It uses [GigaSpeech](https://github.com/SpeechColab/GigaSpeech) +as extra training data. 20% of the time it selects a batch from the L subset of +GigaSpeech and 80% of the time it selects a batch from LibriSpeech. + +The WERs are + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|------------------------------------------| +| greedy search (max sym per frame 1) | 2.64 | 6.55 | --epoch 39, --avg 15, --max-duration 100 | +| modified beam search (beam size 4) | 2.61 | 6.46 | --epoch 39, --avg 15, --max-duration 100 | + +The training command for reproducing is given below: + +```bash +cd egs/librispeech/ASR/ +./prepare.sh +./prepare_giga_speech.sh + +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +./transducer_stateless_multi_datasets/train.py \ + --world-size 4 \ + --num-epochs 40 \ + --start-epoch 0 \ + --exp-dir transducer_stateless_multi_datasets/exp-full-2 \ + --full-libri 1 \ + --max-duration 300 \ + --lr-factor 5 \ + --bpe-model data/lang_bpe_500/bpe.model \ + --modified-transducer-prob 0.25 \ + --giga-prob 0.2 +``` + +The tensorboard training log can be found at + + +The decoding command is: + +```bash +epoch=39 +avg=15 +sym=1 + +# greedy search +./transducer_stateless_multi_datasets/decode.py \ + --epoch $epoch \ + --avg $avg \ + --exp-dir transducer_stateless_multi_datasets/exp-full-2 \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + --max-duration 100 \ + --context-size 2 \ + --max-sym-per-frame $sym + +# modified beam search +./transducer_stateless_multi_datasets/decode.py \ + --epoch $epoch \ + --avg $avg \ + --exp-dir transducer_stateless_multi_datasets/exp-full-2 \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + 
--max-duration 100 \ + --context-size 2 \ + --decoding-method modified_beam_search \ + --beam-size 4 +``` + + +##### 2022-02-07 + +Using commit `a8150021e01d34ecbd6198fe03a57eacf47a16f2`. + + The WERs are | | test-clean | test-other | comment | diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py index 720151ea02..105f82417a 100755 --- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py @@ -19,16 +19,39 @@ """ Usage: +cd egs/librispeech/ASR/ +./prepare.sh +./prepare_giga_speech.sh + +# 100-hours +export CUDA_VISIBLE_DEVICES="0,1" + +./transducer_stateless_multi_datasets/train.py \ + --world-size 2 \ + --num-epochs 60 \ + --start-epoch 0 \ + --exp-dir transducer_stateless_multi_datasets/exp-100-2 \ + --full-libri 0 \ + --max-duration 300 \ + --lr-factor 1 \ + --bpe-model data/lang_bpe_500/bpe.model \ + --modified-transducer-prob 0.25 \ + --giga-prob 0.2 + +# 960-hours export CUDA_VISIBLE_DEVICES="0,1,2,3" ./transducer_stateless_multi_datasets/train.py \ --world-size 4 \ - --num-epochs 30 \ + --num-epochs 40 \ --start-epoch 0 \ - --exp-dir transducer_stateless_multi_datasets/exp \ + --exp-dir transducer_stateless_multi_datasets/exp-full-2 \ --full-libri 1 \ - --max-duration 250 \ - --lr-factor 2.5 + --max-duration 300 \ + --lr-factor 5 \ + --bpe-model data/lang_bpe_500/bpe.model \ + --modified-transducer-prob 0.25 \ + --giga-prob 0.2 """