Unfortunately Paraformer packages are not compatible with ESPnet. Run
```
conda create -n funasr python=3.10  # installing jiwer fails on 3.11 (levenshtein dependency)
conda activate funasr
pip install jiwer==2.5  # 2.6 onwards conflicts with funasr's g2p click dependency
pip install -U modelscope funasr pywordseg torchaudio charset-normalizer tabulate overrides==3
```
before running this.

In [2]:
from pathlib import Path
import os
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

import jiwer
import re

PWD = %pwd
PWD = Path(PWD)
outputs_dir = PWD / 'outputs'
os.makedirs(outputs_dir, exist_ok=True)
asr_dir = outputs_dir / 'asr-aishell2'
os.makedirs(asr_dir, exist_ok=True)
jets_dir = outputs_dir / 'tts_train_jets_raw_phn_tacotron_g2p_en_no_space/aishell3'
nopitch_dir = outputs_dir / 'tts_train_jets_raw_phn_tacotron_g2p_en_no_space/aishell3_nopitch'
baseline_dir = outputs_dir / 'zm-text-tts/aishell3_nopitch'
data_dir = Path('../../datasets/data_aishell3/').resolve()

In [None]:
os.getcwd()

In [2]:
asr_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch',
    # model='damo/speech_paraformer_asr_nat-aishell1-pytorch',
)

2023-11-01 02:04:04,454 - modelscope - INFO - initiate model from /home/perry/.cache/modelscope/hub/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch
2023-11-01 02:04:04,454 - modelscope - INFO - initiate model from location /home/perry/.cache/modelscope/hub/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch.
2023-11-01 02:04:04,456 - modelscope - INFO - initialize model from /home/perry/.cache/modelscope/hub/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch


In [4]:
# Try running ASR on the first transcript
asr_pipeline(audio_in=str(jets_dir / 'SSB00090179.wav'))

2023-11-01 02:05:17,789 - modelscope - INFO - Decoding with wav files ...
2023-11-01 02:05:17,851 - modelscope - INFO - Computing the result of ASR ...


{'text': '在赛斯'}

In [7]:
import logging
logging.basicConfig(level=logging.WARNING)

In [5]:
def run_asr(wav_paths, asr_result_path):
    with open(asr_result_path, 'w') as f:
        for wav_path in wav_paths:
            try:
                result = asr_pipeline(audio_in=str(wav_path))
                text = result['text']
            except TypeError:
                text = ''
            f.write(f'{wav_path.parts[-1]} {text}\n')

In [3]:
gt_dir = data_dir / 'test/wav'
gt_paths = sorted(gt_dir.glob('*/*.wav'))
gt_asr_path = asr_dir / 'gt_result.txt'

jets_paths = sorted(jets_dir.glob('*.wav'))
jets_asr_path = asr_dir / 'jets_result.txt'

nopitch_paths = sorted(nopitch_dir.glob('*.wav'))
nopitch_asr_path = asr_dir / 'nopitch_result.txt'

baseline_paths = sorted(nopitch_dir.glob('*.wav'))
baseline_asr_path = asr_dir / 'baseline_result.txt'

In [None]:
run_asr(gt_paths, gt_asr_path)

In [6]:
run_asr(jets_paths, jets_asr_path)

2023-09-29 01:50:16,931 - modelscope - INFO - Decoding with wav files ...
2023-09-29 01:50:18,001 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 01:50:18,002 - modelscope - INFO - Decoding with wav files ...
2023-09-29 01:50:18,035 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 01:50:18,036 - modelscope - INFO - Decoding with wav files ...
2023-09-29 01:50:18,069 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 01:50:18,073 - modelscope - INFO - Decoding with wav files ...
2023-09-29 01:50:18,106 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 01:50:18,107 - modelscope - INFO - Decoding with wav files ...
2023-09-29 01:50:18,138 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 01:50:18,138 - modelscope - INFO - Decoding with wav files ...
2023-09-29 01:50:18,169 - modelscope - INFO - Decoding with wav files ...
2023-09-29 01:50:18,191 - modelscope - INFO - Decoding with wav files ...
2023-09-29 01:50:1

In [7]:
run_asr(nopitch_paths, nopitch_asr_path)

2023-09-29 02:06:48,498 - modelscope - INFO - Decoding with wav files ...
2023-09-29 02:06:48,537 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 02:06:48,539 - modelscope - INFO - Decoding with wav files ...
2023-09-29 02:06:48,572 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 02:06:48,573 - modelscope - INFO - Decoding with wav files ...
2023-09-29 02:06:48,601 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 02:06:48,602 - modelscope - INFO - Decoding with wav files ...
2023-09-29 02:06:48,632 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 02:06:48,633 - modelscope - INFO - Decoding with wav files ...
2023-09-29 02:06:48,666 - modelscope - INFO - Computing the result of ASR ...
2023-09-29 02:06:48,667 - modelscope - INFO - Decoding with wav files ...
2023-09-29 02:06:48,687 - modelscope - INFO - Decoding with wav files ...
2023-09-29 02:06:48,711 - modelscope - INFO - Decoding with wav files ...
2023-09-29 02:06:4

In [None]:
run_asr(baseline_paths, baseline_asr_path)

In [4]:
transcript_file = data_dir / 'test/content.txt'

In [5]:
def get_transcripts():
    transcripts = {}
    with open(transcript_file) as f:
        for line in f:
            wav_file, transcript = line.strip().split(maxsplit=1)
            transcripts[wav_file] = re.sub(r'[ a-z0-9]', '', transcript)
    return transcripts

transcripts = get_transcripts()

In [14]:
def eval_wer(transcripts, asr_result_path, wer_path):
    with open(wer_path, 'w') as wer_file:
        wer_file.write('wav_file,gt_len,wer,eng_chars\n')
        with open(asr_result_path) as f:
            for line in f:
                wav_file, asr_output = line.strip('\n').split(' ', maxsplit=1)
                asr_output = list(asr_output)
                eng_chars = sum([word.isascii() for word in asr_output])
                transcript = transcripts[wav_file]
                gt_len = len(transcript)
                wer = jiwer.wer(truth=' '.join(transcript), hypothesis=' '.join(asr_output))
                wer_file.write(f'{wav_file},{gt_len},{wer},{eng_chars}\n')

In [15]:
jets_wer_path = asr_dir / 'jets_wer.csv'
eval_wer(transcripts=transcripts, asr_result_path=jets_asr_path, wer_path=jets_wer_path)

In [11]:
nopitch_wer_path = asr_dir / 'nopitch_wer.csv'
eval_wer(transcripts=transcripts, asr_result_path=nopitch_asr_path, wer_path=nopitch_wer_path)

In [None]:
baseline_wer_path = asr_dir / 'baseline_wer.csv'
eval_wer(transcripts=transcripts, asr_result_path=baseline_asr_path, wer_path=baseline_wer_path)

In [12]:
gt_wer_path = asr_dir / 'gt_wer.csv'
eval_wer(transcripts=transcripts, asr_result_path=gt_asr_path, wer_path=gt_wer_path)

In [6]:
def eval_cer(transcripts, asr_result_path, cer_path):
    from prosody.en_to_zh import hans_to_pinyin
    with open(cer_path, 'w') as cer_file:
        cer_file.write('wav_file,gt_len,cer,eng_chars\n')
        with open(asr_result_path) as f:
            for line in f:
                wav_file, asr_output = line.strip('\n').split(' ', maxsplit=1)
                asr_output = list(asr_output)
                eng_chars = sum([word.isascii() for word in asr_output])
                transcript = transcripts[wav_file]
                trans_pinyin = ''.join(hans_to_pinyin(transcript))
                gt_len = len(trans_pinyin)
                asr_pinyin = ''.join(hans_to_pinyin(asr_output)).lower()
                cer = jiwer.cer(truth=trans_pinyin, hypothesis=asr_pinyin)
                cer_file.write(f'{wav_file},{gt_len},{cer},{eng_chars}\n')

In [6]:
jets_asr_path = asr_dir / 'jets_result.txt'
jets_cer_path = asr_dir / 'jets_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=jets_asr_path, cer_path=jets_cer_path)

In [9]:
baseline_asr_path = asr_dir / 'baseline_result.txt'
baseline_cer_path = asr_dir / 'baseline_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=baseline_asr_path, cer_path=baseline_cer_path)