Unfortunately, the Paraformer (FunASR) packages are not compatible with ESPnet, so this notebook needs its own conda environment. Run
```
conda create -n funasr python=3.10  # installing jiwer fails on 3.11 (levenshtein dependency)
conda activate funasr
pip install jiwer==2.5  # 2.6 onwards conflicts with funasr's g2p click dependency
pip install -U modelscope==1.10 funasr==0.8.7 pywordseg torchaudio transformers==4.39.2 charset-normalizer tabulate overrides==4.1.2
```
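before running this notebook.

Note: one of the recorded cell outputs below reports `funasr-1.1.0`, which differs from the `funasr==0.8.7` pin above; check which version you actually need before installing.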

In [82]:
from pathlib import Path
import os
from modelscope.utils.constant import Tasks

# All paths are resolved relative to the directory the notebook is run from.
# Path.cwd() replaces the IPython-only `%pwd` magic (same value), so this cell
# is plain Python and PWD is a Path from the start instead of being rebound.
PWD = Path.cwd()
prosody_dir = PWD.parent

# ASR results and CER tables produced by this notebook.
outputs_dir = PWD / 'outputs'
os.makedirs(outputs_dir, exist_ok=True)

# Audio directories of the systems under evaluation (read-only inputs here).
prosody_outdir = prosody_dir / 'outputs'
jets_root = prosody_outdir / 'tts_train_jets_raw_phn_tacotron_g2p_en_no_space'
jets_dir = jets_root / 'aishell3'
nopitch_dir = jets_root / 'aishell3_nopitch'
nodur_dir = jets_root / 'aishell3_nopitch_nodur'
baseline_dir = prosody_outdir / 'zm-text-tts/aishell3'

# Dataset root, two levels above the prosody project directory.
data_dir = (prosody_dir / '../../datasets/data_aishell3/').resolve()

In [2]:
# Reference transcripts for the test split; parsed by get_transcripts() as
# "<wav_file>\t<hanzi>|<pinyin>" lines.
transcript_file = data_dir / 'test/content_pinyin.txt'

In [3]:
def get_transcripts(path=None):
    """Load the reference transcripts into a dict.

    Each non-empty line is expected to look like
    ``<wav_file>\\t<hanzi>|<pinyin ...>``. Whitespace inside the pinyin part is
    removed so it can be compared character by character.

    Args:
        path: transcript file to read. Defaults to the notebook-level
            ``transcript_file`` when omitted.

    Returns:
        dict mapping wav file name -> ``(hanzi, pinyin_without_whitespace)``.

    Raises:
        ValueError: if a non-empty line lacks the tab or ``|`` separator.
    """
    if path is None:
        path = transcript_file
    transcripts = {}
    # The file contains Chinese text: read it as UTF-8 explicitly rather than
    # relying on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            wav_file, transcript = line.split('\t', maxsplit=1)
            hans, pinyin_str = transcript.split('|', maxsplit=1)
            transcripts[wav_file] = (hans, ''.join(pinyin_str.split()))
    return transcripts

# wav file name -> (hanzi, whitespace-free pinyin); its keys drive every run_asr() call below.
transcripts = get_transcripts()

In [4]:
from prosody.pinyin import hans_to_pinyins

In [57]:
# Model option: Paraformer trained on AISHELL-2 (vocab 5212), fetched from ModelScope (hub='ms').
# This cell and the other model-loading cells all rebind the globals `model`, `kwargs`
# and `asr_dir`; whichever one ran last decides what the later cells use.
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
auto_model = AutoModel(
    # model='paraformer-zh',
    model='iic/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch', hub='ms',
    # vad_model="fsmn-vad", vad_model_revision="v2.0.4",
    # punc_model="ct-punc-c", punc_model_revision="v2.0.4",
    # spk_model="cam++", spk_model_revision="v2.0.2",
)
# Use the underlying model and its kwargs directly so inference can be called per file.
model, kwargs = auto_model.model, auto_model.kwargs
# NOTE(review): the model id says 16k, but the frontend sample rate is overridden to
# 22050 here — presumably to match the audio passed to run_asr(); confirm this is intended.
kwargs['frontend'].fs = 22050
# Results for this model go to outputs/aishell3/aishell2-vocab5212.
asr_dir = outputs_dir / 'aishell3'
os.makedirs(asr_dir, exist_ok=True)
asr_dir = asr_dir / 'aishell2-vocab5212'
os.makedirs(asr_dir, exist_ok=True)

2024-07-16 23:56:41,067 - modelscope - INFO - Use user-specified model revision: master


In [9]:
# Debug aid: print the encoder classes registered in FunASR's `tables` registry.
from funasr.register import tables
from pprint import pprint
pprint(tables.encoder_classes)

{'BranchformerEncoder': <class 'funasr.models.branchformer.encoder.BranchformerEncoder'>,
 'ChunkConformerEncoder': <class 'funasr.models.conformer.encoder.ConformerChunkEncoder'>,
 'ConformerEncoder': <class 'funasr.models.conformer.encoder.ConformerEncoder'>,
 'ConvBiasPredictor': <class 'funasr.models.lcbnet.encoder.ConvPredictor'>,
 'DFSMN': <class 'funasr.models.fsmn_vad_streaming.encoder.DFSMN'>,
 'EBranchformerEncoder': <class 'funasr.models.e_branchformer.encoder.EBranchformerEncoder'>,
 'FSMN': <class 'funasr.models.fsmn_vad_streaming.encoder.FSMN'>,
 'FSMNExport': <class 'funasr.models.fsmn_vad_streaming.encoder.FSMNExport'>,
 'FusionSANEncoder': <class 'funasr.models.lcbnet.encoder.SelfSrcAttention'>,
 'QwenAudioEncoder': <class 'funasr.models.qwen_audio.audio.QwenAudioEncoder'>,
 'RWKVEncoder': <class 'funasr.models.rwkv_bat.rwkv_encoder.RWKVEncoder'>,
 'SANMEncoder': <class 'funasr.models.sanm.encoder.SANMEncoder'>,
 'SANMEncoderChunkOpt': <class 'funasr.models.scama.encod

In [15]:
%load_ext autoreload
%autoreload 2

In [84]:
# Model option: Paraformer-large (common vocab 8404).
# Like the other model-loading cells, this rebinds the globals `model`, `kwargs`
# and `asr_dir`; whichever one ran last decides what the later cells use.
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
auto_model = AutoModel(
    # model='paraformer-zh',
    model='iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    # model_conf='/home/perry/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/configuration.json'
    # vad_model="fsmn-vad", vad_model_revision="v2.0.4",
    # punc_model="ct-punc-c", punc_model_revision="v2.0.4",
    # spk_model="cam++", spk_model_revision="v2.0.2",
)
# Use the underlying model and its kwargs directly so inference can be called per file.
model, kwargs = auto_model.model, auto_model.kwargs
# NOTE(review): the model id says 16k, but the frontend sample rate is overridden to
# 22050 here — presumably to match the audio passed to run_asr(); confirm this is intended.
kwargs['frontend'].fs = 22050

# Results for this model go to outputs/aishell3/paraformer-large-zh.
asr_dir = outputs_dir / 'aishell3'
os.makedirs(asr_dir, exist_ok=True)
asr_dir = asr_dir / 'paraformer-large-zh'
os.makedirs(asr_dir, exist_ok=True)

2024-07-17 17:19:11,350 - modelscope - INFO - Use user-specified model revision: master


In [11]:
# Model option: SeACo-Paraformer large (common vocab 8404).
# Like the other model-loading cells, this rebinds the globals `model`, `kwargs`
# and `asr_dir`; whichever one ran last decides what the later cells use.
from funasr import AutoModel
# paraformer-zh is a multi-functional asr model
# use vad, punc, spk or not as you need
auto_model = AutoModel(
    model='iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    # model='iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', hub='ms',
    # vad_model="fsmn-vad", vad_model_revision="v2.0.4",
    # punc_model="ct-punc-c", punc_model_revision="v2.0.4",
    # spk_model="cam++", spk_model_revision="v2.0.2",
)
# Use the underlying model and its kwargs directly so inference can be called per file.
model, kwargs = auto_model.model, auto_model.kwargs
# NOTE(review): the model id says 16k, but the frontend sample rate is overridden to
# 22050 here — presumably to match the audio passed to run_asr(); confirm this is intended.
kwargs['frontend'].fs = 22050

# Results for this model go to outputs/aishell3/paraformer-zh
# (note: the directory name does not mention "seaco" or "large").
asr_dir = outputs_dir / 'aishell3'
os.makedirs(asr_dir, exist_ok=True)
asr_dir = asr_dir / 'paraformer-zh'
os.makedirs(asr_dir, exist_ok=True)

You are using the latest version of funasr-1.1.0


2024-07-16 07:35:53,739 - modelscope - INFO - Use user-specified model revision: master


In [11]:
# Sanity check: show, as one string, the vocabulary tokens selected by kwargs['suppress_idx'].
# NOTE(review): kwargs['suppress_idx'] is only assigned by a cell further down in this
# notebook, so on a fresh top-to-bottom run this lookup presumably raises KeyError.
import numpy as np
tokens = np.array(kwargs['token_list'])
''.join(tokens[kwargs['suppress_idx']])

"amcteiopsdfnlvhbrgwkuyjxqz''''ＡＴＭ"

In [111]:
# Where each system's ASR transcripts are written. All paths are derived from the
# current value of `asr_dir`, so this cell must be re-run whenever `asr_dir` changes.
gt_dir = data_dir / 'test/wav'
# gt_paths = sorted(gt_dir.glob('*/*.wav'))
gt_asr_path = asr_dir / 'gt_result.txt'

# jets_paths = sorted(jets_dir.glob('*.wav'))
jets_asr_path = asr_dir / 'jets_result.txt'

# nopitch_paths = sorted(nopitch_dir.glob('*.wav'))
nopitch_asr_path = asr_dir / 'nopitch_result.txt'

# baseline_paths = sorted(baseline_dir.glob('*.wav'))
baseline_asr_path = asr_dir / 'baseline_result.txt'

nodur_asr_path = asr_dir / 'nodur_result.txt'

In [86]:
# Use the third-party `regex` module under its own name: the Unicode script class
# \P{Han} is not supported by the stdlib `re`, and a later cell does `import re`,
# which would otherwise silently rebind the alias and break this cell on re-run.
import regex

# Indices of every vocabulary token that contains at least one non-Han character,
# except special tokens written as "<...>". Stored in kwargs so that it is forwarded
# to model.inference(**kwargs).
suppress_idx = [
    i
    for i, token in enumerate(kwargs['tokenizer'].token_list)
    if regex.search(r'\P{Han}', token) and not token.startswith('<')
]
kwargs['suppress_idx'] = suppress_idx

In [51]:
# Smoke test: decode a single wav from the `mandarin` output directory and print
# the raw inference result before launching the long batch runs.
model.eval()
kwargs['fs'] = 22050
mandarin_dir = jets_dir.parent / 'mandarin'
sample_wav = mandarin_dir / '四十四岁的徐女士报警称.wav'
result, _ = model.inference(
    data_in=str(sample_wav),
    key=['SSB06930030.wav'],
    **kwargs,
)
print(result)

[{'key': 'SSB06930030.wav', 'text': '说声说说二词喂对谁去女女说吧谁你唱'}]


In [38]:
# Remove the token-suppression list from the inference kwargs.
# pop(..., None) instead of `del` so the cell is idempotent: re-running it, or running
# it when the list was never set, no longer raises KeyError.
# NOTE(review): in top-to-bottom order this cell runs after the cell that sets
# kwargs['suppress_idx'] and before the run_asr() calls, i.e. it disables suppression
# for them — confirm that is what you want before a Restart & Run All.
kwargs.pop('suppress_idx', None)

In [45]:
# Request WARNING-level root logging (presumably to quiet INFO messages during the long ASR loops).
# NOTE(review): logging.basicConfig() does nothing if the root logger already has
# handlers; pass force=True if this call appears to have no effect.
import logging
logging.basicConfig(level=logging.WARNING)

In [87]:
from tqdm import tqdm
def run_asr(filenames, audio_dir, asr_result_path, is_two_level=False, audio_fs=22050):
    """Transcribe a set of wav files with the currently loaded model.

    One ``<filename>|<recognized text>`` line is written per input file. Files
    whose inference raises ``ValueError`` are reported and written with an
    empty transcript, so the output always has one line per input.

    Args:
        filenames: iterable of wav file names (also used as the inference key).
        audio_dir: directory containing the audio.
        asr_result_path: text file to (over)write with the results.
        is_two_level: if True, each file is looked up under a sub-directory
            named after the first 7 characters of its file name
            (``audio_dir/<filename[:7]>/<filename>``) instead of directly in
            ``audio_dir``.
        audio_fs: sample rate of the input audio; stored in the global
            ``kwargs['fs']`` before inference.

    Side effects:
        Switches the global ``model`` to eval mode and updates ``kwargs['fs']``.
    """
    # Local import: torch is only needed for the no_grad() context below.
    import torch

    model.eval()
    kwargs['fs'] = audio_fs
    audio_dir = Path(audio_dir)
    # Transcripts are Chinese text: write UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open(asr_result_path, 'w', encoding='utf-8') as f:
        for filename in tqdm(filenames):
            if is_two_level:
                data_in = audio_dir / filename[:7] / filename
            else:
                data_in = audio_dir / filename
            try:
                # model.inference() is called directly, so disable autograd here;
                # this only avoids building graphs and does not change the outputs.
                with torch.no_grad():
                    result, _ = model.inference(data_in=str(data_in), key=[filename], **kwargs)
                text = result[0]['text']
            except ValueError as err:
                # Best effort: keep going, but say why the file failed.
                print('Error during inference for:', filename, f'({err})')
                text = ''
            f.write(f'{filename}|{text}\n')

In [65]:
gt_asr_path

PosixPath('/home/perry/PycharmProjects/present/prosody/funasr/outputs/aishell3/aishell2-vocab5212/gt_result.txt')

In [88]:
# Ground-truth recordings: looked up as gt_dir/<first 7 chars of filename>/<filename>
# and declared as 44.1 kHz audio.
run_asr(transcripts.keys(), gt_dir, gt_asr_path, is_two_level=True, audio_fs=44100)

100%|██████████| 24773/24773 [33:14<00:00, 12.42it/s]


In [89]:
# Transcribe the audio in nopitch_dir, baseline_dir and nodur_dir with the default audio_fs.
run_asr(transcripts.keys(), nopitch_dir, nopitch_asr_path)
run_asr(transcripts.keys(), baseline_dir, baseline_asr_path)
# Re-assigns nodur_dir to the same path the setup cell already defines.
nodur_dir = prosody_outdir / 'tts_train_jets_raw_phn_tacotron_g2p_en_no_space/aishell3_nopitch_nodur'
run_asr(transcripts.keys(), nodur_dir, nodur_asr_path)

100%|██████████| 24773/24773 [31:23<00:00, 13.15it/s]
100%|██████████| 24773/24773 [36:31<00:00, 11.30it/s]
100%|██████████| 24773/24773 [37:19<00:00, 11.06it/s]


In [61]:
# Transcribe the audio in jets_dir with the default audio_fs.
run_asr(transcripts.keys(), jets_dir, jets_asr_path)

 55%|█████▌    | 13717/24773 [14:52<09:39, 19.07it/s]

Error during inference for: SSB00090221.wav


 58%|█████▊    | 14281/24773 [15:20<08:58, 19.47it/s]

Error during inference for: SSB03950012.wav


100%|██████████| 24773/24773 [26:34<00:00, 15.53it/s]


In [71]:
# NOTE(review): same call as in the earlier batch cell; on a top-to-bottom run this
# repeats the full decode and overwrites nopitch_asr_path.
run_asr(transcripts.keys(), nopitch_dir, nopitch_asr_path)

 55%|█████▌    | 13716/24773 [15:15<09:31, 19.35it/s]

Error during inference for: SSB00090179.wav
Error during inference for: SSB00090221.wav


 58%|█████▊    | 14281/24773 [15:44<08:46, 19.94it/s]

Error during inference for: SSB03950012.wav


100%|██████████| 24773/24773 [27:08<00:00, 15.21it/s]


In [72]:
# Re-assigns baseline_dir to the same path the setup cell already defines.
baseline_dir = prosody_outdir / 'zm-text-tts' / 'aishell3'
# NOTE(review): same call as in the earlier batch cell; on a top-to-bottom run this
# repeats the full decode and overwrites baseline_asr_path.
run_asr(transcripts.keys(), baseline_dir, baseline_asr_path)

 43%|████▎     | 10716/24773 [14:26<18:13, 12.85it/s]

Error during inference for: SSB13820236.wav


 52%|█████▏    | 12801/24773 [17:16<14:09, 14.10it/s]

Error during inference for: SSB18100419.wav


 55%|█████▌    | 13715/24773 [18:33<11:15, 16.38it/s]

Error during inference for: SSB00090179.wav


 60%|█████▉    | 14754/24773 [19:35<10:12, 16.36it/s]

Error during inference for: SSB06860272.wav


 88%|████████▊ | 21787/24773 [28:51<04:08, 11.99it/s]

Error during inference for: SSB14370165.wav


100%|██████████| 24773/24773 [32:36<00:00, 12.66it/s]


In [81]:
nodur_dir

PosixPath('/home/perry/PycharmProjects/present/prosody/outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/aishell3_nopitchnodur')

In [83]:
# NOTE(review): same call as in the earlier batch cell; on a top-to-bottom run this
# repeats the full decode and overwrites nodur_asr_path.
run_asr(transcripts.keys(), nodur_dir, nodur_asr_path)

100%|██████████| 24773/24773 [34:13<00:00, 12.07it/s]


In [90]:
import re
import jiwer
def eval_cer(transcripts, asr_result_path, cer_path):
    """Score ASR output against the references and write per-file CER as CSV.

    For every ``<wav_file>|<recognized hanzi>`` line in ``asr_result_path`` the
    character error rate is computed twice: on the hanzi strings, and on the
    pinyin strings (the recognized hanzi are converted with
    ``hans_to_pinyins``). One CSV row is written per utterance with the columns
    ``wav_file,hanzi_len,hanzi_cer,pinyin_len,pinyin_cer``; the ``*_len``
    columns are the reference lengths.

    Args:
        transcripts: mapping wav file name -> ``(hanzi, pinyin)`` reference.
        asr_result_path: result file with one ``<wav_file>|<text>`` line per utterance.
        cer_path: CSV file to (over)write.

    Raises:
        KeyError: if a wav file in the results has no reference transcript.
        ValueError: if a non-empty result line has no ``|`` separator.
    """
    # Both files carry Chinese text / file names: use UTF-8 explicitly instead of
    # the platform's default encoding.
    with open(asr_result_path, encoding='utf-8') as asr_file, \
            open(cer_path, 'w', encoding='utf-8') as cer_file:
        cer_file.write('wav_file,hanzi_len,hanzi_cer,pinyin_len,pinyin_cer\n')
        for line in asr_file:
            line = line.rstrip('\n')
            if not line:
                # Tolerate blank lines in the result file.
                continue
            wav_file, asr_hans = line.split('|', maxsplit=1)
            hans, pinyins = transcripts[wav_file]
            # Whitespace removal via str.split() rather than `re`: in this notebook
            # the name `re` may be bound to either `regex` or the stdlib module,
            # depending on which cell ran last.
            pinyins = ''.join(pinyins.split())
            hanzi_len = len(hans)
            pinyin_len = len(pinyins)
            if asr_hans:
                asr_pinyins = ''.join(hans_to_pinyins(asr_hans))
                hanzi_cer = jiwer.cer(reference=hans, hypothesis=asr_hans)
                pinyin_cer = jiwer.cer(reference=pinyins, hypothesis=asr_pinyins)
            else:
                # An empty transcript is written when inference failed: every
                # reference character counts as a deletion, so CER is 1.0. This
                # also avoids sending an empty string through the converters.
                hanzi_cer = pinyin_cer = 1.0
            cer_file.write(f'{wav_file},{hanzi_len},{hanzi_cer},{pinyin_len},{pinyin_cer}\n')
                

In [25]:
# Leftover debugging call: `hans` is only ever assigned as a local inside functions in
# the cells shown here, and the recorded output is an IndexError.
# NOTE(review): consider removing this cell, or replacing `hans` with a concrete example string.
hans_to_pinyins(hans)

IndexError: list index out of range

In [110]:
# Point asr_dir at the 'paraformer-large-zh' sibling directory.
# NOTE(review): the *_asr_path variables were computed from the previous asr_dir and are
# not updated by this assignment; re-run the cell that defines them, otherwise the CER
# files below are written next to a different model's results.
asr_dir = asr_dir.parent / 'paraformer-large-zh'

In [63]:
# Per-utterance CER for the transcripts in jets_asr_path, written as CSV under asr_dir.
jets_cer_path = asr_dir / 'jets_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=jets_asr_path, cer_path=jets_cer_path)

In [112]:
# Per-utterance CER for the transcripts in nopitch_asr_path, written as CSV under asr_dir.
nopitch_cer_path = asr_dir / 'nopitch_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=nopitch_asr_path, cer_path=nopitch_cer_path)

In [113]:
# Per-utterance CER for the transcripts in baseline_asr_path, written as CSV under asr_dir.
baseline_cer_path = asr_dir / 'baseline_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=baseline_asr_path, cer_path=baseline_cer_path)

PosixPath('/home/perry/PycharmProjects/present/prosody/funasr/outputs/aishell3/aishell2-vocab5212/gt_cer.csv')

In [114]:
# Per-utterance CER for the transcripts in gt_asr_path, written as CSV under asr_dir.
gt_cer_path = asr_dir / 'gt_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=gt_asr_path, cer_path=gt_cer_path)

In [115]:
# Per-utterance CER for the transcripts in nodur_asr_path, written as CSV under asr_dir.
nodur_cer_path = asr_dir / 'nodur_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=nodur_asr_path, cer_path=nodur_cer_path)