Unfortunately, the Paraformer packages are not compatible with ESPnet, so they need a separate conda environment. Run
```
conda create -n funasr python=3.10  # installing jiwer fails on 3.11 (levenshtein dependency)
conda activate funasr
pip install jiwer==2.5  # 2.6 onwards conflicts with funasr's g2p click dependency
pip install -U modelscope==1.10 funasr==0.8.7 pywordseg torchaudio transformers==4.39.2 charset-normalizer tabulate overrides==4.1.2
```
before running this notebook.

In [1]:
from pathlib import Path
import os
import faster_whisper
from tqdm import tqdm
import jiwer
import regex as re

# Working directory of the notebook, captured with the IPython %pwd magic
# and converted to a Path so the `/` operator can be used below.
PWD = %pwd
PWD = Path(PWD)
# Parent of the notebook's working directory.
prosody_dir = PWD.parent
# Everything this notebook writes goes under ./outputs/aishell3.
outputs_dir = PWD / 'outputs' / 'aishell3'
os.makedirs(outputs_dir, exist_ok=True)
# Audio directories that are transcribed later, all under <prosody_dir>/outputs.
prosody_outdir = prosody_dir / 'outputs'
jets_dir = prosody_outdir / 'tts_train_jets_raw_phn_tacotron_g2p_en_no_space/aishell3'
nopitch_dir = prosody_outdir / 'tts_train_jets_raw_phn_tacotron_g2p_en_no_space/aishell3_nopitch'
baseline_dir = prosody_outdir / 'zm-text-tts/aishell3'
# Dataset root (data_aishell3), resolved from two levels above prosody_dir.
data_dir = (prosody_dir / '../../datasets/data_aishell3/').resolve()

In [2]:
from prosody.pinyin import hans_to_pinyins

In [2]:
# Fail early if LD_LIBRARY_PATH is unset (KeyError) or does not mention
# cublas and cudnn — presumably required by the CUDA model load below.
ld_lib_path = os.environ['LD_LIBRARY_PATH']
assert 'cublas' in ld_lib_path and 'cudnn' in ld_lib_path
# Load the "large-v2" Whisper model on the GPU in float16.
model = faster_whisper.WhisperModel("large-v2", device='cuda', compute_type='float16')
# ASR results for this model are stored in <outputs_dir>/large-v2.
asr_dir = outputs_dir / 'large-v2'
os.makedirs(asr_dir, exist_ok=True)

In [3]:
# Tokenizer object exposed by the loaded model; used to inspect the vocabulary.
tokenizer = model.hf_tokenizer

In [19]:
# Collect the token ids that must not appear in the ASR output: every token
# whose text contains an ASCII digit and every token that is exactly one
# ASCII letter. The list starts with -1, as in the original configuration.
tokenizer = model.hf_tokenizer
alpha = set('qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM')
suppress_tokens = [-1]
for i in range(tokenizer.get_vocab_size()):
    # Drop one leading space so that " a" is compared as "a".
    token = tokenizer.decode([i]).removeprefix(" ")
    # The two conditions cannot both hold (a single letter has no digit),
    # so each id is appended at most once.
    # (An alternative left disabled: match r'\P{Han}' instead of digits.)
    if re.search(r'[0-9]', token) or token in alpha:
        suppress_tokens.append(i)

# Keyword arguments passed through to model.transcribe by whisper_transcribe.
# 'temperature': 0.0 is intentionally left out (it was commented out).
whisper_kwargs = dict(
    suppress_tokens=suppress_tokens,
    condition_on_previous_text=False,
    prepend_punctuations='',
    append_punctuations='',
)

In [16]:
# Number of entries in the suppression list (this count includes the leading -1).
len(suppress_tokens)

531

In [2]:
import opencc
# OpenCC converter built from the 't2s.json' configuration.
t2s = opencc.OpenCC('t2s.json')
def normalize_chinese(hans):
    """Return `hans` with all non-Han characters removed and then converted by `t2s`.

    Args:
        hans: text to normalize (str).

    Returns:
        The Han-only text after passing through the `t2s` OpenCC converter.
    """
    han_only = re.sub(r'\P{Han}', '', hans)
    return t2s.convert(han_only)

In [8]:
# Scratch check of the scoring pipeline on one hand-written result line in the
# `filename|asr_output` format that run_asr writes.
# NOTE(review): `transcripts` is assigned in a cell that appears further down
# in this file; that cell must have been executed before this one.
line = '''SSB06930002.wav|武術始終被看作我國的國粹
'''
wav_file, asr_hans = line.strip('\n').split('|', maxsplit=1)
# Normalize the ASR text, then derive its pinyin with hans_to_pinyins.
asr_hans = normalize_chinese(asr_hans)
asr_pinyins = hans_to_pinyins(asr_hans)
# Reference Hanzi and pinyin strings for the same file.
hans, pinyins = transcripts[wav_file]
hanzi_len = len(hans)
pinyin_len = len(pinyins)

In [12]:
# Character error rate between the reference and the normalized ASR Hanzi
# of the sample line. NOTE(review): the `truth=` keyword name depends on the
# installed jiwer version — confirm it matches the environment in use.
jiwer.cer(truth=hans, hypothesis=asr_hans)

0.0

In [21]:
def whisper_transcribe(filepath, kwargs=whisper_kwargs):
    """Transcribe one audio file as Chinese and return the text as a single string.

    Args:
        filepath: audio file handed to ``model.transcribe``.
        kwargs: extra keyword arguments for ``model.transcribe``; defaults to
            the module-level ``whisper_kwargs``.

    Returns:
        The texts of all returned segments concatenated without a separator.
    """
    segments, _info = model.transcribe(filepath, language='zh', **kwargs)
    pieces = [segment.text for segment in segments]
    return ''.join(pieces)

In [22]:
# Smoke test: transcribe a single file from jets_dir with the default kwargs.
whisper_transcribe(jets_dir / 'SSB18720176.wav')

'我們下次節目再見'

In [26]:
def run_asr(filenames, audio_dir, asr_result_path, is_two_level=False):
    """Transcribe every file in `filenames` and write one result line per file.

    Args:
        filenames: iterable of wav file names.
        audio_dir: directory (Path) that holds the audio.
        asr_result_path: output text file; it is overwritten.
        is_two_level: when true, each file is looked up inside a
            sub-directory named after the first 7 characters of its name.

    Each output line has the form ``<filename>|<transcription>``.
    """
    def locate(filename):
        # Resolve the on-disk location of one wav file.
        if not is_two_level:
            return audio_dir / filename
        return audio_dir / filename[:7] / filename

    with open(asr_result_path, 'w') as f:
        for filename in tqdm(filenames):
            text = whisper_transcribe(locate(filename))
            f.write(f'{filename}|{text}\n')

In [5]:
# Reference transcript file of the test split; read by get_transcripts.
transcript_file = data_dir / 'test/content_pinyin.txt'
def get_transcripts():
    """Load the reference transcripts from the module-level `transcript_file`.

    Every line is expected to look like ``<wav_file>\\t<hans>|<pinyin>``.

    Returns:
        dict mapping wav file name to a ``(hans, pinyin_str)`` tuple, where
        all whitespace has been removed from ``pinyin_str``.
    """
    def parse(raw_line):
        # Split one record into its key and its (hans, pinyin) value.
        wav_file, transcript = raw_line.strip().split('\t', maxsplit=1)
        hans, pinyin_str = transcript.split('|', maxsplit=1)
        return wav_file, (hans, ''.join(pinyin_str.split()))

    with open(transcript_file) as f:
        return dict(parse(raw_line) for raw_line in f)

# wav file name -> (hans, pinyin_str) for every line of transcript_file.
transcripts = get_transcripts()

In [6]:
import logging
# Configure root logging at WARNING level.
logging.basicConfig(level=logging.WARNING)

In [17]:
# Dataset test recordings ("gt", presumably ground truth) and the file that
# receives their ASR results.
gt_dir = data_dir / 'test/wav'
gt_asr_path = asr_dir / 'gt_result.txt'

# ASR result file for the audio in jets_dir.
jets_asr_path = asr_dir / 'jets_result.txt'

# ASR result file for the audio in nopitch_dir.
nopitch_asr_path = asr_dir / 'nopitch_result.txt'

# ASR result file for the audio in baseline_dir.
baseline_asr_path = asr_dir / 'baseline_result.txt'

In [27]:
# Transcribe the dataset recordings; they are stored one directory level deeper,
# in a folder named after the first 7 characters of the file name.
run_asr(transcripts.keys(), gt_dir, gt_asr_path, is_two_level=True)

100%|██████████| 24773/24773 [2:58:47<00:00,  2.31it/s]  


In [28]:
# Transcribe the audio in jets_dir (flat directory layout).
run_asr(transcripts.keys(), jets_dir, jets_asr_path)

100%|██████████| 24773/24773 [3:22:05<00:00,  2.04it/s]  


In [29]:
# Transcribe the audio in nopitch_dir (flat directory layout).
run_asr(transcripts.keys(), nopitch_dir, nopitch_asr_path)

100%|██████████| 24773/24773 [3:31:30<00:00,  1.95it/s]   


In [33]:
# Transcribe the audio in baseline_dir (flat directory layout).
run_asr(transcripts.keys(), baseline_dir, baseline_asr_path)

100%|██████████| 24773/24773 [6:52:47<00:00,  1.00it/s]   


In [35]:
# Sibling of nopitch_dir whose name carries an extra '_nodur' suffix.
nodur_dir = Path(str(nopitch_dir) + '_nodur')
nodur_asr_path = asr_dir / 'nopitch_nodur_result.txt'
# Transcribe the audio in nodur_dir (flat directory layout).
run_asr(transcripts.keys(), nodur_dir, nodur_asr_path)

100%|██████████| 24773/24773 [7:02:01<00:00,  1.02s/it]   


In [36]:
def check_nonhans(asr_result_path):
    """Return the set of non-Han characters found in the ASR outputs of a result file.

    Args:
        asr_result_path: file with one ``filename|asr_output`` record per line.

    Returns:
        set of single characters that remain in any ASR output after all
        Han characters have been removed.
    """
    non_hans = set()
    with open(asr_result_path) as f:
        for record in f:
            # Only the part after the first '|' is the transcription.
            transcription = record.strip('\n').split('|', maxsplit=1)[1]
            leftovers = re.sub(r'\p{Han}', '', transcription)
            non_hans.update(leftovers)
    return non_hans

In [39]:
# Show, in sorted order, every non-Han character that occurs in the ASR
# results stored at gt_asr_path.
print(''.join(sorted(check_nonhans(gt_asr_path))))

 !"$%',-.:?BCDEFGHIJKLMNOPQRSTUVWXYabcdeghiklmnoprstuwy~¥·—…、。《》「」【】�


In [42]:
import pandas as pd
def write_nonhans(asr_result_path):
    """Export the ASR results whose transcription contains flagged characters.

    Reads ``asr_result_path`` (one ``filename|asr_output`` record per line),
    keeps the records whose ASR output contains at least one character from
    the flagged set (selected Latin letters, ``$``, ``%``, ``¥`` and the
    U+FFFD replacement character), and writes them to
    ``<asr_result_path>.csv`` with ``|`` as the separator and no index column.

    Args:
        asr_result_path: path of the ASR result file.

    Returns:
        pandas.DataFrame with the columns ``filename`` and ``asr_output``,
        one row per flagged record.
    """
    filenames, asr_outputs = [], []
    with open(asr_result_path) as f:
        for line in f:
            filename, asr_output = line.strip('\n').split('|', maxsplit=1)
            # Test the transcription only: the file name itself contains
            # letters from the flagged set, so matching the whole line would
            # select every record.
            if re.search(r'[$%BCDEFGHIJKLMNOPQRSTUVWXYabcdeghiklmnoprstuwy¥�]', asr_output):
                # Append the current name (not the list itself).
                filenames.append(filename)
                asr_outputs.append(asr_output)
    df = pd.DataFrame({'filename': filenames, 'asr_output': asr_outputs})
    df.to_csv(str(asr_result_path) + '.csv', sep='|', index=False)
    return df
 
# Export the flagged records of gt_asr_path to '<gt_asr_path>.csv' and display them.
write_nonhans(gt_asr_path)

KeyboardInterrupt: 

In [33]:
def eval_cer(transcripts, asr_result_path, cer_path):
    """Score an ASR result file against the reference transcripts.

    For every ``filename|asr_output`` record in ``asr_result_path`` the ASR
    text is normalized with ``normalize_chinese`` and converted to pinyin
    with ``hans_to_pinyins``; character error rates are then computed for
    both the Hanzi and the pinyin strings.

    Args:
        transcripts: dict mapping wav file name to ``(hans, pinyins)``.
        asr_result_path: ASR result file to score.
        cer_path: CSV file to write (overwritten). Columns:
            ``wav_file,hanzi_len,hanzi_cer,pinyin_len,pinyin_cer``, where the
            lengths are those of the reference strings.

    Raises:
        KeyError: if a file named in the results is missing from ``transcripts``.
    """
    with open(cer_path, 'w') as cer_file:
        cer_file.write('wav_file,hanzi_len,hanzi_cer,pinyin_len,pinyin_cer\n')
        with open(asr_result_path) as f:
            for line in f:
                wav_file, asr_hans = line.strip('\n').split('|', maxsplit=1)
                asr_hans = normalize_chinese(asr_hans)
                asr_pinyins = ''.join(hans_to_pinyins(asr_hans))
                hans, pinyins = transcripts[wav_file]
                # Remove any whitespace left in the reference pinyin.
                pinyins = re.sub(r'\s', '', pinyins)
                hanzi_len = len(hans)
                pinyin_len = len(pinyins)
                # Pass reference and hypothesis positionally: the name of the
                # first parameter differs between jiwer releases
                # (`truth` vs. `reference`), the position does not.
                hanzi_cer = jiwer.cer(hans, asr_hans)
                pinyin_cer = jiwer.cer(pinyins, asr_pinyins)
                cer_file.write(f'{wav_file},{hanzi_len},{hanzi_cer},{pinyin_len},{pinyin_cer}\n')
                

In [35]:
# Score the ASR results stored at jets_asr_path and write the per-file CERs.
jets_cer_path = asr_dir / 'jets_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=jets_asr_path, cer_path=jets_cer_path)

In [11]:
# Score the ASR results stored at nopitch_asr_path and write the per-file CERs.
nopitch_cer_path = asr_dir / 'nopitch_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=nopitch_asr_path, cer_path=nopitch_cer_path)

In [None]:
# Score the ASR results stored at baseline_asr_path and write the per-file CERs.
baseline_cer_path = asr_dir / 'baseline_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=baseline_asr_path, cer_path=baseline_cer_path)

In [34]:
# Score the ASR results stored at gt_asr_path and write the per-file CERs.
gt_cer_path = asr_dir / 'gt_cer.csv'
eval_cer(transcripts=transcripts, asr_result_path=gt_asr_path, cer_path=gt_cer_path)