It's better to have Whisper in a separate environment from ESPnet. Also, we use `faster-whisper` as it is not only faster but more stable than the original.
```shell
# Create and activate a dedicated Python 3.10 environment named "whisper".
conda create -n whisper python=3.10
conda activate whisper
# Install faster-whisper into that environment.
pip install faster-whisper
```
Run the commands above once before running this notebook.

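You also need to set the `LD_LIBRARY_PATH` environment variable so that the cuBLAS and cuDNN libraries can be found. The commands below write a conda activation script that exports it every time the `whisper` environment is activated (the `nvidia.cublas`, `nvidia.cudnn` and `torch` Python packages must be importable in that environment):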

```shell
conda activate whisper
# Location of the currently active environment (the row marked with '*' in `conda env list`).
env_loc=$(conda env list | grep '*' | awk '{print $3}' | tr -d '\n')
activator="${env_loc}/etc/conda/activate.d/cuda_activate.sh"
# The activate.d directory may not exist yet in a new environment; create it so the redirect below cannot fail.
mkdir -p "$(dirname "${activator}")"
# Write a one-line script exporting the cuBLAS, cuDNN and torch library directories.
echo export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; import torch; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__) + ":" + os.path.dirname(torch.__file__) +"/lib")'` > "${activator}"
chmod +x "${activator}"
# Re-activate so the new script takes effect in the current shell.
conda deactivate
conda activate whisper
```

In [1]:
from pathlib import Path
import os
import faster_whisper
from tqdm import tqdm
import jiwer
import re

# Every location below is derived from the notebook's working directory.
# Path.cwd() gives the same directory as the `%pwd` magic but is plain Python,
# so this cell also runs when the notebook is exported to a script.
PWD = Path.cwd()
prosody_dir = PWD.parent
outputs_dir = PWD / 'outputs'
# ASR transcriptions and CER tables for the Spanish CSS10 data are written here.
asr_dir = outputs_dir / 'CSS10' / 'spanish'
# parents=True creates outputs_dir as well; exist_ok=True keeps the cell re-runnable.
asr_dir.mkdir(parents=True, exist_ok=True)
# Audio directory later passed to run_asr for the "jets" results.
jets_dir = prosody_dir / 'outputs' / 'tts_train_jets_raw_phn_tacotron_g2p_en_no_space/CSS10/spanish'
# CSS10 Spanish dataset directory (contains transcript.txt).
data_dir = (prosody_dir / '../../datasets/CSS10/spanish/').resolve()

In [2]:
import os

# Fail early, with an actionable message, if the activation script from the
# setup instructions has not put the cuBLAS and cuDNN directories on
# LD_LIBRARY_PATH. .get() avoids a bare KeyError when the variable is unset.
ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')
missing_libs = [lib for lib in ('cublas', 'cudnn') if lib not in ld_lib_path]
assert not missing_libs, (
    f'LD_LIBRARY_PATH is missing {missing_libs}; current value: {ld_lib_path!r}. '
    'Run the LD_LIBRARY_PATH setup from the instructions above and restart the kernel.'
)

In [55]:
# Show the model names faster-whisper reports as available (for reference; the result is not stored).
faster_whisper.available_models()

['tiny.en',
 'tiny',
 'base.en',
 'base',
 'small.en',
 'small',
 'medium.en',
 'medium',
 'large-v1',
 'large-v2',
 'large-v3',
 'large',
 'distil-large-v2',
 'distil-medium.en',
 'distil-small.en',
 'distil-large-v3']

In [3]:
# Whisper checkpoint to load. The name doubles as the results sub-directory so
# that runs with different checkpoints do not overwrite each other.
model_name = 'large-v2'
model = faster_whisper.WhisperModel(model_name, device='cuda', compute_type='float16')
# Build the directory from outputs_dir instead of appending to asr_dir itself:
# appending is not idempotent, so re-running this cell would nest
# 'large-v2/large-v2/...'.
asr_dir = outputs_dir / 'CSS10' / 'spanish' / model_name
os.makedirs(asr_dir, exist_ok=True)

In [4]:
# Optional: token ids that stop Whisper from emitting numeric characters, for
# use as the 'suppress_tokens' entry of whisper_kwargs. It is not enabled by
# default because it was observed to slow ASR by about 20x; digits are spelled
# out with num2words after decoding instead.
tokenizer = model.hf_tokenizer
# The leading -1 is kept from the original list.
# NOTE(review): in faster-whisper -1 presumably selects the model's default suppression set — confirm.
suppress_tokens = [-1]
for token_id in range(tokenizer.get_vocab_size()):
    token_text = tokenizer.decode([token_id]).removeprefix(" ")
    # all() is vacuously true for an empty string, so without the explicit
    # non-empty check every token that decodes to "" or " " would be suppressed too.
    # NOTE(review): that likely includes special tokens such as end-of-text and
    # may explain the slowdown — confirm.
    if token_text and all(c in "0123456789" for c in token_text):
        suppress_tokens.append(token_id)

In [4]:
# Collect every distinct lower-cased character in the third '|'-separated field
# of transcript.txt, to see which characters the normalizer has to handle.
spanish_letters = set()
# Explicit encoding: the transcripts contain accented characters, so relying on
# the locale's default encoding is fragile. (Assumes the file is UTF-8.)
with open(data_dir / 'transcript.txt', encoding='utf-8') as f:
    for line in f:
        spanish_letters.update(line.split('|')[2].lower())

In [5]:
# Display the collected characters as one sorted string.
''.join(sorted(spanish_letters))

" !',-.:;?abcdefghijklmnopqrstuvwxyz¡¿áèéëíñóúü"

In [5]:
def normalize_spanish(text):
    """Reduce Spanish text to lower-case letters separated by single spaces.

    The listed punctuation marks are turned into spaces, every remaining
    character outside ``a-z``, ``á é í ñ ó ú ü`` and the space is deleted, and
    whitespace is collapsed. Other accented letters (e.g. ``è``, ``ë``) are
    deleted rather than mapped to a base letter.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string; empty if nothing survives.
    """
    lowered = text.lower()
    # Punctuation becomes a space so the words on either side stay separate.
    spaced = re.sub(r"[¡¿…«»“”?!',.:;–-]", ' ', lowered)
    # Anything else that is not an accepted letter or a space is dropped outright.
    letters_only = re.sub(r'[^abcdefghijklmnopqrstuvwxyzáéíñóúü ]', '', spaced)
    # split() with no argument ignores leading/trailing blanks and collapses runs.
    return ' '.join(letters_only.split())

In [4]:
# Build the normalized transcript file once; later runs reuse it.
transcript_file = data_dir / 'transcript_normalized.txt'
if not transcript_file.exists():
    # Explicit UTF-8 on both files: the text contains accented characters, so
    # the locale's default encoding must not be relied on.
    with open(data_dir / 'transcript.txt', encoding='utf-8') as f:
        with open(transcript_file, 'w', encoding='utf-8') as norm_f:
            for line in f:
                # Four '|'-separated fields per line; only the first (file
                # name) and third (transcript) are used.
                filename, _, transcript, _ = line.split('|')
                if transcript:
                    transcript = normalize_spanish(transcript)
                    norm_f.write(f'{filename}|{transcript}\n')

def get_transcripts(path=None):
    """Load normalized transcripts into a dict keyed by file name.

    Each non-blank line has the form ``<filename>|<transcript>``; only the
    first ``|`` separates the two fields.

    Args:
        path: file to read. Defaults to the module-level ``transcript_file``.

    Returns:
        dict mapping file name to transcript (possibly an empty string).

    Raises:
        ValueError: if a non-blank line contains no ``|``.
    """
    if path is None:
        path = transcript_file
    transcripts = {}
    # Explicit encoding: the transcripts contain accented characters.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing one) instead of failing to unpack.
                continue
            filename, transcript = line.split('|', maxsplit=1)
            transcripts[filename] = transcript
    return transcripts

# Reference transcripts keyed by file name, read from transcript_file.
transcripts = get_transcripts()

In [8]:
# File names in the order they appear in the transcript file (dict iteration order).
filenames = list(transcripts)

In [7]:
# Extra keyword arguments that whisper_transcribe forwards to model.transcribe().
# The commented-out entries are options that were tried and are currently disabled.
whisper_kwargs = {
    # 'suppress_tokens': suppress_tokens,
    # 'temperature': 0.0,
    # 'condition_on_previous_text': False,
    # NOTE(review): presumably these replace faster-whisper's default
    # punctuation sets with empty ones — confirm against the faster-whisper docs.
    'prepend_punctuations': '',
    'append_punctuations': '',
}

In [9]:
from num2words import num2words
import numpy as np
def whisper_transcribe(filepath, kwargs=whisper_kwargs):
    """Transcribe one Spanish audio file and spell out any digits as words.

    If the first pass yields no text, the audio is decoded once and
    re-transcribed with a growing run of zeros prepended, up to nine times.

    Args:
        filepath: audio file to transcribe; passed to ``model.transcribe`` and,
            when a retry is needed, to ``faster_whisper.audio.decode_audio``.
        kwargs: extra keyword arguments for ``model.transcribe``.

    Returns:
        The segment texts joined with single spaces, with every run of digits
        replaced by its Spanish spelling (``num2words(..., lang='es')``). May
        be an empty string if every attempt produced nothing.
    """
    max_retries = 9  # same number of retries as before
    pad_step = 100   # zeros prepended per retry: 100, 200, ..., 900

    def transcribe(audio):
        segments, _ = model.transcribe(audio, language='es', **kwargs)
        return ' '.join(segment.text for segment in segments)

    text = transcribe(filepath)
    if not text:
        # Whisper sometimes randomly fails to produce anything; shifting the
        # audio slightly can be enough to get an output. Decode once, outside
        # the loop, rather than on every retry.
        audio = faster_whisper.audio.decode_audio(filepath)
        for attempt in range(1, max_retries + 1):
            audio_pad = np.pad(audio, (attempt * pad_step, 0), mode='constant', constant_values=0)
            text = transcribe(audio_pad)
            if text:
                break
    # Replace each run of digits in place; everything else is left untouched.
    return re.sub(r'\d+', lambda match: num2words(match.group(), lang='es'), text)

In [10]:
def run_asr(filenames, audio_dir, asr_result_path, kwargs=whisper_kwargs):
    """Transcribe a list of audio files and write the normalized results to a file.

    One line per file is written as ``<filename>|<normalized transcription>``.
    The result file is opened in write mode, so an existing file is overwritten.

    Args:
        filenames: file names relative to ``audio_dir``.
        audio_dir: directory (``Path``) that contains the audio files.
        asr_result_path: output file path.
        kwargs: extra keyword arguments forwarded to ``whisper_transcribe``.
    """
    # Explicit encoding: the transcriptions contain accented characters.
    with open(asr_result_path, 'w', encoding='utf-8') as f:
        for filename in tqdm(filenames):
            wav_path = audio_dir / filename
            text = whisper_transcribe(wav_path, kwargs)
            text = normalize_spanish(text)
            f.write(f'{filename}|{text}\n')

In [8]:
# Root directory with one sub-directory of audio files per phone (read by get_phone_cer).
# NOTE(review): machine-specific absolute path; the other locations in this notebook are
# derived from prosody_dir — consider deriving this one the same way.
audio_dir = Path('/home/perry/PycharmProjects/present/prosody/outputs/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/spanish')
def get_phone_cer(phone):
    """Compute the mean character error rate over the audio files of one phone.

    Each file in ``audio_dir / phone`` is transcribed, normalized and compared
    with its reference transcript. Spaces are removed from both strings before
    the CER is computed. The reference, the transcription and the CER of every
    file are printed.

    Args:
        phone: name of the sub-directory of ``audio_dir`` to evaluate.

    Returns:
        The unweighted mean of the per-file CERs.

    Raises:
        ValueError: if the directory contains no files.
        KeyError: if a file has no entry in ``transcripts``.
    """
    print('\n', phone)
    phone_dir = audio_dir / phone
    # os.listdir returns entries in arbitrary order; sort for a reproducible log.
    fnames = sorted(os.listdir(phone_dir))
    if not fnames:
        # Clearer than the ZeroDivisionError the final average would raise.
        raise ValueError(f'no audio files found in {phone_dir}')
    total_cer = 0
    for fname in fnames:
        print(fname)
        # 'batalla_arapiles_1599.wav' -> 'batalla_arapiles'; transcripts are
        # keyed as '<book>/<file name>'.
        book = fname.rsplit('_', 1)[0]
        orig = transcripts[f'{book}/{fname}']
        trans = normalize_spanish(whisper_transcribe(phone_dir / fname))
        print(orig)
        print(trans)
        cer = jiwer.cer(orig.replace(' ', ''), trans.replace(' ', ''))
        print(cer)
        total_cer += cer
    return total_cer / len(fnames)

In [19]:
# Phone symbols to evaluate; get_phone_cer reads the audio_dir sub-directory named after each one.
phones = 'a aɪ aʊ b β d ð e eɪ eʊ ɛ f ɡ ɣ i j ʝ k l ʎ m n ɲ ŋ o oɪ p pː r ɾ s t tʃ θ u w x'.split()
# Mean CER per phone, in the same order as `phones`.
phone_cers = [get_phone_cer(phone) for phone in phones]


 a
bailen_3222.wav
vienen por mengíbar y anuncian que de estanoche a mañana llegarán a casa donde piensan detenerse algunos días no sólo para tomar descanso sino para que ambas familias se conozcan y traten pues son ramas que van a injertarse formando un solo árbol frondoso que eche profundas raíces en el suelo de la nación
bien informen hegivar y janusian que de esta noche a mañana llegarán a casa donde piensan detenerse algunos días no solo para tomar descanso sino para que ambas familias se conozcan y traten pues son ramas que van a inhertarse formando un solo árbol frondoso que echa profundas raíces en el suelo de la nación
0.05138339920948617
batalla_arapiles_1599.wav
oscuro olvidado y no muy bien quisto santorcaz se consuela con la masonería y en la logia de la calle de tentenecios unos cuantos perdidos españoles y franceses lo peor sin duda de ambas naciones se entretienen en exterminar al género humano volviendo al mundo patas arriba suprimiendo la aristocracia y poniendo a lo

In [12]:
# Spot-check a single phone; the per-file log is printed and the mean CER is the cell's result.
get_phone_cer('aɪ')


 aɪ
batalla_arapiles_3910.wav
erais el cid bernardo del carpio zaide abenamar celindos lanzarote del lago fernán gonzález y pedro ansúrez

1.0
batalla_arapiles_2254.wav
señora estamos perdidos no contábamos con la traición la traición dijo confusa miss fly no puede ser
señora estamos perdidos no contábamos con la traición la traición dijo confuso a mi seifel y griega no puede ser
0.11904761904761904
bailen_0246.wav
pues si no hay enemigos en bailén qué es eso de atacar a bailén
pues si no hay enemigos en bailén que se sotaque a bailén
0.14
batalla_arapiles_4347.wav
lo sé creíais rebajaros sólo ocupándoos del asunto lo cierto es que oíais todo y callabais
los he creía es rebajaros solo ocupandoos del asunto los diertoes que wea estodo vi que leabais
0.21333333333333335


0.36809523809523814

In [20]:
# Summary: one "<phone> <mean CER>" line per phone.
for symbol, mean_cer in zip(phones, phone_cers):
    print(symbol, mean_cer)

a 0.05639379908740501
aɪ 0.36809523809523814
aʊ 0.12191188234186956
b 0.10217227947491105
β 0.03601829435499667
d 0.09111984152754606
ð 0.05330444984471725
e 0.07090332747903523
eɪ 0.09761748513516548
eʊ 0.25855341732789583
ɛ 0.0278244243450429
f 0.06591258953316373
ɡ 0.068148761771887
ɣ 0.15889349787304352
i 0.04998677742295804
j 0.04841987022760434
ʝ 0.06425448374600917
k 0.04481438934212548
l 0.08391712992118013
ʎ 0.08520473716552149
m 0.015560107756201595
n 0.03965739741376484
ɲ 0.13661205203730978
ŋ 0.10872618798397105
o 0.06341291860362723
oɪ 0.1629768123557387
p 0.06303505931804534
pː 0.21163852440448183
r 0.07876761747697114
ɾ 0.06434722463728283
s 0.043380175062373696
t 0.047444519965566395
tʃ 0.12634910168164432
θ 0.29426996167492303
u 0.07374169050303499
w 0.10083341734717882
x 0.19669756727056426


In [46]:
import logging
# Set the root logger's level to WARNING.
# NOTE(review): presumably meant to silence INFO/DEBUG library output during the long ASR runs — confirm.
logging.basicConfig(level=logging.WARNING)

In [13]:
# The dataset directory is the audio directory for the ground-truth run.
gt_dir = data_dir
# Result files, one per audio source, inside the ASR output directory.
gt_asr_path = asr_dir.joinpath('gt_result.txt')
jets_asr_path = asr_dir.joinpath('jets_result.txt')

In [25]:
# Transcribe the audio under gt_dir (the dataset directory). Long-running: the recorded
# run took about 1h42m for 11016 files. run_asr opens the result file in 'w' mode, so
# re-running this cell overwrites any previous results.
run_asr(filenames, gt_dir, gt_asr_path)

100%|██████████| 11016/11016 [1:42:31<00:00,  1.79it/s] 


In [26]:
# Transcribe the audio under jets_dir. Long-running: the recorded run took about 1h44m
# for 11016 files. run_asr opens the result file in 'w' mode, so re-running this cell
# overwrites any previous results.
run_asr(filenames, jets_dir, jets_asr_path)

100%|██████████| 11016/11016 [1:44:19<00:00,  1.76it/s] 


In [14]:
def eval_cer(transcripts, asr_result_path, cer_path):
    """Write a per-file character error rate table as CSV.

    Reads ``<wav_file>|<asr output>`` lines from ``asr_result_path``, compares
    each ASR output with the matching reference transcript (spaces removed
    from both) and writes ``wav_file,gt_len,cer`` rows to ``cer_path``.

    Args:
        transcripts: dict mapping wav file name to reference transcript.
        asr_result_path: file written by ``run_asr``.
        cer_path: output CSV path; overwritten if it exists.

    Raises:
        KeyError: if an ASR result refers to a file not in ``transcripts``.
    """
    # Open the input first, so a missing result file does not leave behind a
    # header-only CSV. Explicit encoding because the text has accented characters.
    with open(asr_result_path, encoding='utf-8') as f, \
            open(cer_path, 'w', encoding='utf-8') as cer_file:
        cer_file.write('wav_file,gt_len,cer\n')
        for line in f:
            wav_file, asr_output = line.strip('\n').split('|', maxsplit=1)
            transcript = transcripts[wav_file]
            transcript_nospace = transcript.replace(' ', '')
            asr_nospace = asr_output.replace(' ', '')
            # NOTE(review): gt_len counts spaces although the CER is computed
            # on the space-free strings — confirm this is what consumers of
            # the CSV expect.
            gt_len = len(transcript)
            # Positional arguments (reference, hypothesis), as in
            # get_phone_cer: the `truth=` keyword is deprecated in jiwer.
            cer = jiwer.cer(transcript_nospace, asr_nospace)
            cer_file.write(f'{wav_file},{gt_len},{cer}\n')

In [28]:
# Per-file CER table for the ground-truth transcriptions.
gt_cer_path = asr_dir.joinpath('gt_cer.csv')
eval_cer(transcripts, gt_asr_path, gt_cer_path)

In [15]:
# Per-file CER table for the JETS transcriptions.
jets_cer_path = asr_dir.joinpath('jets_cer.csv')
eval_cer(transcripts, jets_asr_path, jets_cer_path)