In [None]:
import pandas as pd
import numpy as np

### Loading Data

In [None]:
from datasets import load_dataset, Audio
# Load both ATC corpora from the Hugging Face Hub and cast their "audio" column
# to a 16 kHz sampling rate so every clip is decoded at the same rate.
atco2, atcosim = (
    load_dataset(repo_id).cast_column("audio", Audio(sampling_rate=16000))
    for repo_id in ('jlvdoorn/atco2', 'jlvdoorn/atcosim')
)

In [None]:
# Parallel lists: audio[k] is the waveform whose reference transcript is text[k].
# Samples are interleaved ATCO2, ATCOSIM, ATCO2, ... for the first five rows of each corpus.
audio = []
text = []
# NOTE: `info` is only collected for ATCO2 rows, so it has one entry per loop
# iteration and is NOT index-aligned with `audio` / `text`.
info = []

for j in range(5):
    # Fetch each row exactly once: every `dataset[j]` access re-reads the row and
    # decodes/resamples its audio, so repeated indexing repeats that work.
    atco2_row = atco2['train'][j]
    audio.append(atco2_row['audio']['array'])
    text.append(atco2_row['text'])
    info.append(atco2_row['info'].replace('\n', ' '))  # flatten multi-line info to one line

    atcosim_row = atcosim['train'][j]
    audio.append(atcosim_row['audio']['array'])
    text.append(atcosim_row['text'])

### Transcribing Audio

In [None]:
import whisper
# Load the Whisper 'large-v2' checkpoint; the transcription loop below decodes with it.
model = whisper.load_model(name='large-v2')

In [None]:
# Empty results table: one row per clip, holding the model hypothesis ('hyp')
# and the reference transcript ('ref').
df = pd.DataFrame(columns=['hyp', 'ref'], dtype=object)

In [None]:
# Decoding settings are the same for every clip, so build them once outside the loop.
options = whisper.DecodingOptions(language='en', fp16=False)

for i, (clip, ref) in enumerate(zip(audio, text)):
    # Pad/trim the waveform to Whisper's fixed input length, then compute the
    # log-Mel spectrogram on the same device as the model.
    aud = whisper.pad_or_trim(clip)
    mel = whisper.log_mel_spectrogram(np.float32(aud)).to(model.device)
    # whisper.decode returns a DecodingResult object; store only its transcript
    # string so the normalizer and jiwer receive plain text rather than an object.
    hyp = whisper.decode(model, mel, options=options).text
    df.loc[i] = [hyp, ref]

### Building the Normalizer

The normalizer file (WhisperATC/Evaluate/Normalizer.py) can be edited and reloaded in this notebook.

In [None]:
import importlib
import os
import sys

# `__file__` is not defined inside a Jupyter kernel, so referencing it raises
# NameError. Fall back to the working directory (normally the notebook's own
# folder) while still honouring `__file__` if this code is run as a script.
try:
    current = os.path.dirname(os.path.realpath(__file__))
except NameError:
    current = os.getcwd()
parent = os.path.dirname(current)

# Make WhisperATC/Evaluate importable; skip the append on re-runs so sys.path
# does not accumulate duplicate entries.
normalizer_dir = os.path.join(parent, 'WhisperATC', 'Evaluate')
if normalizer_dir not in sys.path:
    sys.path.append(normalizer_dir)

# A plain re-import returns the cached module, so edits to Normalizer.py would be
# ignored. Reload it explicitly when this cell is re-run after an edit.
if 'Normalizer' in sys.modules:
    importlib.reload(sys.modules['Normalizer'])
from Normalizer import filterAndNormalize

### Calculating WER

In [None]:
# Pass hypotheses and references through the same normalizer so both sides are
# compared in the same normalized form when scoring.
df['hyp-norm'] = df['hyp'].map(filterAndNormalize)
df['ref-norm'] = df['ref'].map(filterAndNormalize)

In [None]:
import jiwer

In [None]:
# Word error rate over all clips; jiwer takes the references first, then the hypotheses.
# `wer_cln` scores the raw transcripts, `wer_nrm` the normalized ones.
wer_cln = jiwer.wer(df['ref'].tolist(), df['hyp'].tolist())
wer_nrm = jiwer.wer(df['ref-norm'].tolist(), df['hyp-norm'].tolist())

# Report both scores as percentages rounded to four decimals.
print('clean: {} %'.format(round(100 * wer_cln, 4)))
print('norm : {} %'.format(round(100 * wer_nrm, 4)))