In [None]:
import pandas as pd
import numpy as np

### Loading Data

In [None]:
from datasets import load_dataset, Audio
# Load each corpus and cast its "audio" column to Audio(sampling_rate=16000)
# in a single chained expression per dataset.
atco2 = load_dataset('jlvdoorn/atco2').cast_column("audio", Audio(sampling_rate=16000))
atcosim = load_dataset('jlvdoorn/atcosim').cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Parallel lists: index k in `audio`, `text` and `info` always refers to the
# same utterance. Samples are interleaved (atco2, atcosim, atco2, ...).
audio = []
text = []
info = []

for j in range(5):
    # Index each row once instead of once per field.
    atco2_sample = atco2['train'][j]
    audio.append(atco2_sample['audio']['array'])
    text.append(atco2_sample['text'])
    info.append(atco2_sample['info'].replace('\n', ' '))

    atcosim_sample = atcosim['train'][j]
    audio.append(atcosim_sample['audio']['array'])
    text.append(atcosim_sample['text'])
    # No 'info' field is read for atcosim; keep a placeholder so `info` stays
    # aligned with `audio`/`text` instead of drifting by one entry per pair.
    info.append(None)

### Load Model

In [None]:
import whisper
# Load the Whisper checkpoint used for every decode call in this notebook.
model = whisper.load_model(name='large-v2')

In [None]:
# Empty results table: unprompted hypothesis, prompted hypothesis, reference.
result_columns = ['hyp', 'hyp-prmpt', 'ref']
df = pd.DataFrame(columns=result_columns)

### Setup Prompts

Here, the prompt components can be edited for each run. In the second cell, the radar data is added to the prompt when it is available.

In [None]:
# Short domain description placed at the start of every prompt.
general = 'Air Traffic Control communications'

# Spelling-alphabet code words, joined with bare commas (no spaces).
nato = ','.join([
    'alpha', 'bravo', 'charlie', 'delta', 'echo', 'foxtrot', 'golf',
    'hotel', 'india', 'juliett', 'kilo', 'lima', 'mike', 'november',
    'oscar', 'papa', 'quebec', 'romeo', 'sierra', 'tango', 'uniform',
    'victor', 'whiskey', 'xray', 'yankee', 'zulu',
])

# Common phraseology, joined with comma + space.
terminology = ', '.join([
    'climb', 'climbing', 'descend', 'descending', 'passing', 'feet',
    'knots', 'degrees', 'direct', 'maintain', 'identified', 'ILS', 'VFR',
    'IFR', 'contact', 'frequency', 'turn', 'right', 'left', 'heading',
    'altitude', 'flight', 'level', 'cleared', 'squawk', 'approach',
    'runway', 'established', 'report', 'affirm', 'negative', 'wilco',
    'roger', 'radio', 'radar',
])

In [None]:
# Transcribe every collected clip twice - once without and once with a prompt -
# and store both transcripts next to the reference text in `df`.

# The unprompted options and the vocabulary part of the prompt are the same for
# every sample, so build them once outside the loop.
plain_options = whisper.DecodingOptions(language='en', fp16=False)
prompt_vocab = nato.replace(',', ' ')+' '+terminology.replace(',', ' ')

for i in range(len(audio)):
    # Pad or trim the clip to the length Whisper expects, then compute the
    # log-Mel spectrogram on the model's device.
    aud = whisper.pad_or_trim(audio[i])
    mel = whisper.log_mel_spectrogram(np.float32(aud)).to(model.device)

    # Baseline hypothesis. `whisper.decode` returns a DecodingResult; keep only
    # its transcript string so the normalizer and jiwer receive plain text.
    hyp = whisper.decode(model, mel, options=plain_options).text

    # Per-sample context is optional: it may be missing (list shorter than
    # `audio`) or explicitly None. Check for that directly rather than relying
    # on a bare `except`, which would also hide unrelated errors.
    sample_info = info[i] if i < len(info) else None
    if sample_info is None:
        prompt = general+' '+prompt_vocab
    else:
        prompt = general+' '+sample_info+' '+prompt_vocab
    options = whisper.DecodingOptions(language='en', fp16=False, prompt=prompt)
    hyp_prmpt = whisper.decode(model, mel, options=options).text

    ref = text[i]

    df.loc[i] = [hyp, hyp_prmpt, ref]

### Building the Normalizer

In [None]:
import sys
import os
# Make the sibling `Evaluate` directory importable so `Normalizer` can be loaded.
try:
    current = os.path.dirname(os.path.realpath(__file__))
except NameError:
    # `__file__` is not defined inside a notebook kernel; fall back to the
    # working directory.
    # NOTE(review): assumes the kernel was started in this notebook's folder.
    current = os.getcwd()
parent = os.path.dirname(current)
evaluate_dir = os.path.join(parent, 'Evaluate')
# Guard against piling up duplicate entries when the cell is re-run.
if evaluate_dir not in sys.path:
    sys.path.append(evaluate_dir)
from Normalizer import filterAndNormalize

### Calculate WER

In [None]:
# Normalise the unprompted hypotheses and the references before scoring.
# `Series.map` applies the normalizer element-wise to one column, which is the
# direct form of the row-wise `DataFrame.apply(..., axis=1)` it replaces.
df['hyp-norm'] = df['hyp'].map(filterAndNormalize)
df['ref-norm'] = df['ref'].map(filterAndNormalize)

In [None]:
import jiwer

In [None]:
# Word error rate of the unprompted hypotheses, on raw and on normalised text.
refs_cln = list(df['ref'])
refs_nrm = list(df['ref-norm'])

wer_cln = jiwer.wer(refs_cln, list(df['hyp']))
wer_nrm = jiwer.wer(refs_nrm, list(df['hyp-norm']))
print('clean: {} %'.format(round(wer_cln*100,4)))
print('norm : {} %'.format(round(wer_nrm*100,4)))

# Same two scores for the prompted hypotheses, so the effect of prompting is
# actually reported. The prompted text is normalised inline here because no
# normalised column for it is created earlier in the notebook.
hyp_prmpt_nrm = list(df['hyp-prmpt'].map(filterAndNormalize))
wer_prmpt_cln = jiwer.wer(refs_cln, list(df['hyp-prmpt']))
wer_prmpt_nrm = jiwer.wer(refs_nrm, hyp_prmpt_nrm)
print('prompt clean: {} %'.format(round(wer_prmpt_cln*100,4)))
print('prompt norm : {} %'.format(round(wer_prmpt_nrm*100,4)))