# Whisper ATCO2 Testing

In [5]:
import glob
import whisper
from whisper.normalizers import EnglishTextNormalizer
import re
import pandas as pd
import jiwer
import numpy as np

In [7]:
print(wav_atco2_asr)

['./WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_091112.wav', './WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_120512.wav', './WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_121325.wav', './WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_130407.wav', './WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_140929.wav']


In [8]:
wav_files  = glob.glob('./WhisperModel/ATCO2-ASR/DATA/*.wav')
wav_files.sort()
info_files = glob.glob('./WhisperModel/ATCO2-ASR/DATA/*.conv.info')
info_files.sort()
txt_files  = glob.glob('./WhisperModel/ATCO2-ASR/DATA/*.txt')
txt_files.sort()

In [9]:
print(wav_files[:5])

['./WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_091112.wav', './WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_120512.wav', './WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_121325.wav', './WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_130407.wav', './WhisperModel/ATCO2-ASR/DATA/LKPR_RUZYNE_Radar_120_520MHz_20201025_140929.wav']


In [10]:
def convertInfoFile(file):
    airport = ''
    airport_full = ''
    channel = ''
    wpts = ''
    csns = ''
    csns_full = ''
    
    with open(file, 'r') as f:
        info_full = f.read()
        csns = ''
        csns_full = ''
        for line in info_full.splitlines():
            if 'airport' in line:
                airport = line[9:13]
                x = line.find('(')
                y = line.find(')')
                airport_full = line[x+1:y]
            if 'channel' in line:
                channel = line[9:]
            if 'waypoints nearby' in line:
                wpts = line[18:]
            if 'callsigns nearby' in line:
                x = info_full.splitlines().index(line) + 1
                for line in info_full.splitlines()[x:]:
                    csn = line.split(' ')[0]
                    csns = csns + ' ' + csn
                    
                    csn_full = ' '.join(line.split(' ')[2:])
                    csns_full = csns_full + ' ' + csn_full
                csns = csns[1:]
                csns_full = csns_full[1:]
                
                csns_full = list(dict.fromkeys(csns_full.split(' ')))

    nato_alphabet =['alpha', 'bravo', 'charlie', 'delta', 'echo', 
                    'foxtrot', 'golf', 'hotel', 'india', 'juliett',
                    'kilo', 'lima', 'mike', 'november', 'oscar',
                    'papa', 'quebec', 'romeo', 'sierra', 'tango',
                    'uniform', 'victor', 'whiskey', 'xray', 'x-ray', 'yankee', 'zulu',
                         
                    'one', 'two', 'three', 'four', 'five',
                    'six', 'seven', 'eight', 'nine', 'ten', 
                    'zero', 'hundred', 'thousand']
    
    csns_full = [x for x in csns_full if x.lower() not in nato_alphabet and len(x)!=0]
    
    # print(airport)
    # print(channel)
    # print(wpts)
    # print(csns)
    # print(csns_full)
    # print(info_full)
    # return airport, channel, wpts, csns, csns_full

    file = file[:-4]+'conv.info'
    with open(file, 'x') as f:
        f.write(airport)
        f.write('\n')
        f.write(airport_full)
        f.write('\n')
        f.write(channel)
        f.write('\n')
        f.write(wpts)
        f.write('\n')
        f.write(csns)
        f.write('\n')
        f.write(' '.join(csns_full))

In [12]:
model = whisper.load_model('large-v2')

In [13]:
def readInfoFile(file):
    with open(file, 'r') as f:
        airport = " "
        airport_full = " "
        channel = " "
        wpts = " "
        csns = " "
        csns_full = " "
        lines = f.read().splitlines()
        try:
            airport = lines[0]
        except:
            pass
        try:
            airport_full = lines[1]
        except:
            pass
        try:
            channel = lines[2]
        except:
            pass
        try:
            wpts = lines[3]
        except:
            pass
        try:
            csns = lines[4]
        except:
            pass
        try:
            csns_full = lines[5]
        except:
            pass
        
    return airport, airport_full, channel, wpts, csns, csns_full

In [14]:
hyp_clean = []
hyp_prmpt = []
ref = []

for file in wav_files[:5]:
    prompt_general = "Air Traffic Control communications"
    airport, airport_full, channel, wpts, csns, csns_full = readInfoFile(file[:-3]+'conv.info')
    nato = "alpha,bravo,charlie,delta,echo,foxtrot,golf,hotel,india,juliett,kilo,lima,mike,november,oscar,papa,quebec,romeo,sierra,tango,uniform,victor,whiskey,xray,yankee,zulu"
    terminology = "climb, climbing, descend, descending, passing, feet, knots, degrees, direct, maintain, identified, ILS, VFR, IFR, contact, frequency, turn, right, left, heading, altitude, flight, level, cleared, squawk, approach, runway, established, report, affirm, negative, wilco, roger, radio, radar"
    prompt = prompt_general + " " + airport + " " + airport_full + " " + channel + " " + wpts + " " + csns + " " + csns_full + " " + nato + " " + terminology
    
    res_clean = model.transcribe(file, language='en')
    hyp_clean.append(res_clean)
    res_prmpt = model.transcribe(file, language='en', initial_prompt = prompt)
    hyp_prmpt.append(res_prmpt)
    
    with open(file[:-3]+'txt') as f:
        correct = f.read()
        ref.append(correct)

In [15]:
df = pd.DataFrame([hyp_clean, hyp_prmpt, ref]).T # columns: hyp-clean, hyp-prmpt, ref
df = df.rename(columns={0: 'hyp-clean', 1: 'hyp-prmpt', 2: 'ref'})
# df.to_excel('WER-tests-ATCO2-before-norm.xlsx')

In [20]:
import pandas as pd
import numpy as np
#df = pd.read_excel('./PromptTesting/ATCO2/WER-tests-ATCO2-before-norm.xlsx', index_col=0, dtype={'hyp-clean': dict, 'hyp-prmpt': dict, 'ref': str})
df2 = df
df = df
df2 = df2
df = df.reset_index()
df = df.drop(columns='index')

In [21]:
df

Unnamed: 0,hyp-clean,hyp-prmpt,ref
0,"{'text': ' Oscar Kilo Papa Mike Bravo, descend...","{'text': ' Oscar Kilo Papa Mike Bravo, recent ...",Oscar Kilo Papa Mike Bravo descend flight leve...
1,"{'text': ' OSCQA, QNH 100, IFR flight starts n...",{'text': ' Oscar Kilo Kilo Echo Alpha Praha Ra...,Oscar Kilo Kilo Echo Alfa Praha Radar identifi...
2,{'text': ' Ryanair 730H turn left heading 360 ...,{'text': ' Ryanair 730H turn left heading 360 ...,Ryanair Seven Three Alpha Hotel turn left head...
3,"{'text': ' Oskar, Kilo Kilo Info, November, pr...","{'text': ' Oskar, Kilo Kilo info, November, pr...",Oscar Kilo Kilo Uniform November proceed direc...
4,"{'text': ' EW7AB, turn right heading 210, clea...","{'text': ' Eurowings 7A, bravo, turn right hea...",Eurowings Seven Alfa Bravo turn right heading ...


In [17]:
import ast

def convertStringToDict(str):
    return ast.literal_eval(str)
def replaceDictByText(dct):
    return convertStringToDict(dct)['text']

In [22]:
df.loc[:, 'hyp-clean'] = df.apply(lambda x: replaceDictByText(x['hyp-clean']), axis=1)
df.loc[:, 'hyp-prmpt'] = df.apply(lambda x: replaceDictByText(x['hyp-prmpt']), axis=1)

ValueError: malformed node or string: {'text': ' Oscar Kilo Papa Mike Bravo, descend FL100 Descend FL100, Oscar Kilo Papa Mike Bravo', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 3.2, 'text': ' Oscar Kilo Papa Mike Bravo, descend FL100', 'tokens': [50364, 20718, 591, 10720, 21102, 6602, 28861, 11, 16333, 24720, 6879, 50524], 'temperature': 0.0, 'avg_logprob': -0.6286844106820914, 'compression_ratio': 1.4310344827586208, 'no_speech_prob': 0.4093279540538788}, {'id': 1, 'seek': 0, 'start': 4.4, 'end': 6.4, 'text': ' Descend FL100, Oscar Kilo Papa Mike Bravo', 'tokens': [50584, 3885, 21153, 24720, 6879, 11, 20718, 591, 10720, 21102, 6602, 28861, 50684], 'temperature': 0.0, 'avg_logprob': -0.6286844106820914, 'compression_ratio': 1.4310344827586208, 'no_speech_prob': 0.4093279540538788}], 'language': 'en'}

In [24]:
df.to_excel('test.xlsx')

In [27]:
df = pd.read_excel('test.xlsx')
df.columns

Index(['Unnamed: 0', 'hyp-clean', 'hyp-prmpt', 'ref'], dtype='object')

In [33]:
#df = df.drop(columns='Unnamed: 0')
print(df.dtypes)
df

hyp-clean    object
hyp-prmpt    object
ref          object
dtype: object


Unnamed: 0,hyp-clean,hyp-prmpt,ref
0,"Oscar Kilo Papa Mike Bravo, descend FL100 Desc...","Oscar Kilo Papa Mike Bravo, recent flight leve...",Oscar Kilo Papa Mike Bravo descend flight leve...
1,"OSCQA, QNH 100, IFR flight starts now, time 05...",Oscar Kilo Kilo Echo Alpha Praha Radar identif...,Oscar Kilo Kilo Echo Alfa Praha Radar identifi...
2,Ryanair 730H turn left heading 360 Go Ryanair ...,Ryanair 730H turn left heading 360 Go Ryanair ...,Ryanair Seven Three Alpha Hotel turn left head...
3,"Oskar, Kilo Kilo Info, November, proceed direc...","Oskar, Kilo Kilo info, November, proceed direc...",Oscar Kilo Kilo Uniform November proceed direc...
4,"EW7AB, turn right heading 210, cleared ILS app...","Eurowings 7A, bravo, turn right heading 210, c...",Eurowings Seven Alfa Bravo turn right heading ...


In [134]:
nato_alphabet_mapping       = {'A': 'alpha', 'B': 'bravo', 'C': 'charlie', 'D': 'delta', 'E': 'echo', 
                            'F': 'foxtrot', 'G': 'golf', 'H': 'hotel', 'I': 'india', 'J': 'juliett',
                            'K': 'kilo', 'L': 'lima', 'M': 'mike', 'N': 'november', 'O': 'oscar',
                            'P': 'papa', 'Q': 'quebec', 'R': 'romeo', 'S': 'sierra', 'T': 'tango',
                            'U': 'uniform', 'V': 'victor', 'W': 'whiskey', 'X': 'xray', 'Y': 'yankee', 'Z': 'zulu',
                         
                            '1': 'one', '2': 'two', '3': 'three', '4': 'four', '5': 'five',
                            '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten', 
                            '0': 'zero', '00': 'hundred', '000': 'thousand',
                         
                            '.': 'decimal', ',': 'comma', '-': 'dash',}
nato_similarities           = {'alfa': 'alpha', 'oskar': 'oscar', 'ekko': 'echo', 'gulf': 'golf', 'charly': 'charlie'}
terminology_mapping         = {'FL': 'flight level'}
text_similarities           = {'descent': 'descend'}

# Not needed for WER calculations
# airlines_icao_mapping       = {'lufthansa': 'lufthansa', 'speedbird': 'british airways'}
# airlines_synonym_mapping    = {'hansa': 'lufthansa'}

# Sometimes Whisper is intelligent enough to perceive 'eurowings seven alpha bravo' as 'EW7AB'
airlines_iata_codes         = {'BA': 'british airways', 'KL': 'klm', 'LH': 'lufthansa', 'EW': 'eurowings'}
airlines_icao_codes         = {'BAW': 'british airways', 'DLH': 'lufthansa', 'KLM': 'klm', 'EWG': 'eurowings', 'RYR': 'ryanair'}

def aerospaceTransform(text):
    wrds = text.split()
    for word in wrds:
        if word in nato_alphabet_mapping:
            x = wrds.index(word)
            wrds[x] = nato_alphabet_mapping[word]
        if word.lower() in nato_similarities:
            x = wrds.index(word)
            wrds[x] = nato_similarities[word.lower()]
        if word in terminology_mapping:
            x = wrds.index(word)
            wrds[x] = terminology_mapping[word]
        if word.lower() in text_similarities:
            x = wrds.index(word)
            wrds[x] = text_similarities[word.lower()]
        if word.upper() in airlines_iata_codes:
            x = wrds.index(word)
            wrds[x] = airlines_iata_codes[word.upper()]            
        if word.upper() in airlines_icao_codes:
            x = wrds.index(word)
            wrds[x] = airlines_icao_codes[word.upper()]
    return ' '.join(wrds)

In [34]:
normalizer = EnglishTextNormalizer()
def removePunctuation(text):
    text = ''.join(
        ' ' if c in '!@#$%^&*~-+=_\|;:,.?' else c
        for c in text
    )
    return text

def separateNumbersAndText(text):
    text = re.split('(\d+)', text)
    text = ' '.join(text)
    return text

def separateCallSignLetters(text):
    wrds = text.split()
    prohibited_words = ['ILS', 'IFR', 'FL']
    for word in wrds:
        if word.isupper() and word not in prohibited_words:
            ltrs = [str(l) for l in word]
            ltrs = ' '.join(str(l) for l in ltrs)
            x = wrds.index(word)
            wrds[x] = ltrs
    
    return ' '.join(wrds)

def splitNumbersIntoDigits(text):
    wrds = text.split()
    for word in wrds:
        if word.isnumeric():
            dgts = [int(d) for d in word]
            dgts = ' '.join(str(d) for d in dgts)
            x = wrds.index(word)
            wrds[x] = dgts
        
    return ' '.join(wrds)

def removeSpokenSeparators(text):
    wrds = text.split()
    for word in wrds:
        if word.lower() in ['decimal', 'comma', 'point']:
            x = wrds.index(word)
            wrds[x] = ''
        
    return ' '.join(wrds)

def splitGreetings(text):
    wrds = text.split()
    for word in wrds:
        if word.lower() in ['goodbye']:
            x = wrds.index(word)
            wrds[x] = 'good bye'
            
    return ' '.join(wrds)

def splitStandBy(text):
    wrds = text.split()
    for word in wrds:
        if word.lower() in ['standby']:
            x = wrds.index(word)
            wrds[x] = 'stand by'
            
    return ' '.join(wrds)

def removeCharSet(text, c1, c2): # for removing all text within (and including) a character set (ex.: [TRANSCRIPT] )
    while c1 in text and c2 in text:
        x = text.find(c1)
        y = text.rfind(c2) # Should be the last entry of the closing element ) ] > 
        text = text[0:x] + text[y+1:]
    return text

def removeChar(text, c1): # for removing a single character (ex.: @ )
    while c1 in text:
        x = text.find(c1)
        text = text[0:x] + text[x+1:]
    return text

def removeNonAlphaNum(text): # for removing all non alphanumeric characters (ex.: ! @ # $ % ^ & * ) (AlphanNum.: A-Z, a-z, 0-9)
    for c in text:
        if c.isalnum() == False and c != ' ' :
            x = text.find(c)
            text = text[0:x] + text[x+1:]
    return text

def splitIfContainsNatoAlphabet(text):
    wrds = text.split()
    for word in wrds:
        x = wrds.index(word)
        for val in list(nato_alphabet_mapping.values()):
            if val in word:
                y = word.find(val)
                newWord = word[:y] + ' ' + word[y:y+len(val)] + ' ' + word[y+len(val):]
                wrds[x] = newWord

    return ' '.join(wrds)

def transformNatoToLetterOnly(text):
    wrds = text.split()
    vals = list(nato_alphabet_mapping.values())
    keys = list(nato_alphabet_mapping.keys())
    for word in wrds:
        x = wrds.index(word)
        if word in vals:
            wrds[x] = keys[vals.index(word)].lower()
            
    return ' '.join(wrds)

def looseLettersTogetherIfNatoKeys(text):
    text_f4 = [*text[:4]]
    text_l4 = [*text[-4:]]
    text_re = text[4:-4]
    
    if text_f4[1] == ' ' and text_f4[3] == ' ':
        text = text[0] + text[2:]
    
    if text_l4[0] == ' ' and text_l4[2] == ' ':
        text = text[:-2] + text[-1]
    
    lst = re.findall('\w\D', text_re) # find all \w (alphanumeric and _, but _ is nonexistent) but not \D (digits) => only characters
    lst = [l.strip() for l in lst if len(l.strip())==1]
    print(lst)
    
    for l in lst:
        i = lst.index(l)
        x = text.find(l)
        if i+2 <= len(lst):
            y = text.find(lst[i+1])
            text = text[:x+1] + text[y:]
        
    return text

In [429]:
text = 'a b c d e he y d e llo c d'
chars = re.finditer(r'(?i)\b[a-z]\b', text)
for char in chars:
    print(char)

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(4, 5), match='c'>
<re.Match object; span=(6, 7), match='d'>
<re.Match object; span=(8, 9), match='e'>
<re.Match object; span=(13, 14), match='y'>
<re.Match object; span=(15, 16), match='d'>
<re.Match object; span=(17, 18), match='e'>
<re.Match object; span=(23, 24), match='c'>
<re.Match object; span=(25, 26), match='d'>


In [35]:
def filterAndNormalize(text):   
    text = removeCharSet(text, '[', ']')
    text = removeCharSet(text, '<', '>')
    # text = removeCharSet(text, '(', ')')
    
    text = removeNonAlphaNum(text)
    text = separateNumbersAndText(text)
    text = aerospaceTransform(text)
    text = removeSpokenSeparators(text)
    # text = separateCallSignLetters(text)

    text = normalizer(text)
    text = normalizer(text)
    # Running twice because the normalizer will replace 'zero five' by '05' but also replaces '05' by '5' (removing leading zeros).
    
    text = splitNumbersIntoDigits(text)

    text = splitGreetings(text)
    text = splitStandBy(text)
    
    text = splitIfContainsNatoAlphabet(splitIfContainsNatoAlphabet(text))
    text = text.lower()
    return text

def normalizeOnly(text):
    return normalizer(text)

In [36]:
df

Unnamed: 0,hyp-clean,hyp-prmpt,ref
0,"Oscar Kilo Papa Mike Bravo, descend FL100 Desc...","Oscar Kilo Papa Mike Bravo, recent flight leve...",Oscar Kilo Papa Mike Bravo descend flight leve...
1,"OSCQA, QNH 100, IFR flight starts now, time 05...",Oscar Kilo Kilo Echo Alpha Praha Radar identif...,Oscar Kilo Kilo Echo Alfa Praha Radar identifi...
2,Ryanair 730H turn left heading 360 Go Ryanair ...,Ryanair 730H turn left heading 360 Go Ryanair ...,Ryanair Seven Three Alpha Hotel turn left head...
3,"Oskar, Kilo Kilo Info, November, proceed direc...","Oskar, Kilo Kilo info, November, proceed direc...",Oscar Kilo Kilo Uniform November proceed direc...
4,"EW7AB, turn right heading 210, cleared ILS app...","Eurowings 7A, bravo, turn right heading 210, c...",Eurowings Seven Alfa Bravo turn right heading ...


In [37]:
df.loc[:, 'hyp-clean-norm'] = df.apply(lambda x: normalizeOnly(x['hyp-clean']), axis=1)
df.loc[:, 'hyp-prmpt-norm'] = df.apply(lambda x: normalizeOnly(x['hyp-prmpt']), axis=1)
df.loc[:, 'ref-norm'] = df.apply(lambda x: normalizeOnly(x['ref']), axis=1)

# writer = pd.ExcelWriter('./PromptTesting/ATCO2/Transcripts-ATCO2-ASR.xlsx', engine='xlsxwriter')   
# df.T.to_excel(excel_writer=writer, sheet_name='ATCO2-ASR')
# writer.save()

In [38]:
df

Unnamed: 0,hyp-clean,hyp-prmpt,ref,hyp-clean-norm,hyp-prmpt-norm,ref-norm
0,"Oscar Kilo Papa Mike Bravo, descend FL100 Desc...","Oscar Kilo Papa Mike Bravo, recent flight leve...",Oscar Kilo Papa Mike Bravo descend flight leve...,oscar kilo papa mike bravo descend fl 100 desc...,oscar kilo papa mike bravo recent flight level...,oscar kilo papa mike bravo descend flight leve...
1,"OSCQA, QNH 100, IFR flight starts now, time 05...",Oscar Kilo Kilo Echo Alpha Praha Radar identif...,Oscar Kilo Kilo Echo Alfa Praha Radar identifi...,oscqa qnh 100 ifr flight starts now time 5 cle...,oscar kilo kilo echo alpha praha radar identif...,oscar kilo kilo echo alfa praha radar identifi...
2,Ryanair 730H turn left heading 360 Go Ryanair ...,Ryanair 730H turn left heading 360 Go Ryanair ...,Ryanair Seven Three Alpha Hotel turn left head...,ryanair 730 h turn left heading 360 go ryanair...,ryanair 730 h turn left heading 360 go ryanair...,ryanair 73 alpha hotel turn left heading 360 r...
3,"Oskar, Kilo Kilo Info, November, proceed direc...","Oskar, Kilo Kilo info, November, proceed direc...",Oscar Kilo Kilo Uniform November proceed direc...,oskar kilo kilo info november proceed direct b...,oskar kilo kilo info november proceed direct b...,oscar kilo kilo uniform november proceed direc...
4,"EW7AB, turn right heading 210, cleared ILS app...","Eurowings 7A, bravo, turn right heading 210, c...",Eurowings Seven Alfa Bravo turn right heading ...,ew 7 ab turn right heading 210 cleared ils app...,eurowings 7 a bravo turn right heading 210 cle...,eurowings 7 alfa bravo turn right heading 210 ...


In [39]:
wer_clean = jiwer.wer(list(df['ref-norm']), list(df['hyp-clean-norm']))
wer_prmpt = jiwer.wer(list(df['ref-norm']), list(df['hyp-prmpt-norm']))
print('ATCO2 ASR -- no prompt: {} %'.format(round(wer_clean*100,2)))
print('ATCO2 ASR --    prompt: {} %'.format(round(wer_prmpt*100,2)))

ATCO2 ASR -- no prompt: 38.14 %
ATCO2 ASR --    prompt: 23.71 %
