In [33]:
import pandas as pd
import numpy as np
import string
from jiwer import wer 
import os
import eng_to_ipa as eti

# Translation table for ``str.translate`` that deletes every ASCII punctuation
# character (each code point in ``string.punctuation`` maps to ``None``).
translator = {ord(mark): None for mark in string.punctuation}

In [34]:
def lookup_actual(lookup_file, query, type='Sentence'):
    """Return the normalized reference text for ``query`` from a lookup workbook.

    The workbook is read with ``pd.read_excel`` on every call; the first row
    whose ``lookupval`` column equals ``query`` is selected and the cell in
    column ``type`` is lower-cased, stripped of ASCII punctuation (via the
    module-level ``translator`` table) and right-stripped of whitespace.

    Parameters
    ----------
    lookup_file : path or file-like accepted by ``pd.read_excel``
        Workbook containing a ``lookupval`` column plus the text columns.
    query : str
        Value to match exactly against the ``lookupval`` column.
    type : str, default 'Sentence'
        Name of the column whose text is returned (the callers in this
        notebook pass 'Sentence' or 'FinalWord').

    Returns
    -------
    str
        The normalized text of the first matching row.

    Raises
    ------
    IndexError
        If no row has ``lookupval == query``.
    KeyError
        If the ``lookupval`` or ``type`` column is missing.
    """
    df = pd.read_excel(lookup_file)
    matches = df.loc[df['lookupval'] == query]
    if matches.empty:
        # A bare ``.iloc[0]`` would raise this same exception type, but with
        # a generic out-of-bounds message that hides which query was missing.
        raise IndexError(f"lookupval {query!r} not found in {lookup_file!r}")
    return matches.iloc[0][type].lower().translate(translator).rstrip()

In [35]:
def lookupval_check(file_name):
    """Return True when ``file_name`` contains none of the expected keywords.

    The keywords are the substrings "sensical", "senseless" and "nonword"
    (matched case-sensitively). Note the polarity: a True result means the
    name does NOT contain any of them, which ``calculate_wer`` treats as
    "query not found".
    """
    keywords = ("sensical", "senseless", "nonword")
    return not any(keyword in file_name for keyword in keywords)

In [36]:
def calculate_wer(results, output_file, mode="default", lookup_file="lookup_file"):
    input_excel = pd.ExcelFile(results)
    match mode:
        case "KT":
            with pd.ExcelWriter(output_file) as writer:
                for sheet in input_excel.sheet_names:
                    # read excel files
                    df = pd.read_excel(results, sheet_name=sheet)
                    df = df.drop(0)                                   # drop the separator row
                    produced = np.array(df['Predicted'].astype(str))  # columns of whisper-produced sentences
                    file_list = np.array(df['File'].astype(str)) 

                    # convert to all lower cases
                    produced = np.array([string.lower() for string in produced])

                    # remove all puncutations
                    produced = np.array([string.translate(translator) for string in produced])
                    expected = np.array([file_name[file_name.rfind('_')+1:file_name.rfind('.')] for file_name in file_list])

                    # compute wer
                    wer_s = np.array([wer(e, p) for e, p in zip(expected, produced)])

                    # add WER to the csv file 
                    df['WER'] = wer_s

                    # output to output_file 
                    df.to_excel(writer, sheet_name=sheet, index=False)
        case "default":
            with pd.ExcelWriter(output_file) as writer:
                for sheet in input_excel.sheet_names:
                    # read excel files 
                    df = pd.read_excel(results, sheet_name=sheet)
                    df = df.drop(0)                                   # drop the separator row
                    expected = np.array(df['Actual'].astype(str))     # columns of actual sentences
                    produced = np.array(df['Predicted'].astype(str))  # columns of whisper-produced sentences

                    # convert to all lower cases
                    expected = np.array([string.lower() for string in expected])
                    produced = np.array([string.lower() for string in produced])

                    # remove all puncutations
                    expected = np.array([string.translate(translator) for string in expected])
                    produced = np.array([string.translate(translator) for string in produced])

                    # compute wer
                    wer_s = np.array([wer(e, p) for e, p in zip(expected, produced)])

                    # add WER to the csv file 
                    df['WER'] = wer_s

                    # output to output_file 
                    df.to_excel(writer, sheet_name=sheet, index=False)
        case "lookup":  
            with pd.ExcelWriter(output_file) as writer:
                # read excel files 
                df = pd.read_excel(results)
                df = df.drop(0)                                   # drop the separator row
                produced = np.array(df['Predicted'].astype(str))  # columns of whisper-produced sentences
                file_list = np.array(df['File'].astype(str))

                # convert to all lower cases
                produced = np.array([string.lower().rstrip() for string in produced])

                # remove all puncutations
                produced = np.array([string.translate(translator) for string in produced])

                # compute wer
                wer_l = []
                expect_l = []
                for p in range(produced.shape[0]):
                    file_name = file_list[p]
                    # check to see if file_name matches format
                    if lookupval_check(file_name):
                        expect_l.append("Query not found!")
                        print("Query not found:", file_name)
                        wer_l.append(-1)                        # -1 will be appended if query not found
                        continue
                    if "WRONG" in file_name:
                        expect_l.append("Query not found!")
                        print("Query not found:", file_name)
                        wer_l.append(-1)
                        continue
                    # extract query from file_name for lookups
                    query = file_name[file_name.index('_') + 1:-4] + file_name[8:file_name.index('_')]
                    word = produced[p] 
                    if "sent" in file_name: # comparing sentences
                        expected = lookup_actual(lookup_file, query, type='Sentence')
                        expect_l.append(expected)
                        wer_s = wer(expected, word)
                        wer_l.append(wer_s)
                    else:
                        if ' ' in word: # comparing last words
                            word = word[word.rfind(' '):]   # extract last word from sentnece
                        expected = lookup_actual(lookup_file, query, type='FinalWord')
                        expect_l.append(expected)
                        wer_s = wer(expected, word)
                        wer_l.append(wer_s)
                        
                # add WER to the csv file 
                df['Actual'] = np.array(expect_l)
                df['WER'] = np.array(wer_l)

                # output to output_file 
                df.to_excel(writer)
    return 

In [37]:
# calculate_wer("US&MX_database_output.xlsx", "US&MX_database_wer.xlsx", mode="lookup", lookup_file="USMXlookup.xlsx")
# calculate_wer("US&MX_datatiny_output.xlsx", "US&MX_datatiny_wer.xlsx", mode="lookup", lookup_file="USMXlookup.xlsx")
# calculate_wer("US&MX_datasmall_output.xlsx", "US&MX_datasmall_wer.xlsx", mode="lookup", lookup_file="USMXlookup.xlsx")
# calculate_wer("US&MX_datamedium_output.xlsx", "US&MX_datamedium_wer.xlsx", mode="lookup", lookup_file="USMXlookup.xlsx")
# calculate_wer("US&MX_datalarge_output.xlsx", "US&MX_datalarge_wer.xlsx", mode="lookup", lookup_file="USMXlookup.xlsx")
# calculate_wer("US&MX/US&MXhumantranscriptions.xlsx", "US&MXhumantranscriptions_wer_full.xlsx", mode="lookup", lookup_file="US&MX/USMXlookup.xlsx")
# calculate_wer("KT_results_tiny.xlsx", "KT_results_tiny_wer.xlsx", mode="KT")
# calculate_wer("KT_results_small.xlsx", "KT_results_small_wer.xlsx", mode="KT")
# calculate_wer("KT_results_medium.xlsx", "KT_results_medium_wer.xlsx", mode="KT")

In [38]:
def to_ipa(input):
    """Convert the ``Predicted`` column of a results workbook to IPA.

    Reads ``input`` with ``pd.read_excel``, drops the row labelled 0 (the
    separator row), casts each prediction to ``str`` and passes it through
    ``eng_to_ipa.convert``.

    Returns a list with one converted string per remaining row.
    """
    predictions = pd.read_excel(input).drop(0)['Predicted']
    words = np.array(predictions.astype(str))
    return list(map(eti.convert, words))