# Word Error Rate 

In [None]:
# Environment Python 3.9.16
# !pip install jiwer

import pandas as pd
from jiwer import wer

# Read the input transcript table from the CSV file into a DataFrame.
data = pd.read_csv('(Supplementary)_02_Input_Transcript.csv')

# Print the column names so the available transcript columns can be checked.
print("Columns in DataFrame:", data.columns.to_list())

Columns in DataFrame: ['ID', 'Sentence_Type', 'Sentence_Original', 'STANDARD_TRANSCRIPT', 'vosk-model-small-en-us-0.15', 'Wav2vec2', 'HuBERT', 'Whisper_Base-En', 'Whisper_Large-v3-EN', 'Azure']


In [4]:
# Transcript columns taking part in the comparison. "STANDARD_TRANSCRIPT" is
# the reference text; every other entry is one ASR system's output column.
asr_columns = [
    "STANDARD_TRANSCRIPT",
    "vosk-model-small-en-us-0.15",
    "Wav2vec2",
    "HuBERT",
    "Whisper_Base-En",
    "Whisper_Large-v3-EN",
    "Azure",
]

# Word error rate based on a word-level Levenshtein (edit) distance
def calculate_wer(reference, hypothesis):
    """Return the word error rate of ``hypothesis`` measured against ``reference``.

    Both inputs are converted with ``str()``, stripped, and split on
    whitespace. Words are compared exactly (case- and punctuation-sensitive).
    The edit distance counts deletions, insertions and substitutions with a
    cost of 1 each and is divided by the number of reference words.

    Parameters
    ----------
    reference : object
        Ground-truth transcript.
    hypothesis : object
        Transcript to be scored.

    Returns
    -------
    float or None
        ``distance / max(len(reference_words), 1)``; ``None`` when either
        argument is null according to ``pd.isnull``. With an empty reference
        the result equals the number of hypothesis words.
    """
    if pd.isnull(reference) or pd.isnull(hypothesis):
        return None

    ref_tokens = str(reference).strip().split()
    hyp_tokens = str(hypothesis).strip().split()

    # Only two rows of the edit-distance table are needed at any time.
    # Row 0: turning an empty reference into the first j hypothesis words
    # takes j insertions.
    previous_row = list(range(len(hyp_tokens) + 1))

    for ref_index, ref_token in enumerate(ref_tokens, start=1):
        # Column 0: dropping the first ref_index reference words.
        current_row = [ref_index]
        for hyp_index, hyp_token in enumerate(hyp_tokens, start=1):
            mismatch = 0 if ref_token == hyp_token else 1
            deletion = previous_row[hyp_index] + 1
            insertion = current_row[hyp_index - 1] + 1
            substitution = previous_row[hyp_index - 1] + mismatch
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    edit_distance = previous_row[-1]
    # max(..., 1) guards against dividing by zero for an empty reference.
    return edit_distance / max(len(ref_tokens), 1)


In [6]:
# Score each ASR column against the reference transcript and store the result
# in a new "WER_<column>" column of the same DataFrame.
for column in asr_columns:
    if column == 'STANDARD_TRANSCRIPT':
        # The reference itself is not scored.
        continue

    if column not in data.columns:
        print(f"Column '{column}' does not exist in the DataFrame.")
        continue

    # Row-wise: reference text vs. this system's transcript for the same row.
    data[f"WER_{column}"] = data.apply(
        lambda record: calculate_wer(record['STANDARD_TRANSCRIPT'], record[column]),
        axis=1,
    )

# Write the DataFrame, including the new WER columns, to a CSV file
# (the index is not written).
data.to_csv('(Supplementary)_02_Output_Transcript_WER.csv', index=False)

# Show the first 20 rows of the updated DataFrame
print(data.head(20))

    ID  Sentence_Type               Sentence_Original   
0    1  interrogative            HOW DO WE GET THERE?  \
1    2    declaritive                 GOOD AFTERNOON.   
2    3    declaritive                  THIS IS KUMAR.   
3    4    declaritive                     I THINK SO.   
4    5    exclamatory                      WATCH OUT!   
5    6    declaritive            HE IS A NEW STUDENT.   
6    7  interrogative                  ANYTHING ELSE?   
7    8    exclamatory           OUR TEAM IS THE BEST!   
8    9  interrogative          WHERE ARE THE ANIMALS?   
9   10    declaritive                      FOLLOW ME.   
10  11    exclamatory                  YES, LET'S GO!   
11  12  interrogative        CAN WE HAVE THREE SODAS?   
12  13    declaritive                 YOU'RE WELCOME.   
13  14  interrogative               WHAT'S YOUR NAME?   
14  15    declaritive                  I LIKE SOCCER.   
15  16    declaritive  NOW I KNOW ABOUT MANY ANIMALS.   
16  17    declaritive          

In [None]:
# End.