# Step 2.9: Getting Word Error Rates (WER) Pre- and Post-Feature

This code computes the Word Error Rate (WER) of the ASR output for the content before and after the occurrence of the feature in question. Each rate is the number of errors divided by the number of words in the original (cleaned) utterance content that occur before or after the feature, respectively.

## Required Packages

The following packages are necessary to run this code: os, [pandas](https://pypi.org/project/pandas/), [numpy](https://pypi.org/project/numpy/)

## Initial Setup

In [None]:
# Import required packages
import pandas as pd
import numpy as np
import os

In [None]:
# Locations of the per-feature CSVs produced in Step 2.8
aint_file_path = "path"
be_file_path = "path"
done_file_path = "path"

# Load the gold standard dataframe for each feature from its CSV
aint_gs_df, be_gs_df, done_gs_df = (
    pd.read_csv(csv_path)
    for csv_path in (aint_file_path, be_file_path, done_file_path)
)

# Defining the Pre-Feature Word Error Rate (WER) Getting Function

This function takes the following arguments:
1. The number of errors which occur before the feature
2. The number of words which occur before the feature (in the cleaned, original utterance content)

In [None]:
def getPreFeatureWER(pre_feature_error_count, cleaned_pre_feature_utterance_word_count):

    """
    This function will get the word error rate (WER) pre-feature
    by dividing the number of pre-feature errors by the number
    of words in the original (cleaned) utterance content

    Arguments:
        pre_feature_error_count: the number of errors which occur
            before the feature
        cleaned_pre_feature_utterance_word_count: the number of words
            which occur before the feature (in the cleaned, original
            utterance content)

    Returns:
        The quotient of the two counts, or 0 when the word count is zero
    """

    # Check for a zero word count explicitly rather than catching
    # ZeroDivisionError: NumPy scalars do not raise on division by zero
    # (they produce inf/nan with a warning), so the exception alone
    # would miss that case
    if cleaned_pre_feature_utterance_word_count == 0:

        return 0

    return pre_feature_error_count / cleaned_pre_feature_utterance_word_count

# Defining the Post-Feature Word Error Rate (WER) Getting Function

This function takes the following arguments:
1. The number of errors which occur after the feature
2. The number of words which occur after the feature (in the cleaned, original utterance content)

In [None]:
def getPostFeatureWER(post_feature_error_count, cleaned_post_feature_utterance_word_count):

    """
    This function will get the word error rate (WER) post-feature
    by dividing the number of post-feature errors by the number
    of words in the original (cleaned) utterance content

    Arguments:
        post_feature_error_count: the number of errors which occur
            after the feature
        cleaned_post_feature_utterance_word_count: the number of words
            which occur after the feature (in the cleaned, original
            utterance content)

    Returns:
        The quotient of the two counts, or 0 when the word count is zero
    """

    # Check for a zero word count explicitly rather than catching
    # ZeroDivisionError: NumPy scalars do not raise on division by zero
    # (they produce inf/nan with a warning), so the exception alone
    # would miss that case
    if cleaned_post_feature_utterance_word_count == 0:

        return 0

    return post_feature_error_count / cleaned_post_feature_utterance_word_count

## Executing the Code

In [None]:
# Cleaned-transcription columns, one per ASR system; the new WER columns
# are inserted at positions relative to each of these
column_names = [
    "amazon_transcription_cleaned",
    "deepspeech_transcription_cleaned",
    "google_transcription_cleaned",
    "IBMWatson_transcription_cleaned",
    "microsoft_transcription_cleaned",
]

### Feature: Ain't

In [None]:
# Computes the pre- and post-feature WER of every row for each ASR system
# and inserts the results as new columns. Looping over column_names keeps
# the computation in step with the list instead of repeating each system's
# column names by hand.
for column_name in column_names:

    # Row-wise WER values. They are coerced to float so that a column made
    # up only of the zero-division fallback (integer 0) keeps the float
    # dtype that a NaN-initialised column would have.
    pre_feature_wer = np.asarray(
        [
            getPreFeatureWER(error_count, word_count)
            for error_count, word_count in zip(
                aint_gs_df[f"{column_name}_preFeature_errorCount"],
                aint_gs_df["Content_cleaned_PreFeature_WordCount"],
            )
        ],
        dtype=float,
    )

    post_feature_wer = np.asarray(
        [
            getPostFeatureWER(error_count, word_count)
            for error_count, word_count in zip(
                aint_gs_df[f"{column_name}_postFeature_errorCount"],
                aint_gs_df["Content_cleaned_PostFeature_WordCount"],
            )
        ],
        dtype=float,
    )

    # Position of this system's cleaned-transcription column; looked up on
    # every pass because earlier insertions shift the later columns
    col_index = aint_gs_df.columns.get_loc(column_name)

    # NOTE(review): the +5 / +7 offsets presumably place each WER column
    # beside the matching error-count column from the earlier steps;
    # confirm against the column layout of the Step 2.8 CSV
    aint_gs_df.insert(col_index + 5, f"{column_name}_preFeature_WER", pre_feature_wer)

    aint_gs_df.insert(col_index + 7, f"{column_name}_postFeature_WER", post_feature_wer)

### Feature: Be

In [None]:
# Computes the pre- and post-feature WER of every row for each ASR system
# and inserts the results as new columns. Looping over column_names keeps
# the computation in step with the list instead of repeating each system's
# column names by hand.
for column_name in column_names:

    # Row-wise WER values. They are coerced to float so that a column made
    # up only of the zero-division fallback (integer 0) keeps the float
    # dtype that a NaN-initialised column would have.
    pre_feature_wer = np.asarray(
        [
            getPreFeatureWER(error_count, word_count)
            for error_count, word_count in zip(
                be_gs_df[f"{column_name}_preFeature_errorCount"],
                be_gs_df["Content_cleaned_PreFeature_WordCount"],
            )
        ],
        dtype=float,
    )

    post_feature_wer = np.asarray(
        [
            getPostFeatureWER(error_count, word_count)
            for error_count, word_count in zip(
                be_gs_df[f"{column_name}_postFeature_errorCount"],
                be_gs_df["Content_cleaned_PostFeature_WordCount"],
            )
        ],
        dtype=float,
    )

    # Position of this system's cleaned-transcription column; looked up on
    # every pass because earlier insertions shift the later columns
    col_index = be_gs_df.columns.get_loc(column_name)

    # NOTE(review): the +5 / +7 offsets presumably place each WER column
    # beside the matching error-count column from the earlier steps;
    # confirm against the column layout of the Step 2.8 CSV
    be_gs_df.insert(col_index + 5, f"{column_name}_preFeature_WER", pre_feature_wer)

    be_gs_df.insert(col_index + 7, f"{column_name}_postFeature_WER", post_feature_wer)

### Feature: Done

In [None]:
# Computes the pre- and post-feature WER of every row for each ASR system
# and inserts the results as new columns. Looping over column_names keeps
# the computation in step with the list instead of repeating each system's
# column names by hand.
for column_name in column_names:

    # Row-wise WER values. They are coerced to float so that a column made
    # up only of the zero-division fallback (integer 0) keeps the float
    # dtype that a NaN-initialised column would have.
    pre_feature_wer = np.asarray(
        [
            getPreFeatureWER(error_count, word_count)
            for error_count, word_count in zip(
                done_gs_df[f"{column_name}_preFeature_errorCount"],
                done_gs_df["Content_cleaned_PreFeature_WordCount"],
            )
        ],
        dtype=float,
    )

    post_feature_wer = np.asarray(
        [
            getPostFeatureWER(error_count, word_count)
            for error_count, word_count in zip(
                done_gs_df[f"{column_name}_postFeature_errorCount"],
                done_gs_df["Content_cleaned_PostFeature_WordCount"],
            )
        ],
        dtype=float,
    )

    # Position of this system's cleaned-transcription column; looked up on
    # every pass because earlier insertions shift the later columns
    col_index = done_gs_df.columns.get_loc(column_name)

    # NOTE(review): the +5 / +7 offsets presumably place each WER column
    # beside the matching error-count column from the earlier steps;
    # confirm against the column layout of the Step 2.8 CSV
    done_gs_df.insert(col_index + 5, f"{column_name}_preFeature_WER", pre_feature_wer)

    done_gs_df.insert(col_index + 7, f"{column_name}_postFeature_WER", post_feature_wer)

## Sorting the Dataframes by File and Line

This will sort the dataframes first by filename and then by line number. Doing this at each step will ensure consistency across the board.

### Feature: Ain't

In [None]:
# Order the rows by file name first, then by line number
aint_gs_df = aint_gs_df.sort_values(
    by=["File", "Line"],
)

### Feature: Be

In [None]:
# Order the rows by file name first, then by line number
be_gs_df = be_gs_df.sort_values(
    by=["File", "Line"],
)

### Feature: Done

In [None]:
# Order the rows by file name first, then by line number
done_gs_df = done_gs_df.sort_values(
    by=["File", "Line"],
)

## Exporting Dataframes to CSV Files

This will export the dataframes to CSV files.

In [None]:
# Designate the output path where the CSVs will be stored; this value is
# used as the leading part of each exported file's path in the cells below
csv_output_path = "path"

### Feature: Ain't

In [None]:
# Build the path with os.path.join so the export works whether or not
# csv_output_path ends with a path separator (plain concatenation would
# fuse the directory name with the file name)
aint_gs_df.to_csv(os.path.join(csv_output_path, "aint_variations_prePostWER.csv"), index=False)

### Feature: Be

In [None]:
# Build the path with os.path.join so the export works whether or not
# csv_output_path ends with a path separator (plain concatenation would
# fuse the directory name with the file name)
be_gs_df.to_csv(os.path.join(csv_output_path, "be_prePostWER.csv"), index=False)

### Feature: Done

In [None]:
# Build the path with os.path.join so the export works whether or not
# csv_output_path ends with a path separator (plain concatenation would
# fuse the directory name with the file name)
done_gs_df.to_csv(os.path.join(csv_output_path, "done_prePostWER.csv"), index=False)