# Step 2.5: Cleaning Utterance Content

This code cleans the utterance content of the original CORAAL transcripts along with the ASR transcriptions. It is specifically attuned to address the idiosyncrasies of the ASR outputs.

## Required Packages

The following packages are necessary to run this code: string, [pandas](https://pypi.org/project/pandas/), [numpy](https://pypi.org/project/numpy/), [nltk](https://pypi.org/project/nltk/), [num2words](https://pypi.org/project/num2words/)

## Initial Setup

In [None]:
# Import required packages
import pandas as pd
import numpy as np
import os

In [None]:
# Locations of the CSV files produced in Step 2.4 (one per feature)
aint_file_path = "path"
be_file_path = "path"
done_file_path = "path"

# Loads each gold standard CSV into its own dataframe, in the same
# order as the paths above
aint_gs_df, be_gs_df, done_gs_df = (
    pd.read_csv(gs_file_path)
    for gs_file_path in (aint_file_path, be_file_path, done_file_path)
)

## Defining the Cleaning Function

This function takes one argument:
1. The utterance content as a string

In [None]:
def _spell_out_number(number_text):

    """
    Converts a number written in digits (a string) into a list of words.

    Thousands commas in the input are dropped because num2words won't
    accept them. Commas in the num2words output (e.g. "one thousand, two
    hundred") are dropped as well so no punctuation stays attached to a word.
    """

    #imported locally, matching the function-level imports used in this cell
    from num2words import num2words

    number_words = num2words(number_text.replace(",", ""))

    return number_words.replace(",", "").split()


def _looks_like_amount(token):

    """
    Returns True if the token is made of digits only, optionally with
    thousands commas and a single decimal part (e.g. "5", "1,200", "0.50").
    """

    digits = token.replace(",", "")

    whole, point, fraction = digits.partition(".")

    if not whole.isdecimal():

        return False

    #a decimal point must be followed by at least one digit
    return fraction.isdecimal() if point else True


def _spell_out_time(time_token):

    """
    Converts a clock time such as "5:30" into words (["five", "thirty"]).
    A minute of "00" becomes "o'clock". If the hour or the minute is not
    made of digits the token is returned unchanged (as a one-item list)
    instead of being handed to num2words, which would raise an error.
    """

    time_parts = time_token.split(":")

    if len(time_parts) < 2:

        return [time_token]

    #only the first two parts are used, exactly as before
    hour, minute = time_parts[0], time_parts[1]

    if not (hour.isdecimal() and minute.isdecimal()):

        return [time_token]

    hour_words = _spell_out_number(hour)

    #converts the minute number to words
    # if the minute is 00, converts to o'clock
    if minute == "00":

        minute_words = ["o'clock"]

    else:

        minute_words = _spell_out_number(minute)

    return hour_words + minute_words


def _spell_out_money(amount_token):

    """
    Converts the number that followed a dollar sign into words, ending in
    "cent"/"cents" for amounts starting with "0." and "dollar"/"dollars"
    for everything else.
    """

    #converts a cent amount
    if amount_token.startswith("0."):

        #splits at the decimal and takes only the cent amount number
        cent_digits = amount_token.split(".")[1]

        #a single decimal digit is tenths of a dollar, so "0.5" is read
        #  as fifty cents rather than five cents
        cent_words = _spell_out_number(cent_digits.ljust(2, "0"))

        return cent_words + ["cent" if cent_words == ["one"] else "cents"]

    #converts a dollar amount
    dollar_words = _spell_out_number(amount_token)

    return dollar_words + ["dollar" if dollar_words == ["one"] else "dollars"]


def clean_utterance_content(utterance_content):

    """
    Cleans the utterance content of both the original CORAAL utterances
    and also the ASR outputs. This code is specifically attuned to address
    the idiosyncrasies of the ASR outputs.

    Takes the utterance content (a string) as its first argument.

    The cleaning steps are:
      - expands the reductions the nltk tokenizer would otherwise split
        ("gonna", "gotta", "wanna", "lemme"), regardless of capitalisation
      - removes "%HESITATION" and spells out the symbols #, &, % and +
      - strips punctuation from the edges of every word
      - tokenizes with nltk's word_tokenize
      - replaces censored words with "****", expands the remaining
        reductions, and converts times, dollar amounts and whole numbers
        to words
      - drops punctuation-only tokens and CORAAL redactions ("RD-NAME")
      - lowercases everything and joins the tokens with single spaces

    Returns the cleaned utterance as a string. If the argument is not a
    string (e.g. a missing value), returns NaN.
    """

    #imports the standard library and numpy first: they are all the
    #  non-string early return below needs
    import re
    import string
    import numpy as np

    #if the entered utterance_content is not a string, returns a NaN
    if not isinstance(utterance_content, str):

        return np.nan

    #imports the tokenizer only once there is text to tokenize
    from nltk.tokenize import word_tokenize

    #a set of censored words. CORAAL includes these without censoring
    #  however, the ASR services censor them. I believe it is possible to turn
    #  off the censoring filter. However, given that most everyday users of an ASR like Siri would
    #  likely not take this step, I felt it more natural to include the censorship
    censor_words = {"shit", "shits", "fuck", "fucks", "fucking", "fucked",
                    "fucker", "fuckers", "motherfucker", "motherfuckers",
                    "damn", "bitch", "bitches", "bastard", "bastards",
                    "ass", "asses", "goddamn", "nigga", "niggas"}

    #a dictionary of reductions to be converted
    #  "em" is listed next to "'em" because the leading apostrophe is
    #  stripped from the word before it is tokenized
    reduction_dict = {
        "musta": ["must", "have"],
        "woulda": ["would", "have"],
        "shoulda": ["should", "have"],
        "coulda": ["could", "have"],
        "mighta": ["might", "have"],
        "gonna": ["going", "to"],
        "hafta": ["have", "to"],
        "tryna": ["trying", "to"],
        "sposta": ["supposed", "to"],
        "finna": ["fixing", "to"],
        "gotta": ["got", "to"],
        "wanna": ["want", "to"],
        "oughta": ["ought", "to"],
        "cause": ["because"],
        "til": ["until"],
        "'em": ["them"],
        "em": ["them"],
        "lemme": ["let", "me"],
        "whatchu": ["what", "are", "you"],
        "gotcha": ["got", "you"],
    }

    #reductions which must be expanded before tokenizing because the nltk
    #  tokenizer will split them and the reduction cleaner later in the
    #  code won't catch them
    early_reductions = {
        "gotta": "got to",
        "gonna": "going to",
        "wanna": "want to",
        "lemme": "let me",
    }

    #stands in for a censored word while the text goes through the tokenizer
    #  so that the asterisks cannot be stripped or split apart on the way
    censor_placeholder = "censoredwordplaceholder"

    #the characters which make a token "only punctuation"
    #  this set of punctuation is specific and not the same as
    #  the python string library which has string.punctuation ($ is kept)
    punctuation_characters = set('!"#%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

    #a leading dollar sign has to survive the stripping below, otherwise
    #  the dollar amount conversion never sees it
    leading_punctuation = string.punctuation.replace("$", "")

    #expands the early reductions whatever their capitalisation, but only
    #  when they are not part of a longer word (e.g. "wannabe")
    utterance_content = re.sub(
        r"(?<![A-Za-z])(gotta|gonna|wanna|lemme)(?![A-Za-z])",
        lambda match: early_reductions[match.group(1).lower()],
        utterance_content,
        flags=re.IGNORECASE,
    )

    #removes the hesitation marker before the % sign is spelled out
    utterance_content = utterance_content.replace("%HESITATION", "")

    #spells out symbols, padded with spaces so that they do not fuse with
    #  their neighbours ("50%" becomes "50 percent", not "50percent")
    for symbol, symbol_word in (("#", "number"), ("&", "and"), ("%", "percent"), ("+", "plus")):

        utterance_content = utterance_content.replace(symbol, f" {symbol_word} ")

    #strips punctuation from the edges of every word ahead of the tokenizing.
    #  this is because if a contraction has e.g. a hyphen at the end
    #  nltk will not tokenize it correctly. instead of tokenizing
    #  "ain't-" as ["ai", "n't", "-"] it will keep it in one piece
    prepared_words = []

    for word in utterance_content.split():

        #a word containing a run of three or more asterisks is treated
        #  as a censored word, however many asterisks there are
        if re.search(r"\*{3,}", word):

            prepared_words.append(censor_placeholder)

        else:

            prepared_words.append(word.lstrip(leading_punctuation).rstrip(string.punctuation))

    utterance_content = " ".join(word for word in prepared_words if word)

    #tokenizes the utterance content by word
    # importantly: this tokenizer will separate punctuation on the edge
    #  of words out into separate tokens. it will also separate contractions
    #  into separate words
    tokenized_content = word_tokenize(utterance_content)

    #the cleaned tokens are collected in a new list instead of editing
    #  tokenized_content in place: changing a list while looping over it
    #  skips tokens, and looking positions up with .index() only ever
    #  finds the first copy of a repeated token
    cleaned_tokens = []

    position = 0

    while position < len(tokenized_content):

        content_word = tokenized_content[position]

        position += 1

        #censored words and reductions are matched in lowercase because
        #  the output is lowercased anyway
        lookup_word = content_word.lower()

        #removes any redactions in CORAAL because anytime a redaction
        #  occurs in the audio, it is covered by a beep which the ASR
        #  will not be able to transcribe
        if "RD-NAME" in content_word:

            continue

        #replaces censored words with four stars
        if content_word == censor_placeholder or lookup_word in censor_words:

            cleaned_tokens.append("****")

        #replaces reductions with complete, separated words
        elif lookup_word in reduction_dict:

            cleaned_tokens.extend(reduction_dict[lookup_word])

        #looks for strings which contain the :
        # which will only be times in the ASR transcriptions
        #  this is because CORAAL generally transcribes times
        #   into words rather than numbers
        elif ":" in content_word.strip(":"):

            cleaned_tokens.extend(_spell_out_time(content_word))

        #converts a dollar or cent amount from numbers to words
        #  the dollar sign and the number after it are handled together
        #  a dollar sign which is not followed by a number is left alone
        elif (content_word == "$"
              and position < len(tokenized_content)
              and _looks_like_amount(tokenized_content[position])):

            cleaned_tokens.extend(_spell_out_money(tokenized_content[position]))

            #the number has been used up, so it is skipped
            position += 1

        #converts numbers to words
        elif content_word.replace(",", "").isdecimal():

            cleaned_tokens.extend(_spell_out_number(content_word))

        #removes tokens which are only punctuation, including multi-character
        #  ones such as "..." or the quote tokens made by the tokenizer
        elif all(character in punctuation_characters for character in content_word):

            continue

        else:

            cleaned_tokens.append(content_word)

    #strips hyphens from the ends of words and lowercases all letters
    cleaned_tokens = [token.strip("-").lower() for token in cleaned_tokens]

    #joins the tokens, leaving out any that ended up empty
    cleaned_content = " ".join(token for token in cleaned_tokens if token)

    #a time like "5:00 o'clock" would otherwise produce the word twice
    cleaned_content = cleaned_content.replace("o'clock o'clock", "o'clock")

    return cleaned_content

## Executing the Code

In [None]:
# Columns whose text gets cleaned: the original CORAAL content and each ASR
# transcription. The cleaned text for a column is stored right next to it
# in a new "<column name>_cleaned" column
column_names = [
    "Content",
    "amazon_transcription",
    "deepspeech_transcription",
    "google_transcription",
    "IBMWatson_transcription",
    "microsoft_transcription",
]

### Feature: Ain't

In [None]:
# loops through column names, cleans each column and places the cleaned copy
# directly to the right of the column it came from
for column_name in column_names:

    # position of the source column within the dataframe
    col_index = aint_gs_df.columns.get_loc(column_name)

    # cleans every utterance of the column in one pass. cells which are not
    # strings (e.g. missing transcriptions) come back from the cleaner as NaN.
    # building the whole column first avoids creating a float NaN column and
    # then writing strings into it cell by cell
    cleaned_column = aint_gs_df[column_name].map(clean_utterance_content)

    aint_gs_df.insert(col_index + 1, f"{column_name}_cleaned", cleaned_column)

### Feature: Be

In [None]:
# loops through column names, cleans each column and places the cleaned copy
# directly to the right of the column it came from
for column_name in column_names:

    # position of the source column within the dataframe
    col_index = be_gs_df.columns.get_loc(column_name)

    # cleans every utterance of the column in one pass. cells which are not
    # strings (e.g. missing transcriptions) come back from the cleaner as NaN.
    # building the whole column first avoids creating a float NaN column and
    # then writing strings into it cell by cell
    cleaned_column = be_gs_df[column_name].map(clean_utterance_content)

    be_gs_df.insert(col_index + 1, f"{column_name}_cleaned", cleaned_column)

### Feature: Done

In [None]:
# loops through column names, cleans each column and places the cleaned copy
# directly to the right of the column it came from
for column_name in column_names:

    # position of the source column within the dataframe
    col_index = done_gs_df.columns.get_loc(column_name)

    # cleans every utterance of the column in one pass. cells which are not
    # strings (e.g. missing transcriptions) come back from the cleaner as NaN.
    # building the whole column first avoids creating a float NaN column and
    # then writing strings into it cell by cell
    cleaned_column = done_gs_df[column_name].map(clean_utterance_content)

    done_gs_df.insert(col_index + 1, f"{column_name}_cleaned", cleaned_column)

## Sorting the Dataframes by File and Line

This will sort the dataframes first by filename and then by line number. Doing this at each step ensures that the rows stay in a consistent order across the whole pipeline.

### Feature: Ain't

In [None]:
# orders the rows by file first and by line second
aint_gs_df = aint_gs_df.sort_values(["File", "Line"])

### Feature: Be

In [None]:
# orders the rows by file first and by line second
be_gs_df = be_gs_df.sort_values(["File", "Line"])

### Feature: Done

In [None]:
# orders the rows by file first and by line second
done_gs_df = done_gs_df.sort_values(["File", "Line"])

## Exporting Dataframes to CSV Files

This will export the dataframes to CSV files.

In [None]:
# Designate the output path (folder) where the CSVs will be stored.
# NOTE: the file name of each CSV is appended to this value when exporting,
# so a folder path ending with a path separator (e.g. "output/") is the
# safest form
csv_output_path = "path"

### Feature: Ain't

In [None]:
# builds the file path with os.path.join so the CSV lands inside
# csv_output_path whether or not that path ends with a separator
aint_gs_df.to_csv(
    os.path.join(csv_output_path, "aint_variations_cleanedUtterances.csv"),
    index=False,
)

### Feature: Be

In [None]:
# builds the file path with os.path.join so the CSV lands inside
# csv_output_path whether or not that path ends with a separator
be_gs_df.to_csv(
    os.path.join(csv_output_path, "be_cleanedUtterances.csv"),
    index=False,
)

### Feature: Done

In [None]:
# builds the file path with os.path.join so the CSV lands inside
# csv_output_path whether or not that path ends with a separator
done_gs_df.to_csv(
    os.path.join(csv_output_path, "done_cleanedUtterances.csv"),
    index=False,
)