# Step 1.1: Creating Dataframes from Corpora

This code will produce the following pandas dataframes:
<ol>
<li>Dataframes for each corpus that contain transcript lines that contain the search word</li>
<li>Dataframes that provide quantitative information from each corpus</li>
<li>A dataframe that combines quantitative information from all corpora into one dataframe</li>
</ol>

## Required Packages

The following packages are necessary to run this code:
string, os, re, [pandas](https://pypi.org/project/pandas/), [numpy](https://pypi.org/project/numpy/), [nltk](https://pypi.org/project/nltk/)

## Designate Corpora Filepaths

Ensure each path folder contains all the text files for each corpus. Be sure to end file paths with the appropriate slash.

In [None]:
# Root folder that holds every corpus sub-folder.
# The sub-folder names below are appended directly to this string,
# so it has to end with the appropriate slash.
corpora_path = "path"

# Sub-folder of each corpus, relative to corpora_path.
# Each one also ends with a slash because filenames are appended directly to it.
coraal_extension = "CORAAL/"
fisher_extension = "Fisher/"
librispeech_extension = "LibriSpeech/"
switchboard_extension = "SwitchboardHub5/Switchboard/"
hub5_extension = "SwitchboardHub5/hub5_noHeader/"
timit_extension = "TIMIT/"

## Define the Dataframe Creating Function

This function takes the following arguments:

<ol>
<li>The filepath to the set of txt files from a corpus</li>
<li>The word being searched for</li>
<li>The name of the corpus taken from the following names: CORAAL, Switchboard, Hub5, Fisher, LibriSpeech, and TIMIT </li>
</ol>

It will return two pandas dataframes. One will contain all transcript lines in the corpus which contain the search word. The second will contain information about the corpus, including:

<ol>
<li>The total word count</li>
<li>The total number of instances of the search word</li>
<li>The normalized number of search word instances per 100,000 words </li>
<li>The total number of files in the corpus </li>
</ol>

In [None]:
def get_instances_info_dataframes(txt_import_path, search_word_string, corpus_name):
    """
    Builds the search-word instances dataframe and the summary dataframe for one corpus.

    Arguments:
        txt_import_path: folder that holds the corpus transcripts. Files ending in
            .txt, .TXT, or .text are collected from this folder and its sub-folders;
            files starting with "._" are ignored.
        search_word_string: the word to search for. It is matched as a whole word,
            ignoring case, and is treated as literal text (not as a regular expression).
        corpus_name: one of CORAAL, Switchboard, Hub5, Fisher, LibriSpeech, or TIMIT
            (any capitalization).

    Returns two Pandas dataframes:
        (1) instances_df: every transcript line whose Content contains the search word,
            plus an 'InstancesCountPerLine' column.
        (2) info_df: a single column named after corpus_name with the rows
            TotalCorpusWordCount, TotalWordInstancesCount,
            NormalizedWordInstancesCount (instances per 100,000 words), and TotalFileCount.

    Raises ValueError if the corpus name is not one of the six options, if the search
    word is empty, or if no transcript files are found under txt_import_path.

    Use it like this:
        example1, example2 = get_instances_info_dataframes('/dir/ec/tory/', 'search_word', 'corpus_name')
    """

    #####################################################################################################
    ############################### SECTION 1: PRELIMINARY ACTIONS ######################################
    #####################################################################################################

    valid_corpus_names = ('coraal', 'switchboard', 'hub5', 'fisher', 'librispeech', 'timit')

    # the comparison is case-insensitive so every capitalization of a valid name is accepted
    #  ValueError is a subclass of Exception, so code that caught Exception before still works
    if not isinstance(corpus_name, str) or corpus_name.lower() not in valid_corpus_names:

        raise ValueError("""The corpus name you gave is not valid. Please use one of the following: CORAAL, Switchboard, Hub5, Fisher, LibriSpeech, or TIMIT.""")

    if not search_word_string:

        raise ValueError("The search word cannot be empty.")

    import os
    import re
    import string

    import numpy as np
    import pandas as pd
    from nltk import word_tokenize

    # lowercased once here and reused for every corpus-specific decision below
    corpus_key = corpus_name.lower()

    #####################################################################################################
    ############################### SECTION 2: INTERNAL FUNCTIONS #######################################
    #####################################################################################################

    def get_file_stem(transcript_filename):
        """Returns the filename without its extension (works for .txt, .TXT, and .text)."""

        return os.path.splitext(transcript_filename)[0]

    def read_raw_lines(transcript_filepath, transcript_filename, **read_csv_kwargs):
        """Reads a transcript so that each line lands, unsplit, in a single 'all_info' column."""

        return pd.read_csv(f"{transcript_filepath}{transcript_filename}",
                           sep="\t", header=None, names=['all_info'], **read_csv_kwargs)

    def split_leading_fields(raw_df, field_names):
        """
        Splits each raw line on its first len(field_names) - 1 spaces.
        Everything after the last split stays together as the final field.
        """

        fields = raw_df['all_info'].str.split(' ', n=len(field_names) - 1, expand=True)

        return fields.rename(columns=dict(enumerate(field_names)))

    def build_timed_speaker_df(raw_df, transcript_filename):
        """
        Parses lines laid out as: start time, end time, speaker id, content.
        This layout is shared by the Hub5 and Fisher transcripts.
        """

        timed_df = split_leading_fields(
            raw_df, ['UttStartTime', 'UttEndTime', 'Speaker', 'Content'])

        # converts the start and end times from a string format to a float format
        #  python can't perform mathematical functions on strings
        timed_df['UttStartTime'] = timed_df['UttStartTime'].astype(float)
        timed_df['UttEndTime'] = timed_df['UttEndTime'].astype(float)

        # creates a 'File' column
        timed_df['File'] = get_file_stem(transcript_filename)

        # creates a 'Line column'
        # NOTE: These transcripts DO NOT have line numbers. The numbers added here are 1-based Pandas row numbers
        timed_df['Line'] = np.arange(1, len(timed_df) + 1)

        # removes the colon from the speaker id
        timed_df['Speaker'] = timed_df['Speaker'].str.strip(string.punctuation)

        # creates a column with the utterance audio length in seconds
        #  this will be used later to calculate speech rate
        timed_df['UttLength'] = timed_df['UttEndTime'] - timed_df['UttStartTime']

        # keeps only the columns shared with the other corpora dataframes
        return timed_df[['File', 'Line', 'Speaker', 'UttStartTime',
                         'UttEndTime', 'UttLength', 'Content']]

    def create_coraal_df(transcript_filepath, transcript_filename):
        """Takes a txt file transcript from the CORAAL corpus and creates a cleaned Pandas dataframe."""

        # reads in csv and creates Pandas dataframe (CORAAL files are tab separated with a header row)
        coraal_df = pd.read_csv(
            f"{transcript_filepath}{transcript_filename}", sep="\t")

        # creates a 'File' column
        coraal_df['File'] = get_file_stem(transcript_filename)

        # renames column names for consistency with other corpora dataframes
        coraal_df = coraal_df.rename(
            columns={"Spkr": "Speaker", "StTime": "UttStartTime", "EnTime": "UttEndTime"})

        # creates a column with the utterance audio length in seconds
        #  this will be used later to calculate speech rate
        coraal_df['UttLength'] = coraal_df['UttEndTime'] - coraal_df['UttStartTime']

        # keeps only the columns shared with the other corpora dataframes
        coraal_df = coraal_df[['File', 'Line', 'Speaker', 'UttStartTime',
                               'UttEndTime', 'UttLength', 'Content']]

        # removes all metalinguistic and paralinguistic information except for pauses
        coraal_df = coraal_df.replace(
            to_replace=r'\/(\?+|[Uu]nintelligible)\/|\/|\[|\]|\<.*?\>|\((?!pause).*?\)', value='', regex=True)

        # this will remove all White interviewers from the dataframe
        white_interviewer_ids = ['DCA_int_01', 'DCA_int_02', 'DCA_int_04', 'DCA_int_07',
                                 'DCA_int_08', 'LES_int_01', 'PRV_int_01', 'PRV_int_02']

        return coraal_df[~coraal_df['Speaker'].isin(white_interviewer_ids)]

    def create_switchboard_df(transcript_filepath, transcript_filename):
        """Takes a txt file transcript from the Switchboard corpus and creates a cleaned Pandas dataframe."""

        # splits each line on its first three spaces: file-line id, start time, end time, content
        swb_df = split_leading_fields(
            read_raw_lines(transcript_filepath, transcript_filename),
            ['File-Line', 'UttStartTime', 'UttEndTime', 'Content'])

        # converts the start and end times from a string format to a float format
        #  python can't perform mathematical functions on strings
        swb_df['UttStartTime'] = swb_df['UttStartTime'].astype(float)
        swb_df['UttEndTime'] = swb_df['UttEndTime'].astype(float)

        # splits the file-line combination that is built into Switchboard transcript lines
        # creates a 'File' column
        swb_df['File'] = swb_df['File-Line'].str.slice(0, 14)

        # creates a 'Line column'
        swb_df['Line'] = swb_df['File-Line'].str.slice(-4,)

        # gets the speaker id from the filename
        swb_df['Speaker'] = transcript_filename[6]

        # creates a column with the utterance audio length in seconds
        #  this will be used later to calculate speech rate
        swb_df['UttLength'] = swb_df['UttEndTime'] - swb_df['UttStartTime']

        # keeps only the columns shared with the other corpora dataframes
        swb_df = swb_df[['File', 'Line', 'Speaker', 'UttStartTime',
                         'UttEndTime', 'UttLength', 'Content']]

        # removes all metalinguistic and paralinguistic information, except for silence markers since those compose half the lines in Switchboard
        #  Switchboard is a phone conversation corpus and they split the transcript into each caller's audio instead of combining each side
        #  that means, when the speaker in the transcript is not talking, the line will just be silence
        return swb_df.replace(to_replace=r'\[(?!silence).*?\]|\<.*?\>', value='', regex=True)

    def create_hub5_df(transcript_filepath, transcript_filename):
        """Takes a txt file transcript from the Hub5 corpus and creates a cleaned Pandas dataframe."""

        hub5_df = build_timed_speaker_df(
            read_raw_lines(transcript_filepath, transcript_filename), transcript_filename)

        # removes all metalinguistic and paralinguistic information
        #  this takes two passes: the first removes the compound markers, the second removes what is left
        hub5_df = hub5_df.replace(
            to_replace=r'\<.*?\[.*?\]\[.*?\].*?\>|\{.*?\}|\%|\&|\*{2}.*?\*{2}|\[{2}.*?\]{2}|\({2}|\){2}', value='', regex=True)

        return hub5_df.replace(to_replace=r'\<.*?\>|\[.*?\]', value='', regex=True)

    def create_fisher_df(transcript_filepath, transcript_filename):
        """Takes a txt file transcript from the Fisher corpus and creates a cleaned Pandas dataframe."""

        # the first two lines of a Fisher transcript are skipped when reading
        fisher_df = build_timed_speaker_df(
            read_raw_lines(transcript_filepath, transcript_filename, skiprows=2), transcript_filename)

        # removes all metalinguistic and paralinguistic information
        return fisher_df.replace(
            to_replace=r'\_|\({2}|\){2}|\s{2}|\[.*?\]', value='', regex=True)

    def create_librispeech_df(transcript_filepath, transcript_filename):
        """Takes a txt file transcript from the LibriSpeech corpus and creates a cleaned Pandas dataframe."""

        # splits each line on its first space: file-line id, content
        librispeech_df = split_leading_fields(
            read_raw_lines(transcript_filepath, transcript_filename),
            ['File-Line', 'Content'])

        # splits the file-line combination that is built into LibriSpeech transcript lines
        # creates a 'File' column
        librispeech_df['File'] = librispeech_df['File-Line'].str.slice(0, 16)

        # creates a 'Line column'
        librispeech_df['Line'] = librispeech_df['File-Line'].str.slice(-4,)

        # keeps only the needed columns
        #  .copy() makes this an independent dataframe so the assignment below is safe
        librispeech_df = librispeech_df[['File', 'Line', 'Content']].copy()

        # lowercases all letters in Content. LibriSpeech transcripts are all uppercase with no punctuation
        librispeech_df['Content'] = librispeech_df['Content'].str.lower()

        return librispeech_df

    def create_timit_df(transcript_filepath, transcript_filename):
        """Takes a txt file transcript from the TIMIT corpus and creates a cleaned Pandas dataframe."""

        # splits each line on its first two spaces: beginning sample number, end sample number, content
        timit_df = split_leading_fields(
            read_raw_lines(transcript_filepath, transcript_filename),
            ['BeginningIntegerSampleNumber', 'EndIntegerSampleNumber', 'Content'])

        # creates a 'File' column
        timit_df['File'] = get_file_stem(transcript_filename)

        # keeps only the needed columns (TIMIT dataframes have no 'Line' column)
        return timit_df[['File', 'BeginningIntegerSampleNumber',
                         'EndIntegerSampleNumber', 'Content']]

    def filter_df_by_word(dataframe, search_word_string):
        """
        Takes a word and filters a Pandas dataframe and leaves only rows
        that contain that search word in its Content. Adds an 'InstancesCountPerLine'
        column with the number of times the word occurs in each remaining row.
        """

        # transforms the search word into a regular expression that will search for only whole words
        #  if the sequence of characters submitted is contained within a larger word, it will not match
        #  re.escape makes sure characters in the search word are never read as regex syntax
        search_word_regex = rf"\b{re.escape(search_word_string)}\b"

        # counts the instances in every row, ignoring case
        #  the same count is used for filtering and for the new column so the two always agree
        instance_counts = dataframe['Content'].str.count(search_word_regex, flags=re.IGNORECASE)

        # rows with missing Content get no count, so they are treated as having zero instances
        has_instance = instance_counts.fillna(0) > 0

        # .copy() makes this an independent dataframe so adding a column is safe
        word_df = dataframe[has_instance].copy()

        # adds a column to the dataframe that has the number of instances per row
        word_df["InstancesCountPerLine"] = instance_counts[has_instance].astype('int64')

        return word_df

    def count_words(content):
        """
        Counts the words in one Content string using nltk's word tokenizer.
        Important to note that the tokenizer divides contracted words into two.
        Tokens made up only of punctuation marks are not counted.
        """

        # stripping punctuation from both sides leaves an empty string for punctuation-only tokens,
        #  including multi-character ones such as `` or ... that nltk separates out
        return sum(1 for token in word_tokenize(content) if token.strip(string.punctuation))

    #####################################################################################################
    ############################### SECTION 3: EXECUTION OF CODE ########################################
    #####################################################################################################

    transcript_extensions = (".txt", ".TXT", ".text")

    # gathers the folder and filename for every transcript file
    #  the folder is kept so that files found in sub-folders are read from where they actually are
    transcript_files = []

    for root, dirs, files in os.walk(txt_import_path):

        # sorting makes the processing order the same on every machine
        dirs.sort()

        for file in sorted(files):

            # this is because the 2021 CORAAL files have duplicate versions
            #  of files that start with ._ and contain nothing
            if file.startswith("._"):
                continue

            if file.endswith(transcript_extensions):
                # os.path.join(root, "") guarantees the folder ends with a slash
                transcript_files.append((os.path.join(root, ""), file))

    if not transcript_files:

        raise ValueError(
            f"No .txt, .TXT, or .text transcript files were found under '{txt_import_path}'.")

    # picks the dataframe-creating function that matches the corpus_name given by the user
    corpus_parsers = {'coraal': create_coraal_df,
                      'switchboard': create_switchboard_df,
                      'hub5': create_hub5_df,
                      'fisher': create_fisher_df,
                      'librispeech': create_librispeech_df,
                      'timit': create_timit_df}

    create_corpus_df = corpus_parsers[corpus_key]

    # CORAAL rows composed only of pauses and Switchboard rows composed only of silence
    #  are not counted as words; the other corpora have no such rows to skip
    skipped_line_prefixes = {'coraal': "(pause", 'switchboard': "[silence"}

    skip_prefix = skipped_line_prefixes.get(corpus_key)

    # creates an empty list for each file's word totals to be appended to
    total_word_count_list = []

    # creates an empty list for each dataframe of lines containing instances of the search word
    #  to be appended to
    instances_list = []

    for transcript_folder, transcript_filename in transcript_files:

        file_df = create_corpus_df(transcript_folder, transcript_filename)

        file_word_count = 0

        # calculates the total number of words in each line's Content
        for content in file_df['Content']:

            # rows with missing Content have no words to count
            if not isinstance(content, str):
                continue

            if skip_prefix is not None and content.startswith(skip_prefix):
                continue

            file_word_count += count_words(content)

        # appends the file's total word count
        total_word_count_list.append(file_word_count)

        # appends the dataframe of instances found within the file
        instances_list.append(filter_df_by_word(file_df, search_word_string))

    # calculates the total word count for the corpus by summing the word count totals for every file
    total_word_count = sum(total_word_count_list)

    # calculates the total number of files by the length of the transcript files list
    total_file_count = len(transcript_files)

    # concatenates the list of instance dataframes for each file into one dataframe
    instances_df = pd.concat(instances_list).reset_index(drop=True)

    # calculates the total number of instances of the search word in the corpus
    #  this may be a different number than the row count in the instances_df
    #  that is because there may be more than one instance of the search word
    #  in a given Content line
    total_instances_count = instances_df["InstancesCountPerLine"].sum()

    # calculates the normalized amount of instances of the search word per 100,000
    #  the formula is total number of instances / total word count in the corpus * 100,000
    #  a corpus with no countable words has no meaningful rate, so it is reported as NaN
    if total_word_count:
        normalized_instances_count = total_instances_count / total_word_count * 100000
    else:
        normalized_instances_count = float("nan")

    # creates a dataframe that provides the corpus' total word count, total number of instances of the search word,
    #   the normalized number of instances per 100,000, and the total file count
    info_df = pd.DataFrame({f'{corpus_name}': [total_word_count,
                                               total_instances_count,
                                               normalized_instances_count,
                                               total_file_count]}, index=['TotalCorpusWordCount',
                                                                          'TotalWordInstancesCount',
                                                                          'NormalizedWordInstancesCount',
                                                                          'TotalFileCount'])

    # rounds every figure to two decimal places so the output is easy to read
    info_df = info_df.round(2)

    # returns (1) the instances dataframe which will contain full information for lines that contain the search word
    #       (2) the informational dataframe created one step before this
    return instances_df, info_df

## Creating Instances and Info Dataframes

This will create instances and info dataframes for each corpus. Because the function returns two variables, two variables must be assigned.

### Feature: Ain't

In [None]:
# Feature "ain't": builds the (instances, info) dataframe pair for every corpus.
# Each call reads the corpus folder named by corpora_path plus that corpus' extension.
aint_coraal_instances_df, aint_coraal_info_df = get_instances_info_dataframes(
    f"{corpora_path}{coraal_extension}", "ain\'t", "CORAAL")

aint_fisher_instances_df, aint_fisher_info_df = get_instances_info_dataframes(
    f"{corpora_path}{fisher_extension}", "ain\'t", "Fisher")

aint_librispeech_instances_df, aint_librispeech_info_df = get_instances_info_dataframes(
    f"{corpora_path}{librispeech_extension}", "ain\'t", "LibriSpeech")

aint_switchboard_instances_df, aint_switchboard_info_df = get_instances_info_dataframes(
    f"{corpora_path}{switchboard_extension}", "ain\'t", "Switchboard")

aint_hub5_instances_df, aint_hub5_info_df = get_instances_info_dataframes(
    f"{corpora_path}{hub5_extension}", "ain\'t", "Hub5")

aint_timit_instances_df, aint_timit_info_df = get_instances_info_dataframes(
    f"{corpora_path}{timit_extension}", "ain\'t", "TIMIT")

### Feature: Be

In [None]:
# Feature "be": builds the (instances, info) dataframe pair for every corpus.
# Each call reads the corpus folder named by corpora_path plus that corpus' extension.
be_coraal_instances_df, be_coraal_info_df = get_instances_info_dataframes(
    f"{corpora_path}{coraal_extension}", "be", "CORAAL")

be_fisher_instances_df, be_fisher_info_df = get_instances_info_dataframes(
    f"{corpora_path}{fisher_extension}", "be", "Fisher")

be_librispeech_instances_df, be_librispeech_info_df = get_instances_info_dataframes(
    f"{corpora_path}{librispeech_extension}", "be", "LibriSpeech")

be_switchboard_instances_df, be_switchboard_info_df = get_instances_info_dataframes(
    f"{corpora_path}{switchboard_extension}", "be", "Switchboard")

be_hub5_instances_df, be_hub5_info_df = get_instances_info_dataframes(
    f"{corpora_path}{hub5_extension}", "be", "Hub5")

be_timit_instances_df, be_timit_info_df = get_instances_info_dataframes(
    f"{corpora_path}{timit_extension}", "be", "TIMIT")

### Feature: Done

In [None]:
# Feature "done": builds the (instances, info) dataframe pair for every corpus.
# Each call reads the corpus folder named by corpora_path plus that corpus' extension.
done_coraal_instances_df, done_coraal_info_df = get_instances_info_dataframes(
    f"{corpora_path}{coraal_extension}", "done", "CORAAL")

done_fisher_instances_df, done_fisher_info_df = get_instances_info_dataframes(
    f"{corpora_path}{fisher_extension}", "done", "Fisher")

done_librispeech_instances_df, done_librispeech_info_df = get_instances_info_dataframes(
    f"{corpora_path}{librispeech_extension}", "done", "LibriSpeech")

done_switchboard_instances_df, done_switchboard_info_df = get_instances_info_dataframes(
    f"{corpora_path}{switchboard_extension}", "done", "Switchboard")

done_hub5_instances_df, done_hub5_info_df = get_instances_info_dataframes(
    f"{corpora_path}{hub5_extension}", "done", "Hub5")

done_timit_instances_df, done_timit_info_df = get_instances_info_dataframes(
    f"{corpora_path}{timit_extension}", "done", "TIMIT")

## Combining Switchboard and Hub5 Dataframes

The Switchboard and Hub5 corpora are often used together in ASR development and evaluation. I analyze them together, so the following code will create combined instances and info dataframes for Switchboard and Hub5. If you would like to analyze them separately, simply use the variables created for each in the previous step.

### Feature: Ain't

In [None]:
import pandas as pd

# stacks the Switchboard and Hub5 instance rows into one dataframe
aint_switchboardHub5_instances_df = pd.concat(
    [aint_switchboard_instances_df, aint_hub5_instances_df]
)

# lines the two info dataframes up side by side
#  each one has a column the other lacks, so fill_value=0 keeps both columns' figures intact
aint_switchboardHub5_info_df = aint_switchboard_info_df.add(
    aint_hub5_info_df, fill_value=0
)

# adds a third column holding the row-wise total of the two corpora
aint_switchboardHub5_info_df["SwitchboardHub5"] = aint_switchboardHub5_info_df.sum(
    axis="columns"
)

# the row-wise total above also summed the two normalized counts, which is not a valid rate
#  so the normalized count (row 2) of the combined column (column 2) is recomputed as
#  combined instances (row 1) / combined word count (row 0) * 100,000
aint_switchboardHub5_info_df.iloc[2, 2] = round(
    aint_switchboardHub5_info_df.iloc[1, 2]
    / aint_switchboardHub5_info_df.iloc[0, 2]
    * 100000,
    2,
)

# keeps only the combined column
aint_switchboardHub5_info_df = aint_switchboardHub5_info_df.drop(
    columns=['Hub5', 'Switchboard']
)

### Feature: Be

In [None]:
# stacks the Switchboard and Hub5 instance rows into one dataframe
be_switchboardHub5_instances_df = pd.concat(
    [be_switchboard_instances_df, be_hub5_instances_df]
)

# lines the two info dataframes up side by side
#  each one has a column the other lacks, so fill_value=0 keeps both columns' figures intact
be_switchboardHub5_info_df = be_switchboard_info_df.add(
    be_hub5_info_df, fill_value=0
)

# adds a third column holding the row-wise total of the two corpora
be_switchboardHub5_info_df["SwitchboardHub5"] = be_switchboardHub5_info_df.sum(
    axis="columns"
)

# the row-wise total above also summed the two normalized counts, which is not a valid rate
#  so the normalized count (row 2) of the combined column (column 2) is recomputed as
#  combined instances (row 1) / combined word count (row 0) * 100,000
be_switchboardHub5_info_df.iloc[2, 2] = round(
    be_switchboardHub5_info_df.iloc[1, 2]
    / be_switchboardHub5_info_df.iloc[0, 2]
    * 100000,
    2,
)

# keeps only the combined column
be_switchboardHub5_info_df = be_switchboardHub5_info_df.drop(
    columns=['Hub5', 'Switchboard']
)

### Feature: Done

In [None]:
# stacks the Switchboard and Hub5 instance rows into one dataframe
done_switchboardHub5_instances_df = pd.concat(
    [done_switchboard_instances_df, done_hub5_instances_df]
)

# lines the two info dataframes up side by side
#  each one has a column the other lacks, so fill_value=0 keeps both columns' figures intact
done_switchboardHub5_info_df = done_switchboard_info_df.add(
    done_hub5_info_df, fill_value=0
)

# adds a third column holding the row-wise total of the two corpora
done_switchboardHub5_info_df["SwitchboardHub5"] = done_switchboardHub5_info_df.sum(
    axis="columns"
)

# the row-wise total above also summed the two normalized counts, which is not a valid rate
#  so the normalized count (row 2) of the combined column (column 2) is recomputed as
#  combined instances (row 1) / combined word count (row 0) * 100,000
done_switchboardHub5_info_df.iloc[2, 2] = round(
    done_switchboardHub5_info_df.iloc[1, 2]
    / done_switchboardHub5_info_df.iloc[0, 2]
    * 100000,
    2,
)

# keeps only the combined column
done_switchboardHub5_info_df = done_switchboardHub5_info_df.drop(
    columns=['Hub5', 'Switchboard']
)

## Sorting the Dataframes by File and Line

This will sort each dataframe first by filename and then by line number (the TIMIT dataframes are sorted by filename only). Doing this at each step will ensure consistency across the board.

### Feature: Ain't

In [None]:
# Re-order each "ain't" instances dataframe by file name, then by line number
# within the file. Every name is rebound to the sorted copy.
(
    aint_coraal_instances_df,
    aint_fisher_instances_df,
    aint_librispeech_instances_df,
    aint_switchboardHub5_instances_df,
) = (
    unsorted_df.sort_values(by=['File', 'Line'])
    for unsorted_df in (
        aint_coraal_instances_df,
        aint_fisher_instances_df,
        aint_librispeech_instances_df,
        aint_switchboardHub5_instances_df,
    )
)

# TIMIT is ordered by file name only.
aint_timit_instances_df = aint_timit_instances_df.sort_values(by=['File'])

### Feature: Be

In [None]:
# Re-order each "be" instances dataframe by file name, then by line number
# within the file. Every name is rebound to the sorted copy.
(
    be_coraal_instances_df,
    be_fisher_instances_df,
    be_librispeech_instances_df,
    be_switchboardHub5_instances_df,
) = (
    unsorted_df.sort_values(by=['File', 'Line'])
    for unsorted_df in (
        be_coraal_instances_df,
        be_fisher_instances_df,
        be_librispeech_instances_df,
        be_switchboardHub5_instances_df,
    )
)

# TIMIT is ordered by file name only.
be_timit_instances_df = be_timit_instances_df.sort_values(by=['File'])

### Feature: Done

In [None]:
# Re-order each "done" instances dataframe by file name, then by line number
# within the file. Every name is rebound to the sorted copy.
(
    done_coraal_instances_df,
    done_fisher_instances_df,
    done_librispeech_instances_df,
    done_switchboardHub5_instances_df,
) = (
    unsorted_df.sort_values(by=['File', 'Line'])
    for unsorted_df in (
        done_coraal_instances_df,
        done_fisher_instances_df,
        done_librispeech_instances_df,
        done_switchboardHub5_instances_df,
    )
)

# TIMIT is ordered by file name only.
done_timit_instances_df = done_timit_instances_df.sort_values(by=['File'])

## Creating a Summary Dataframe of All the Quantitative Information from Each Corpus

This will combine the quantitative information from each corpus into one dataframe.

### Feature: Ain't

In [None]:
#Creates a summary info dataframe where SWITCHBOARD AND HUB5 ARE COMBINED
# Summary info dataframe where SWITCHBOARD AND HUB5 ARE COMBINED.
# Starts from the CORAAL info dataframe and joins the others on, one at a time.
aint_all_corpora_info_df = (
    aint_coraal_info_df
    .join(aint_fisher_info_df)
    .join(aint_librispeech_info_df)
    .join(aint_switchboardHub5_info_df)
    .join(aint_timit_info_df)
)

# Alternative: summary info dataframe where SWITCHBOARD AND HUB5 ARE SEPARATE.
# Uncomment the line below to use it instead of the combined version above.
#aint_all_corpora_info_df = aint_coraal_info_df.join(aint_fisher_info_df).join(aint_librispeech_info_df).join(aint_switchboard_info_df).join(aint_hub5_info_df).join(aint_timit_info_df)

### Feature: Be

In [None]:
#Creates a summary info dataframe where SWITCHBOARD AND HUB5 ARE COMBINED
# Summary info dataframe where SWITCHBOARD AND HUB5 ARE COMBINED.
# Starts from the CORAAL info dataframe and joins the others on, one at a time.
be_all_corpora_info_df = (
    be_coraal_info_df
    .join(be_fisher_info_df)
    .join(be_librispeech_info_df)
    .join(be_switchboardHub5_info_df)
    .join(be_timit_info_df)
)

# Alternative: summary info dataframe where SWITCHBOARD AND HUB5 ARE SEPARATE.
# Uncomment the line below to use it instead of the combined version above.
#be_all_corpora_info_df = be_coraal_info_df.join(be_fisher_info_df).join(be_librispeech_info_df).join(be_switchboard_info_df).join(be_hub5_info_df).join(be_timit_info_df)

### Feature: Done

In [None]:
#Creates a summary info dataframe where SWITCHBOARD AND HUB5 ARE COMBINED
# Summary info dataframe where SWITCHBOARD AND HUB5 ARE COMBINED.
# Starts from the CORAAL info dataframe and joins the others on, one at a time.
done_all_corpora_info_df = (
    done_coraal_info_df
    .join(done_fisher_info_df)
    .join(done_librispeech_info_df)
    .join(done_switchboardHub5_info_df)
    .join(done_timit_info_df)
)

# Alternative: summary info dataframe where SWITCHBOARD AND HUB5 ARE SEPARATE.
# Uncomment the line below to use it instead of the combined version above.
#done_all_corpora_info_df = done_coraal_info_df.join(done_fisher_info_df).join(done_librispeech_info_df).join(done_switchboard_info_df).join(done_hub5_info_df).join(done_timit_info_df)

## Exporting Dataframes to CSV Files

This will export the dataframes to CSV files.

In [None]:
# Set the output filepath.
# The export cells below build every CSV path by placing the file name directly
# after this string (f"{output_filepath}<name>.csv"), so the value must end with
# the appropriate slash, e.g. "results/".
output_filepath = "path"

### Feature: Ain't

In [None]:
# Per-corpus "ain't" instances, each paired with its destination CSV.
# These are written without the dataframe index.
for _instances_df, _csv_path in (
    (aint_coraal_instances_df, f"{output_filepath}aint_coraal_instances.csv"),
    (aint_fisher_instances_df, f"{output_filepath}aint_fisher_instances.csv"),
    (aint_librispeech_instances_df, f"{output_filepath}aint_librispeech_instances.csv"),
    (aint_switchboardHub5_instances_df, f"{output_filepath}aint_switchboardHub5_instances.csv"),
    (aint_timit_instances_df, f"{output_filepath}aint_timit_instances.csv"),
):
    _instances_df.to_csv(_csv_path, index=False)

# The summary table is written with its index (index is not suppressed here).
aint_all_corpora_info_df.to_csv(f"{output_filepath}aint_all_corpora_info.csv")

### Feature: Be

In [None]:
# Per-corpus "be" instances, each paired with its destination CSV.
# These are written without the dataframe index.
for _instances_df, _csv_path in (
    (be_coraal_instances_df, f"{output_filepath}be_coraal_instances.csv"),
    (be_fisher_instances_df, f"{output_filepath}be_fisher_instances.csv"),
    (be_librispeech_instances_df, f"{output_filepath}be_librispeech_instances.csv"),
    (be_switchboardHub5_instances_df, f"{output_filepath}be_switchboardHub5_instances.csv"),
    (be_timit_instances_df, f"{output_filepath}be_timit_instances.csv"),
):
    _instances_df.to_csv(_csv_path, index=False)

# The summary table is written with its index (index is not suppressed here).
be_all_corpora_info_df.to_csv(f"{output_filepath}be_all_corpora_info.csv")

### Feature: Done

In [None]:
# Per-corpus "done" instances, each paired with its destination CSV.
# These are written without the dataframe index.
for _instances_df, _csv_path in (
    (done_coraal_instances_df, f"{output_filepath}done_coraal_instances.csv"),
    (done_fisher_instances_df, f"{output_filepath}done_fisher_instances.csv"),
    (done_librispeech_instances_df, f"{output_filepath}done_librispeech_instances.csv"),
    (done_switchboardHub5_instances_df, f"{output_filepath}done_switchboardHub5_instances.csv"),
    (done_timit_instances_df, f"{output_filepath}done_timit_instances.csv"),
):
    _instances_df.to_csv(_csv_path, index=False)

# The summary table is written with its index (index is not suppressed here).
done_all_corpora_info_df.to_csv(f"{output_filepath}done_all_corpora_info.csv")