# Step 1.8: Getting n-gram Dataframes

This code will produce CSV files which have n-grams (of up to 3 words) to the left and right of the feature. It will create four sets of columns:

<ol>
<li>A column of pre-cleaned lists of the trigrams</li>
<li>A column of pre-cleaned tuples of the trigrams</li>
<li>A column of cleaned lists of the trigrams</li>
<li>A column of cleaned tuples of the trigrams</li>
</ol>

Tuples will be included because they are easier to examine as a unit.

## Required Packages

The following packages are necessary to run this code:
string, os, re, [pandas](https://pypi.org/project/pandas/), [numpy](https://pypi.org/project/numpy/)

## Define the Dataframe Creating Function

This function takes the following arguments:

<ol>
<li>The filepath to the split content CSV produced in Step 1.5</li>
<li>The filepath to the folder where the newly created CSVs will be stored</li>
<li>The word being searched for</li>
</ol>

In [None]:
def _clean_token(token, translation_table):
    """Return *token* lowercased, stripped of surrounding whitespace and with
    every character named in *translation_table* removed.

    ``translation_table`` is a deletion table built with ``str.maketrans``.
    """
    return token.lower().strip().translate(translation_table)


def _occurrence_counts(values):
    """Return, for each item of *values*, how many times it occurs in *values*.

    Missing tokens (NaN) are counted together as one group. Counts are returned
    as floats so the exported count columns keep a float dtype.
    """
    from collections import Counter

    # NaN never compares equal to itself, so all missing tokens are folded
    # onto one private key before tallying
    missing = object()
    keys = [missing if isinstance(value, float) and value != value else value
            for value in values]

    # a single pass over the column instead of one full scan per row
    tally = Counter(keys)
    return [float(tally[key]) for key in keys]


def create_trigram_df(csv_input_path, csv_output_path, search_word_string):

    """Creates a csv file with trigrams on either side of the feature.

    One CSV is written per corpus found in ``csv_input_path``. Each CSV holds
    the identifying columns of the split content CSV followed by:

    * the raw and cleaned lists/tuples of the tokens to the left (L3-L1) and
      to the right (R1-R3) of the feature, with the raw and normalized
      frequency of every cleaned tuple;
    * for each single position (L3, L2, L1, R1, R2, R3) the raw token, the
      cleaned token and the raw and normalized frequency of the cleaned token.

    Normalized counts are the raw count divided by the corpus' total word
    count, multiplied by 100,000.

    Parameters
    ----------
    csv_input_path : str
        Folder holding the split content CSVs and the ``all_corpora_info``
        CSV. A trailing path separator is accepted but not required.
    csv_output_path : str
        Folder where the trigram CSVs are written.
    search_word_string : str
        The word being searched for. ``"ain't"`` is mapped to the file name
        prefix ``aint``; any other word is used as the prefix unchanged.

    Raises
    ------
    FileNotFoundError
        If no ``all_corpora_info`` CSV with the expected prefix is found.
    ValueError
        If a split content file name has no ``_<corpus>_`` segment.
    """

    import os
    import re
    import pandas as pd
    from string import punctuation

    # the apostrophe is not used in file names, so "ain't" files start with "aint"
    file_prefix = "aint" if search_word_string == "ain't" else search_word_string

    input_filenames = os.listdir(csv_input_path)

    # the split content .csv's, which should be stored together in the same
    # folder (csv_input_path); the info csv is handled separately below
    csv_filenames = [filename for filename in input_filenames
                     if filename.endswith(".csv")
                     and filename.startswith(file_prefix)
                     and "info" not in filename]

    # the all_corpora_info csv that holds the total word count of each corpus
    info_filenames = [filename for filename in input_filenames
                      if "all_corpora_info" in filename
                      and filename.startswith(file_prefix)]
    if not info_filenames:
        raise FileNotFoundError(
            f"No '{file_prefix}' all_corpora_info csv found in {csv_input_path}")
    all_corpora_info_csv_path = os.path.join(csv_input_path, info_filenames[0])

    # pairs every split content csv path with its corpus name, which is the
    # text between the first two underscores of the file name
    filePath_corpusName = []
    for filename in csv_filenames:
        corpus_match = re.search(r"_(.*?)_", filename)
        if corpus_match is None:
            raise ValueError(
                f"Cannot read a corpus name from the file name '{filename}'")
        filePath_corpusName.append((os.path.join(csv_input_path, filename),
                                    corpus_match.group(1).lower()))

    ####################

    # creates a dataframe from the all_corpora_info csv
    all_corpora_info_df = pd.read_csv(all_corpora_info_csv_path, index_col=0)

    # lowercases the column names of this dataframe, which are corpus names,
    #  so they match the lowercased corpus names taken from the file names
    all_corpora_info_df.columns = [column.lower()
                                   for column in all_corpora_info_df.columns]

    ####################

    # deletion table for the trigram tokens: removes all punctuation except
    #  the apostrophe and the dash. The dash is kept because a dash marks a
    #  word that was cut off in the transcript, which can be important in the
    #  eventual data analysis
    trigram_table = str.maketrans(
        "", "", punctuation.replace("'", "").replace("-", ""))

    # deletion table for the single-position tokens: removes all punctuation
    #  (including the dash) except the apostrophe
    token_table = str.maketrans("", "", punctuation.replace("'", ""))

    base_columns = ['File', 'Line', 'InstancesCountPerLine',
                    'FeatureCountPerLine', 'Content']

    trigram_windows = (("L3_L1", ['L3', 'L2', 'L1']),
                       ("R1_R3", ['R1', 'R2', 'R3']))

    # loops through the filepath and corpus name tuples list
    for file_path, corpus_name in filePath_corpusName:

        # the corpus' total word count, used to normalize the raw counts
        corpus_word_count_total = all_corpora_info_df.loc[
            'TotalCorpusWordCount', corpus_name]

        # reads in the split content csv
        patterns_df = pd.read_csv(file_path)

        # keeps only the needed columns; the copy makes the new dataframe
        # independent of patterns_df so columns can be added to it safely
        trigram_df = patterns_df[base_columns].copy()

        # if the split content csv has no instances of the feature, an empty
        # csv with only the header row is exported
        if trigram_df.empty:
            pd.DataFrame(columns=base_columns).to_csv(
                os.path.join(csv_output_path,
                             f"{file_prefix}_{corpus_name}_trigrams.csv"),
                index=False)
            continue

        #################

        # left (L3 to L1) and right (R1 to R3) trigrams of the feature
        for label, positions in trigram_windows:

            # one list of raw tokens per row, plus the same tokens as a tuple
            raw_lists = patterns_df[positions].values.tolist()
            trigram_df[f"{label}_List"] = raw_lists
            trigram_df[f"{label}_Tuple"] = [tuple(tokens) for tokens in raw_lists]

            # cleans the string tokens and skips the missing ones (NaN), so a
            # cleaned trigram can hold fewer than three tokens
            cleaned_lists = [[_clean_token(token, trigram_table)
                              for token in tokens if isinstance(token, str)]
                             for tokens in raw_lists]
            cleaned_tuples = [tuple(tokens) for tokens in cleaned_lists]
            trigram_df[f"Cleaned{label}_List"] = cleaned_lists
            trigram_df[f"Cleaned{label}_Tuple"] = cleaned_tuples

            # number of occurrences of each cleaned tuple in the corpus
            trigram_df[f"Count_Cleaned{label}_Tuple"] = _occurrence_counts(
                cleaned_tuples)

            # normalized count of the tuple (number of instances divided by
            #  total word count in the corpus multiplied by 100,000)
            trigram_df[f"NormCount_Cleaned{label}_Tuple"] = (
                trigram_df[f"Count_Cleaned{label}_Tuple"]
                / corpus_word_count_total * 100000)

        #################

        # the single positions around the feature
        for header in ['L3', 'L2', 'L1', 'R1', 'R2', 'R3']:

            # adds a column with the raw token of that position
            trigram_df[header] = patterns_df[header]

            # adds a column with the cleaned token; missing tokens stay as is
            cleaned_tokens = patterns_df[header].apply(
                lambda token: _clean_token(token, token_table)
                if isinstance(token, str) else token)
            trigram_df[f"Cleaned_{header}"] = cleaned_tokens

            # number of occurrences of each cleaned token in that position
            trigram_df[f"{header}_Count"] = _occurrence_counts(
                cleaned_tokens.tolist())

            # normalized count of the token (number of instances divided by
            #  total word count in the corpus multiplied by 100,000)
            trigram_df[f"{header}_Norm_Count"] = (
                trigram_df[f"{header}_Count"]
                / corpus_word_count_total * 100000)

        # NOTE(review): for every search word other than "ain't" the search
        #  word appears twice in the file name; kept as is because later
        #  steps may look the files up by this exact name - confirm
        if search_word_string == "ain't":
            output_filename = f"aint_{corpus_name}_trigrams.csv"
        else:
            output_filename = (f"{search_word_string}_{corpus_name}_"
                               f"{search_word_string}_trigrams.csv")

        # exports dataframe to csv
        trigram_df.to_csv(os.path.join(csv_output_path, output_filename),
                          index=False)

# Creating Quantitative Dataframes and Exporting Dataframes to CSV Files

This will execute the code and create the dataframes and then export them as CSV files.

In [None]:
# Designate the input folder where the split content CSVs (Step 1.5) and the
# matching all_corpora_info CSV are stored; a trailing path separator is
# recommended so file names can be appended to the path safely
csv_input_path = "path"

# Designate the output folder where the newly created trigram CSVs will be stored
csv_output_path = "path"

In [None]:
import os

### Feature: Ain't

In [None]:
# Designate the search word; for this spelling the function looks for input
# files whose names start with "aint" (no apostrophe)
search_word_string = "ain\'t"

# execute code: reads the matching CSVs from csv_input_path and writes the
# trigram CSVs to csv_output_path
create_trigram_df(csv_input_path, csv_output_path, search_word_string)

### Feature: Be

In [None]:
# Designate the search word; the function looks for input files whose names
# start with this word
search_word_string = "be"

# execute code: reads the matching CSVs from csv_input_path and writes the
# trigram CSVs to csv_output_path
create_trigram_df(csv_input_path, csv_output_path, search_word_string)

### Feature: Done

In [None]:
# Designate the search word; the function looks for input files whose names
# start with this word
search_word_string = "done"

# execute code: reads the matching CSVs from csv_input_path and writes the
# trigram CSVs to csv_output_path
create_trigram_df(csv_input_path, csv_output_path, search_word_string)