# Step 1.7: Getting Word Types per Structural Patterns

This code will produce CSV files which have the raw and normalized frequency of each subject and predicate word type in each structural pattern in each corpus.

## Required Packages

The following packages are necessary to run this code:
string, os, re, [pandas](https://pypi.org/project/pandas/), [numpy](https://pypi.org/project/numpy/)

## Define the Dataframe Creating Function

This function takes the following arguments:

<ol>
<li>The path to the folder that contains the split content CSVs produced in Step 1.5 and the all-corpora info CSV produced in Step 1.3</li>
<li>The filepath to the folder where the newly created CSVs will be stored</li>
<li>The word being searched for</li>
</ol>

In [None]:
def get_subj_pred_structural_pattern(csv_input_path, csv_output_path, search_word_string):
    """
    Writes, for every corpus, the subject and predicate word type frequencies
    broken down by part-of-speech structural pattern.

    Reads the split content CSVs produced in Step 1-5 and the all corpora info
    CSV produced in Step 1-3 from ``csv_input_path`` and creates two CSVs per
    corpus in ``csv_output_path``:
    (1) ``<prefix>_<corpus>_subjectPOSPatternTypeToken.csv``
    (2) ``<prefix>_<corpus>_predicatePOSPatternTypeToken.csv``

    Each output has one row per word type and, for every structural pattern,
    a "Raw" column (token count) and a "Normalized" column
    (raw / TotalCorpusWordCount * 100000).

    Arguments:
        csv_input_path: folder holding the split content CSVs and the
            all_corpora_info file; a trailing path separator is optional.
        csv_output_path: folder the new CSVs are written to.
        search_word_string: the word being searched for. "ain't" is mapped to
            the file name prefix "aint"; any other word is used as the prefix
            unchanged.

    Returns:
        None. The results are only written to disk.

    Raises:
        IndexError: if no all_corpora_info file with the expected prefix exists.
        KeyError: if a corpus name taken from a file name has no column (or
            there is no 'TotalCorpusWordCount' row) in the all_corpora_info CSV.
        AttributeError: if a file name does not contain a corpus name between
            two underscores, or if a POSPattern / SubjectWordToken /
            PredicateWordToken cell is not a string (e.g. an empty cell read
            as NaN), because the values are sorted by their upper-cased form.
    """

    import os
    import re
    from collections import Counter

    import pandas as pd

    ####################

    # file names cannot carry the apostrophe, so "ain't" files use the prefix "aint"
    file_prefix = "aint" if search_word_string == "ain't" else search_word_string

    # sorted so the processing order does not depend on the order os.listdir returns
    prefixed_filenames = [filename for filename in sorted(os.listdir(csv_input_path))
                          if filename.startswith(file_prefix)]

    # the split content .csv's, which should be stored together in the same
    #  folder (csv_input_path); the info csv is excluded
    csv_filenames = [filename for filename in prefixed_filenames
                     if filename.endswith(".csv") and "info" not in filename]

    # the all_corpora_info csv from Step1-3
    info_filenames = [filename for filename in prefixed_filenames
                      if "all_corpora_info" in filename]

    if not info_filenames:
        raise IndexError(f"no file starting with {file_prefix!r} and containing "
                         f"'all_corpora_info' was found in {csv_input_path!r}")

    # os.path.join is used so csv_input_path works with or without a trailing separator
    all_corpora_info_csv_path = os.path.join(csv_input_path, info_filenames[0])

    # list of tuples with the csv input full paths and the corpus name
    #  (the lowercased text between the first two underscores of the file name)
    filePath_corpusName = [(os.path.join(csv_input_path, filename),
                            re.search(r"_(.*?)_", filename).group(1).lower())
                           for filename in csv_filenames]

    ####################

    # creates a dataframe from the all_corpora_info csv
    all_corpora_info_df = pd.read_csv(all_corpora_info_csv_path, index_col=0)

    # lowercases the column names of this dataframe, which are corpus names,
    #  so they match the lowercased corpus names taken from the file names
    all_corpora_info_df.columns = [column.lower() for column in all_corpora_info_df.columns]

    ####################

    # case-insensitive sort key; the original string is the tie-breaker so that
    #  types differing only in case (e.g. "He"/"he") always come out in the same order
    def case_insensitive(value):
        return (value.upper(), value)

    # output file label and the split content column it is built from
    word_columns = (("subject", "SubjectWordToken"),
                    ("predicate", "PredicateWordToken"))

    # loops through the filepath and corpus name tuples list
    for file_path, corpus_name in filePath_corpusName:

        # the corpus' total word count
        corpus_word_count_total = all_corpora_info_df.loc['TotalCorpusWordCount', corpus_name]

        # creates a dataframe from the split content csv for the corpus
        patterns_df = pd.read_csv(file_path)

        # if the feature count in a corpus is zero, skips the corpus
        if len(patterns_df) == 0:
            continue

        # Part of Speech structural patterns present in the corpus
        POS_patterns = sorted(set(patterns_df["POSPattern"]), key=case_insensitive)

        # multilevel column headers: (pattern, "Raw") and (pattern, "Normalized")
        column_names = pd.MultiIndex.from_tuples(
            [(POS_pattern, measure)
             for POS_pattern in POS_patterns
             for measure in ("Raw", "Normalized")])

        for role, word_column in word_columns:

            # word types present in the corpus for this role
            word_types = sorted(set(patterns_df[word_column]), key=case_insensitive)

            # token count of every (pattern, word type) pair, gathered in a
            #  single pass; pairs that never occur count as 0
            pair_counts = Counter(zip(patterns_df["POSPattern"], patterns_df[word_column]))

            # dataframe with the patterns as columns and the word types as the index
            type_token_df = pd.DataFrame(columns=column_names, index=word_types)

            for POS_pattern in POS_patterns:

                raw_counts = pd.Series([pair_counts[(POS_pattern, word_type)]
                                        for word_type in word_types],
                                       index=word_types)

                type_token_df[(POS_pattern, "Raw")] = raw_counts

                # normalized frequency per 100,000 words of the corpus
                type_token_df[(POS_pattern, "Normalized")] = raw_counts/corpus_word_count_total*100000

            # exports the dataframe to a csv
            type_token_df.to_csv(os.path.join(
                csv_output_path,
                f"{file_prefix}_{corpus_name}_{role}POSPatternTypeToken.csv"))

# Creating Quantitative Dataframes and Exporting Dataframes to CSV Files

This will execute the code and create the dataframes and then export them as CSV files.

In [None]:
# Designate the input folder: it must contain the split content CSVs
#  (Step 1.5) and the all_corpora_info CSV (Step 1.3) that the function reads.
#  "path" is presumably a placeholder to be replaced before running.
csv_input_path = "path"

# Designate the output folder where the newly created subject and predicate
#  POSPatternTypeToken CSVs will be written.
#  "path" is presumably a placeholder to be replaced before running.
csv_output_path = "path"

### Feature: Ain't

In [None]:
# Designate the search word; the function maps "ain't" to the file name
#  prefix "aint" for both the files it reads and the files it writes
search_word_string = "ain\'t"

# execute code: writes the subject and predicate CSVs for every corpus
#  to csv_output_path
get_subj_pred_structural_pattern(csv_input_path, csv_output_path, search_word_string)

### Feature: Be

In [None]:
# Designate the search word; it is also the file name prefix of the CSVs
#  that are read and written
search_word_string = "be"

# execute code: writes the subject and predicate CSVs for every corpus
#  to csv_output_path
get_subj_pred_structural_pattern(csv_input_path, csv_output_path, search_word_string)

### Feature: Done

In [None]:
# Designate the search word; it is also the file name prefix of the CSVs
#  that are read and written
search_word_string = "done"

# execute code: writes the subject and predicate CSVs for every corpus
#  to csv_output_path
get_subj_pred_structural_pattern(csv_input_path, csv_output_path, search_word_string)