In [2]:
import re
import pandas as pd
import numpy as np
from glob import glob
import os

# Globals

Load the data and define the regexes and helper functions that are reused below.

In [3]:
# Read the detected quotative constructions into a DataFrame and show
# the number of rows as the cell output.
quotatives_csv = "output/detected_quotatives_.csv"
data = pd.read_csv(quotatives_csv)
len(data)

2789

In [4]:
# Discourse prefixes that may open a target. The pattern is anchored at the
# start of the token and closed with a word boundary so only the listed words
# match; an unanchored search would also accept "some", "also", "android", ...
PREFIX_RE = re.compile(r"^(so|but|now|and)\b", flags=re.IGNORECASE)

# Subject person, keyed by label. The alternation is grouped so that BOTH word
# boundaries apply to every pronoun. (Written as r"(\bI|we\b)" the pattern
# means "\bI" OR "we\b", which also matches "it", "in", "awe", ...)
# Contracted forms such as "I'm" or "he's" still match because the apostrophe
# is a non-word character.
PER_RE = {
    "1st": r"\b(I|we)\b",
    "3rd": r"\b(s?he|they)\b",
}

# Subject number, keyed by label; same grouping rationale as PER_RE.
NUM_RE = {
    "SG": r"\b(I|s?he)\b",
    "PL": r"\b(we|they)\b"
}

# q_type values that belong to the "be like" / "be all" family; used to split
# the data because these forms allow a different set of suffixes.
q_copula_forms = [
    "q_like_copula",
    "q_like_contracted",
    "q_like_zero",
    "q_all_copula",
    "q_all_contracted",
    "q_all_zero"
]

# Punctuation-like suffix characters. The outer capturing group is relied on
# by the tokenizer's backreferences, so its group structure must not change.
gen_suffix = r"((,|-)|[<\(\/\[]{1})"
opt_like_suffix = r"(?:like)"
# Interjections. The trailing \b makes the pattern match whole words only;
# without it "you" yields "yo", "have" yields "ha", "many" yields "man", ...
opt_intj_suffix = r"\b(ah|ay|eh|ha|hey|hm|huh|man|mm|oh|uh|um|yo|wow)\b"

# Suffix patterns allowed per data split: the like/all family cannot take
# "like" itself as a suffix, every other quotative can.
suffix_patterns = {
    "like_all": [gen_suffix, opt_intj_suffix],
    "other": [gen_suffix, opt_like_suffix, opt_intj_suffix]
}

optional_suffixes = [opt_like_suffix, opt_intj_suffix]
all_suffixes = [opt_like_suffix, opt_intj_suffix, gen_suffix]

In [5]:
# Preprocessing step - add spaces before punctuation
def tokenize_with_punctuation(text):
    """Split ``text`` on whitespace after detaching punctuation from words.

    A space is inserted in front of every match of ``gen_suffix`` so that a
    mark glued to a word becomes a token of its own once the text is split.

    Returns the list of non-empty tokens.
    """
    # Pass 1: put a space in front of every punctuation match.
    spaced = re.sub(gen_suffix, r" \1", text)
    # Pass 2: separate a word from punctuation directly attached to it.
    word_then_punct = fr"(\w+){gen_suffix}"
    spaced = re.sub(word_then_punct, r'\1 \2', spaced)

    tokens = []
    for piece in spaced.split():
        if len(piece):
            tokens.append(piece)
    return tokens
    
def find_regex(pat, s):
    """Return the first substring of ``s`` matched by ``pat``.

    The sentinel string "###" is returned when ``pat`` matches nowhere in ``s``.
    """
    hit = re.search(pat, s)
    return hit.group(0) if hit else "###"

def update_target(l):
    """Return a new list holding every element of ``l`` except the first.

    An empty input yields an empty list (not ``None``), so the result can
    always be indexed, copied or iterated like any other token list.
    """
    return l[1:]

def extract_verb_construction(row):
    """
    Strip suffix tokens from a row's remaining token list.

    For every pattern in ``all_suffixes`` that matches the start of
    ``row['q_suffix']``, each token of ``row['temp']`` containing a match for
    that same pattern is dropped. ``row['temp']`` itself is not mutated.

    Returns the remaining tokens (the verb construction) as a new list.
    """
    remaining = row['temp'].copy()
    suffix = row['q_suffix']

    # Only filter by a suffix type that was actually detected for this row;
    # the "###" placeholder matches none of the patterns and leaves the
    # tokens untouched.
    for s_type in all_suffixes:
        if re.match(s_type, suffix):
            remaining = [item for item in remaining if not re.search(s_type, item)]

    return remaining

# Main data processing
Extract the components of each quotative construction, then map each quotative type to a display name.

In [6]:
# Work on a copy so the loaded `data` frame stays untouched
output = data.copy()
print(len(output))
# `temp` holds the list of non-empty tokens still to be parsed; each step
# below consumes tokens from it (prefix -> subject -> suffix)
output["temp"] = output.target.apply(tokenize_with_punctuation)

# Flag rows whose first token matches the known set of discourse prefixes (T/F)
# NOTE: t[0] raises IndexError if a target tokenizes to an empty list
output["has_prefix"] = output.temp.apply(
    lambda t: bool(PREFIX_RE.search(t[0]))
)

# Drop the prefix token from `temp` for the flagged rows only
# temp should now only have subject + q_form + suffix
output.loc[output["has_prefix"], "temp"] = output.loc[output["has_prefix"], "temp"].copy().apply(update_target)

# The subject is now the first remaining token
output["subj_val"] = output.temp.apply(
    lambda t: t[0]
)

# One boolean column per person label (subj_1st, subj_3rd)
# The lambda reads the loop variable, which is safe here because apply()
# runs inside the same iteration
for subj_per in PER_RE.keys():
    output[f"subj_{subj_per}"] = output.subj_val.apply(
        lambda sv: bool(re.search(PER_RE[subj_per], sv, flags=re.IGNORECASE))
    )

# One boolean column per number label (subj_SG, subj_PL)
for subj_num in NUM_RE.keys():
    output[f"subj_{subj_num}"] = output.subj_val.apply(
        lambda sv: bool(re.search(NUM_RE[subj_num], sv, flags=re.IGNORECASE))
    )

# Drop the subject token from every row
# temp should now only be q_form + suffix
output.temp = output.temp.copy().apply(update_target)

# split data into q_like_*, q_all_* vs. all others since they allow different suffixes
output_split = {
    "like_all": output.loc[output.q_type.isin(q_copula_forms)],
    "other": output.loc[~output.q_type.isin(q_copula_forms)]
}

# Separate suffix from main verb construction
# (`q_type` below is the split name, "like_all" or "other", not a q_type value)
for q_type in output_split:
    curr = output_split[q_type].copy()

    # Get the appropriate pattern list for this split
    pattern = suffix_patterns.get(q_type, suffix_patterns["other"])

    # Look for a suffix in the last token only; find_regex returns "###"
    # when none of the patterns match
    curr["q_suffix"] = curr.temp.apply(lambda t: find_regex(r"|".join(pattern), t[-1]))
    # Remove the tokens belonging to the detected suffix type from temp
    curr["temp"] = curr.apply(extract_verb_construction, axis=1)
    # Store the processed frame back under its split name
    output_split[q_type] = curr

# Recombine the two splits into one frame
# Rows come back grouped by split ("like_all" first, then "other") and keep
# their original index labels, so row order differs from `data`
output = pd.concat(output_split.values())

# Map q_type to display names; a q_type missing from this dict maps to NaN
verb_mapping = {
    'q_say': 'say',
    'q_like_copula': 'be+like',
    'q_like_contracted': 'be+like',
    'q_like_zero': 'be+like',
    'q_go': 'go',
    'q_ask': 'other',
    'q_repeat': 'other',
    'q_think': 'other',
    'q_tell': 'other',
    'q_all_zero': 'other',
    'q_all_copula': 'other',
    'q_all_contracted': 'other'
}
output['verb_display'] = output['q_type'].map(verb_mapping)
# Sanity check: should print the same row count as at the top of the cell
print(len(output))

2789
2789


# Incorporating speaker variables

Read the speaker metadata and rename its columns so it merges easily with the quotative data; the merged result is used to reproduce the figures.

In [7]:
columns_to_use = [
    'Age', # used to create unified age cohorts
    'Age.Group', # original age cohorts, depends on corpus
    'Gender',
    'Other.Places.Lived',
    'Year.of.Birth',
    'Year.of.Interview',
    'region_id',
    'speaker_id',
    'source_file'
]

# Speakers that occur in the quotative data; computed once rather than once
# per metadata file.
relevant_speakers = output.speaker_id.tolist()

# Procure metadata, one frame per metadata file.
# glob() returns paths in arbitrary order, so sort them: the row order of
# spkr_df decides which record a later drop_duplicates(keep='first') on
# speaker_id retains, and that must not vary between runs or machines.
metadata = dict()
for f in sorted(glob("data/*_metadata_*.txt")):
    curr = pd.read_csv(f, sep="\t", dtype ={'Age.Group': str})
    # Assign region ID: the first three characters of the file name
    curr["region_id"] = os.path.basename(f)[0:3]

    # Filter by relevant speakers
    curr = curr.loc[curr['CORAAL.Spkr'].isin(relevant_speakers)]

    curr = curr.rename(
        columns={
            'CORAAL.Spkr': 'speaker_id',
            'CORAAL.File': 'source_file'
        }
    )
    # Filter by relevant columns
    metadata[f] = curr[columns_to_use]

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not metadata:
    raise FileNotFoundError("No metadata files match 'data/*_metadata_*.txt'")

# Flatten the per-file metadata into a single speaker frame
spkr_df = pd.concat(metadata.values())

In [8]:
# Merge metadata content onto quotative productions
merge_keys = ['source_file', 'speaker_id', 'region_id']
df = output.merge(
    spkr_df,
    on=merge_keys,
    how='left'
)

# A row counts as unmatched when its first metadata column is still empty
# after the merge
missing_mask = df[columns_to_use[0]].isna()

# Check if any rows failed to merge
initial_matches = (~missing_mask).sum()
print(f"  - Failed to match: {len(df) - initial_matches} rows\n")

# Identify speaker_ids that need fallback
needs_fallback = df.loc[missing_mask, 'speaker_id'].unique()
print(f"  - {len(needs_fallback)} unique speaker_ids need fallback\n")

# For rows still missing demographics, do a fallback merge on speaker_id only
if missing_mask.any():
    # Get unique metadata for each speaker_id (keep first occurrence)
    metadata_lookup = spkr_df.drop_duplicates(
        subset='speaker_id',
        keep='first'
    )
    # For rows with missing data, look up by speaker_id; a left merge keeps
    # the order of the left frame, so the result lines up positionally with
    # the masked rows of df
    missing_speakers = df.loc[missing_mask, 'speaker_id']
    fallback_data = missing_speakers.to_frame().merge(
        metadata_lookup,
        on='speaker_id',
        how='left'
    )

    # Fill in the missing demographic columns only. The merge keys already
    # hold each row's own source_file / region_id; overwriting them would
    # replace them with another file's values, or with NaN when the speaker
    # is absent from the metadata altogether.
    for col in columns_to_use:
        if col in merge_keys:
            continue
        df.loc[missing_mask, col] = fallback_data[col].values

print(len(df))

  - Failed to match: 3 rows

  - 1 unique speaker_ids need fallback

2789


In [9]:
def get_age_cohort(age):
    """Map an age to one of the unified cohort labels '-29', '30-50', '51+'.

    Ages below 30 (29 included) fall into '-29', ages above 50 into '51+',
    and everything in between into '30-50'.

    Returns None for a missing age (NaN/None); otherwise a missing age would
    fail both comparisons and be labelled '30-50'.
    """
    if pd.isna(age):
        return None
    if age < 30:
        return '-29'
    elif age > 50:
        return '51+'
    else:
        return '30-50'

# Create age cohorts
def assign_age_group(age):
    """Map an age to its age-band label.

    Any age of 12 or less gets "9-12" and any age of 60 or more gets "60+";
    the bands in between are closed intervals. Returns None when no band
    applies (e.g. NaN, or a value falling between two bands).
    """
    if age <= 12:
        return "9-12"

    closed_bands = (
        (13, 16, "13-16"),
        (17, 29, "17-29"),
        (30, 39, "30-39"),
        (40, 49, "40-49"),
        (50, 59, "50-59"),
    )
    for low, high, label in closed_bands:
        if age >= low and age <= high:
            return label

    return "60+" if age >= 60 else None
        
# Create age cohorts
def assign_cohort(year_of_birth):
    """Map a year of birth to its birth-cohort label.

    Years before 1941 and after 2002 get the open-ended labels; the four
    numbered cohorts are closed year ranges. Returns None when no range
    applies (e.g. NaN).
    """
    if year_of_birth < 1941:
        return "Before 1941"
    if year_of_birth > 2002:
        return "After 2002"

    numbered_cohorts = (
        (1941, 1960, "Cohort 1\n1941-60"),
        (1961, 1977, "Cohort 2\n1961-77"),
        (1978, 1990, "Cohort 3\n1978-90"),
        (1991, 2002, "Cohort 4\n1991-2002"),
    )
    for first_year, last_year, label in numbered_cohorts:
        if first_year <= year_of_birth <= last_year:
            return label

    return None

# Assign age cohorts in 3 ways, each written to its own column:
#   Age.Cohort - unified given CORAAL conventions
#   age_group  - based on T&D (2009)
#   cohort     - based on Cukor-Avila (2012)
for new_col, src_col, grouper in (
    ('Age.Cohort', 'Age', get_age_cohort),
    ('age_group', 'Age', assign_age_group),
    ('cohort', 'Year.of.Birth', assign_cohort),
):
    df[new_col] = df[src_col].apply(grouper)

In [33]:
# One row per speaker: keep the first row seen for each speaker_id
spkrs = df.drop_duplicates(subset=['speaker_id'], keep='first')

In [48]:
# Number of rows per quotative type, most frequent first
df["q_type"].value_counts()

q_type
q_say                970
q_like_copula        885
q_like_contracted    528
q_think              188
q_tell                97
q_go                  46
q_like_zero           45
q_all_zero            16
q_all_copula           7
q_ask                  5
q_all_contracted       1
q_repeat               1
Name: count, dtype: int64

In [12]:
# Write the coded data; the row index is written as the first, unnamed column
df.to_csv("output/coded.csv", index=True)