In [2]:
%load_ext autoreload
%autoreload 2


In [3]:
# Plan:
# - read in the dataset
# - select constructions
# - run code using idioms to identify the word spans
# - get scores
# - plot as boxplots?

In [5]:
"""
Read in original cogs dataset
"""
from data_config import Exp2Cogs
import pandas as pd
from typing import Dict, List

def read_csv_by_column(file_path: str) -> Dict[str, List]:
    """Load a CSV file and return its contents organised by column.

    Args:
        file_path (str): Location of the CSV file to load.

    Returns:
        Dict[str, List]: One entry per column header; each value is the list
        of that column's cells in row order.
    """
    # pandas parses the file; orient="list" yields {header: [cell, cell, ...]}.
    return pd.read_csv(file_path).to_dict(orient="list")

# Load the original dataset column-wise: csv_data maps each column header
# to the list of cell values found in that column.
csv_data = read_csv_by_column(Exp2Cogs.original_csv)

In [6]:
"""
Process part 1 - get data into another csv for manual review
# filter parenthetical
# check num periods
# check that the words are present
# remove bad rows
"""
from typing import Set, Tuple

from dataclasses import dataclass, field, asdict

@dataclass
class DataRow:
    """One sentence of the dataset plus the bookkeeping used for manual review.

    The content fields are not constructor arguments (``init=False``); they are
    assigned attribute by attribute after ``DataRow()`` is created. Each of them
    still has a default, so a row that is only partially filled in can be
    printed or passed to ``asdict`` instead of raising ``AttributeError``.
    """
    # Label of the construction this sentence belongs to.
    cx_type: str = field(init=False, default="")
    # Sentence text.
    sentence: str = field(init=False, default="")
    # (token index, token) pairs.
    sentence_with_idxs: List[Tuple[int, str]] = field(init=False, default_factory=list)
    # Token indices of the target words.
    tgt_words: List[int] = field(init=False, default_factory=list)
    # Problems detected for this row; empty when none were found.
    errors: List[str] = field(default_factory=list)


def count_symbols(s: str, symbols: List[str]) -> int:
    """Return how many times any of the given symbols occurs in a string.

    Args:
        s (str): The input string.
        symbols (List[str]): Symbols to look for; a symbol listed more than
            once is only counted once.

    Returns:
        int: Sum of the occurrence counts of the distinct symbols in ``s``
        (a single total, not a per-symbol breakdown).
    """
    # Deduplicate first so a repeated symbol does not inflate the total.
    return sum(s.count(sym) for sym in set(symbols))

def get_end_of_str_by_punct(s: str):
    """Return the index of the sentence-ending punctuation mark in ``s``.

    The marks are tried in the fixed order '.', '!', '?': the first occurrence
    of the first mark that is present at all is returned, even if a
    lower-priority mark appears earlier in the string. Returns -1 when none
    of the three marks occurs.
    """
    first_hits = (s.find(mark) for mark in ('.', '!', '?'))
    # Take the first mark (in priority order) that was actually found.
    return next((pos for pos in first_hits if pos != -1), -1)

def clean_data_list(data_list: List[str], cx_type: str, tgt_words: str) -> List[DataRow]:
    """Convert the raw cells of one column into DataRow records for review.

    Cells that are not strings (e.g. NaN) or are shorter than 4 characters are
    skipped. Every other cell yields one DataRow; problems are recorded in
    ``errors`` instead of dropping the row.

    Args:
        data_list (List[str]): Raw cell values of one column.
        cx_type (str): Construction label stored on every produced row.
        tgt_words (str): Space-separated target words, e.g. "let alone".
            Matching is case-insensitive.

    Returns:
        List[DataRow]: One row per kept cell, in input order.
    """
    # Loop-invariant: lower-case the targets once so that both the token lookup
    # and the occurrence count below compare case-insensitively.
    tgts = [t.lower() for t in tgt_words.split(" ")]
    terminators = ['.', '!', '?']

    ret_list: List[DataRow] = []
    for s in data_list:
        # make sure valid string (some are weird or nans)
        if not isinstance(s, str):
            continue
        if len(s) < 4:
            continue

        # accrue info
        dr = DataRow()
        ret_list.append(dr)
        dr.cx_type = cx_type
        # Token indices refer to the full raw string, not the truncated sentence.
        dr.sentence_with_idxs = [(idx, w) for idx, w in enumerate(s.split(" "))]

        if count_symbols(s, terminators) != 1:
            # Flagged for zero terminators as well as for more than one.
            dr.errors.append("no punct")
            dr.sentence = s
        else:
            # Exactly one terminator: keep everything up to and including it.
            idx = get_end_of_str_by_punct(s)
            dr.sentence = s[:idx + 1]

        dr.tgt_words = []
        s_lower = s.lower()
        for t in tgts:
            # Whole-token, case-insensitive match against the space-split tokens.
            t_idxs = [idx for (idx, w) in dr.sentence_with_idxs if w.lower() == t]
            dr.tgt_words.extend(t_idxs)
            # Substring count (so "at" inside "that" also counts); anything
            # other than exactly one occurrence is flagged for manual review.
            if s_lower.count(t) != 1:
                dr.errors.append("tgt word ct != 1")
    return ret_list

def write_data_rows_to_csv(rows: List[DataRow], file_path: str) -> None:
    """Serialise DataRow instances to a CSV file, one line per row.

    Args:
        rows (List[DataRow]): Rows to write.
        file_path (str): Destination path of the CSV file.
    """
    records = []
    for row in rows:
        record = asdict(row)
        # List-valued fields are stored as their string form so each fits in
        # a single CSV cell.
        record["sentence_with_idxs"] = str(record["sentence_with_idxs"])
        record["tgt_words"] = str(record["tgt_words"])
        # Error messages are joined into one readable cell.
        record["errors"] = ", ".join(record["errors"])
        records.append(record)

    pd.DataFrame(records).to_csv(file_path, index=False)



In [7]:

# Inspect the column headers available in the original dataset.
print(csv_data.keys())


dict_keys(['Let Alone', 'Way Manner', 'Resultative', 'Conative', 'Intransitive Motion', 'Caused Motion', 'Causative with CxN', 'Ditransitive CxN', 'Comparative Correlative ', 'Unnamed: 9', 'Much Less ', 'Unnamed: 11', 'Unnamed: 12'])


In [8]:

# Maps a column header of csv_data (the construction type) to the
# space-separated target word(s) looked up in every sentence of that column.
# Keys must match the headers exactly, including the trailing space in
# 'Comparative Correlative ' and 'Much Less '.
tgt_keys_str_map = {
    'Let Alone': 'let alone',
    'Way Manner': 'way',
    'Conative': 'at',
    'Comparative Correlative ': 'the',
    'Much Less ': 'much less',
    'Causative with CxN': 'with'
}
def get_all_data():
    """Build the DataRow list for every construction in ``tgt_keys_str_map``.

    For each (column header, target words) pair, the matching column of the
    module-level ``csv_data`` is run through ``clean_data_list``; the per-column
    results are concatenated in the order of the map.
    """
    return [
        row
        for col_name, tgt in tgt_keys_str_map.items()
        for row in clean_data_list(csv_data[col_name], col_name, tgt)
    ]

# Build the review rows for every construction listed in tgt_keys_str_map.
all_data = get_all_data()

# Uncomment to re-create the CSV at Exp2Cogs.cogs_parsed (the file produced for manual review).
# write_data_rows_to_csv(all_data, Exp2Cogs.cogs_parsed)


In [9]:
"""
we manually processed data by hand in google sheets where there were errors
"""

'\nwe manually processed data by hand in google sheets where there were errors\n'

In [10]:
from paper.exp2_cogs.cogs_utils import read_csv_row_by_row, get_all_data_clean

"""
Read in final dataset + verify
"""

# Load the final CSV row by row and print the first row as a sanity check.
# As the printed row shows, a row is a list of strings: construction type,
# sentence, target-word indices, indexed tokens, errors.
csv_data_clean = read_csv_row_by_row(Exp2Cogs.cogs_parsed_final)
print(csv_data_clean[0])


['Let Alone', 'Most wives are too bloody old, let alone mothers.', '[6, 7]', "[(0, 'Most'), (1, 'wives'), (2, 'are'), (3, 'too'), (4, 'bloody'), (5, 'old,'), (6, 'let'), (7, 'alone'), (8, 'mothers.'), (9, '(FN'), (10, 'Construction)')]", '']


In [11]:

# Parse the loaded rows into entries. One mismatch message is expected in the
# output: one sentence contains 'ways' where the target word is 'way'.
all_cogs_clean = get_all_data_clean(csv_data_clean)
# all_clean[0]

in sent Though punctuated by frequent flash-backs to the period before, during and just after the war, temporal progression in the present is clearly marked by the development of two narrative lines which weave their ways in and out of the novel., idx 33 != way
note expected one error message bc one word is 'ways' instead of 'way'


In [12]:
# Display the first parsed entry to check the target-word indices and offsets.
all_cogs_clean[0]

CogsEntry(id=0, cx_type='Let Alone', sent='Most wives are too bloody old, let alone mothers.', tgt_words=[6, 7], tgt_word_offsets=[(31, 34), (35, 40)])