In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from pws_in_context.constants import DATA_PATH

# Folder under DATA_PATH that holds the stimulus CSV files.
STIMULI_DATA = DATA_PATH / "stimuli_data"

# Read the three input tables, in this order: targets, sentences, lookup table.
targets, sentences, lookup_table = [
    pd.read_csv(STIMULI_DATA / csv_name)
    for csv_name in ("new_targets.csv", "upd_sent_n_comp.csv", "lookup_table.csv")
]

In [None]:
# Drop lookup rows that have no value in "OG_of_Anagrams".
lookup_table = lookup_table.loc[lookup_table["OG_of_Anagrams"].notna()]

# Maps each "Target " value (note the trailing space in the column name) to
# its "OG_of_Anagrams" value -- presumably anagram -> original item; confirm
# against the CSV. Later duplicates of a key overwrite earlier ones.
anagram_to_pws = dict(
    zip(lookup_table["Target "].tolist(), lookup_table["OG_of_Anagrams"])
)


def map_column_polarity(col: str) -> str:
    """Classify a column name as negative, neutral or positive by its prefix.

    Names starting with "neg" map to "negative"; names starting with "nt" or
    "neu" map to "neutral"; every other name falls through to "positive".
    """
    if col.startswith("neg"):
        return "negative"
    if col.startswith(("nt", "neu")):
        return "neutral"
    return "positive"


def map_column_target_type(col: str) -> tuple[str, str]:
    item_type = "pseudo" if "pws" in col else "word"
    item_prev = "low" if "low" in col else "high" if "high" in col else None
    return item_type, item_prev


def is_anagram_check(target: str) -> bool | str:
    if target in anagram_to_pws.keys():
        return anagram_to_pws[target]
    return False


def rm_apostrophe(sentence: str) -> str:
    """Return ``sentence`` with every "â€™" sequence swapped for "'".

    NOTE(review): "â€™" is what a right single quotation mark (U+2019) looks
    like when its UTF-8 bytes are decoded as cp1252 -- presumably that is how
    it got into the data; confirm against the CSV encoding.
    """
    garbled_quote = "â€™"
    return "'".join(sentence.split(garbled_quote))

In [None]:
# Bucket every target by the polarity of the column it came from, and record
# each target's (item_type, prevalence) pair keyed by the target itself.
target_polarity_dict = {}
target_type_and_prev_dict = {}
for column_name in targets.columns:
    for cell in targets[column_name].tolist():
        if not isinstance(cell, str):
            continue  # non-string cells are ignored (presumably NaN fillers)
        column_polarity = map_column_polarity(column_name)
        target = cell.strip("*")  # drop leading/trailing asterisks
        # A target seen again in a later column overwrites its earlier entry.
        target_type_and_prev_dict[target] = map_column_target_type(column_name)
        target_polarity_dict.setdefault(column_polarity, []).append(target)

# Bucket (context, post_target) pairs by sentence polarity, and remember the
# full sentence template for each context.
sentence_polarity_dict = {}
context_to_sentence_template_dict = {}
sentence_columns = [name for name in sentences.columns if "sent" in name]
for column_name in sentence_columns:
    for cell in sentences[column_name].tolist():
        if not isinstance(cell, str):
            continue
        template = rm_apostrophe(cell)
        column_polarity = map_column_polarity(column_name)
        # Context is everything before the first " TARGET".
        context = template.partition(" TARGET")[0]
        # Templates sharing a context overwrite each other here.
        context_to_sentence_template_dict[context] = template
        # First whitespace-separated token after " TARGET "; raises IndexError
        # when the template has no " TARGET " or nothing follows it.
        post_target = template.split(" TARGET ")[1].split()[0]
        sentence_polarity_dict.setdefault(column_polarity, []).append(
            (context, post_target)
        )

In [None]:
# One list per output column; the tuple order fixes the CSV column order.
target_sent_matching = {
    column: []
    for column in (
        "sentence_polarity",
        "item_polarity",
        "item_type",
        "prevalence",
        "target",
        "sentence",
        "sentence_template",
        "context",
        "post_target",
        "anagram",
        "original_item",
    )
}

# Pair every target with every sentence context, across all polarity buckets.
for item_polarity, polarity_targets in target_polarity_dict.items():
    for tg in polarity_targets:
        # is_anagram_check returns the mapped original item, or False.
        lookup_result = is_anagram_check(tg)
        anagram_bool = bool(lookup_result)
        original_item = lookup_result if anagram_bool else None
        item_type, item_prev = target_type_and_prev_dict[tg]

        for sentence_cat, context_pairs in sentence_polarity_dict.items():
            for context, post_target in context_pairs:
                sentence_template = context_to_sentence_template_dict[context]
                row = {
                    "context": context,
                    "target": tg,
                    "post_target": post_target,
                    "sentence_polarity": sentence_cat,
                    "item_polarity": item_polarity,
                    "item_type": item_type,
                    "prevalence": item_prev,
                    "sentence_template": sentence_template,
                    # Every "TARGET" occurrence in the template is substituted.
                    "sentence": sentence_template.replace("TARGET", tg),
                    "anagram": anagram_bool,
                    "original_item": original_item,
                }
                for column, value in row.items():
                    target_sent_matching[column].append(value)


target_sent_df = pd.DataFrame.from_dict(target_sent_matching)

# Write the full target x sentence table next to the other data files.
target_sent_df.to_csv(DATA_PATH / "target_sent_combinations.csv", index=False)