In [45]:
import pandas as pd
import spacy
import re
# Load the retrial instances.  The original `intended_*` columns are kept
# under an `_old` suffix and the `*_retrial` columns take over the plain
# names, so the rest of the notebook works on the retrial sentences.
df = pd.read_csv("instances_retrial_2.csv").rename(
    columns={
        'intended_ambiguous': 'intended_ambiguous_old',
        'intended_figurative': 'intended_figurative_old',
        'intended_literal': 'intended_literal_old',
        'intended_ambiguous_retrial': 'intended_ambiguous',
        'intended_figurative_retrial': 'intended_figurative',
        'intended_literal_retrial': 'intended_literal',
    }
)

# spaCy pipeline used for tokenisation, POS tags and dependency labels below.
nlp = spacy.load("en_core_web_sm")

In [46]:
# Surface forms of each pronoun family.  SHES / HES are handed to `convert`
# as the lists of pronouns that must not be left unconverted; THEYS is what
# `contains_they` scans for.
SHES = ['she', 'her', 'hers', 'herself']
HES = ['he', 'his', 'him', 'himself']
THEYS = ['they', 'their', 'them', 'theirs', 'themselves']

# Conversion tables.  Keys are (token text, dependency label); a label of
# '*' is the wildcard that `convert` falls back to when there is no entry for
# the token's actual label.  Values have the same (text, label) shape so that
# a table can be inverted or used to re-key another table; `convert` itself
# only reads the replacement text (element 0).
SHES_TO_HES = {
    ('she', '*') : ('he', '*'),
    ('her', 'poss') : ('his', 'poss'),
    ('her', 'dobj') : ('him', 'dobj'),
    ('her', 'pobj') : ('him', 'pobj'),
    ('hers', '*') : ('his', 'attr'),
    ('herself', '*') : ('himself', '*')
}
# Inverse table: every value of SHES_TO_HES becomes a key.
HES_TO_SHES = {v: k for k, v in SHES_TO_HES.items()}
HES_TO_THEYS = {
    ('he', '*') : ('they', '*'),
    ('his', 'poss'): ('their', '*'),
    ('him', 'dobj') : ('them', 'dobj'),
    ('him', 'pobj') : ('them', 'pobj'),
    ('his', 'attr') : ('theirs', '*'),
    ('himself', '*') : ('themselves', '*')
}
# Re-key HES_TO_THEYS through HES_TO_SHES: this only works because the two
# dicts have exactly the same keys.
SHES_TO_THEYS = {HES_TO_SHES[k]: v for k, v in HES_TO_THEYS.items()}

def f(row):
    """Clean one row's sentences and try to harmonise their subject pronouns.

    Angle brackets are stripped from the three `intended_*` sentences.  The
    sentences are then passed to `pronoun_conversion`; when it returns a
    result, the three columns are overwritten with it.  A boolean `converted`
    column records whether that happened.

    Args:
        row: a row of the instances frame holding string values in
            `intended_ambiguous`, `intended_figurative` and `intended_literal`.

    Returns:
        The same row, modified.
    """
    sentence_columns = ['intended_ambiguous', 'intended_figurative', 'intended_literal']

    for column in sentence_columns:
        row[column] = row[column].replace("<", "").replace(">", "")

    converted = pronoun_conversion(*(row[column] for column in sentence_columns))

    row['converted'] = False
    if converted is None:
        return row

    row[sentence_columns] = converted
    row['converted'] = True
    return row
    
def _find_phrase_start(sentence_doc, phrase_doc):
    """Return the token index where `phrase_doc` first occurs in `sentence_doc`, or -1."""
    width = len(phrase_doc)
    for start in range(len(sentence_doc) - width + 1):
        if sentence_doc[start : start + width].text == phrase_doc.text:
            return start
    return -1


def _convert_to_they(doc, subject_text):
    """Rewrite a she/he sentence to 'they'; None if the subject is neither or conversion fails."""
    if subject_text == 'she':
        return convert(doc, SHES_TO_THEYS, {}, SHES, [])
    if subject_text == 'he':
        return convert(doc, HES_TO_THEYS, {}, HES, [])
    return None


def pronoun_conversion(ss_ambiguous, s_figurative, s_literal):
    """Make the subject pronoun agree between the figurative and literal sentences.

    The ambiguous phrase must start with a verb and must occur, preceded by
    at least one token, in both sentences.  The token directly before the
    phrase in each sentence is taken as its subject.  If the two subjects are
    'she' and 'he' (either order), one sentence has its she/he pronouns
    swapped; if one subject is 'they' and the other sentence contains no
    they-form, the other sentence is rewritten to 'they'.  A rewrite is only
    accepted when the rewritten sentence contains "<subject> <phrase>".

    NOTE(review): only pronouns are rewritten, so switching a singular
    subject to 'they' leaves the verb form as it was (e.g. "he kicks" ->
    "they kicks") -- confirm this is acceptable downstream.

    Args:
        ss_ambiguous: the ambiguous phrase (string).
        s_figurative: the sentence using the phrase figuratively (string).
        s_literal: the sentence using the phrase literally (string).

    Returns:
        A tuple `(ambiguous, figurative, literal)` in which `ambiguous` is the
        phrase prefixed with the shared subject pronoun and one of the two
        sentences has been rewritten, or None when no conversion applies.
    """
    ss_ambiguous_doc = nlp(ss_ambiguous)

    # An empty phrase yields an empty Doc; indexing it would raise IndexError.
    if len(ss_ambiguous_doc) == 0 or ss_ambiguous_doc[0].pos_ != 'VERB':
        return None

    s_figurative_doc = nlp(s_figurative)
    s_literal_doc = nlp(s_literal)

    # -1: phrase not found; 0: phrase is sentence-initial, so there is no
    # preceding token to act as subject.
    idx_fig = _find_phrase_start(s_figurative_doc, ss_ambiguous_doc)
    if idx_fig in (-1, 0):
        return None

    idx_lit = _find_phrase_start(s_literal_doc, ss_ambiguous_doc)
    if idx_lit in (-1, 0):
        return None

    subj_fig = s_figurative_doc[idx_fig - 1]
    subj_lit = s_literal_doc[idx_lit - 1]

    if {subj_fig.text, subj_lit.text} == {'she', 'he'}:

        # First try to bring the figurative sentence in line with the literal one...
        new_s_figurative = convert(s_figurative_doc, SHES_TO_HES, HES_TO_SHES, SHES, HES)
        if new_s_figurative is not None \
                and f"{subj_lit.text} {ss_ambiguous}" in new_s_figurative:
            return (f"{subj_lit.text} {ss_ambiguous}", new_s_figurative, s_literal)

        # ...otherwise bring the literal sentence in line with the figurative one.
        new_s_literal = convert(s_literal_doc, SHES_TO_HES, HES_TO_SHES, SHES, HES)
        if new_s_literal is not None \
                and f"{subj_fig.text} {ss_ambiguous}" in new_s_literal:
            return (f"{subj_fig.text} {ss_ambiguous}", s_figurative, new_s_literal)

    elif subj_fig.text == 'they' and not contains_they(s_literal_doc):
        new_s_literal = _convert_to_they(s_literal_doc, subj_lit.text)
        if new_s_literal is not None \
                and f"they {ss_ambiguous}" in new_s_literal:
            return (f"they {ss_ambiguous}", s_figurative, new_s_literal)

    elif subj_lit.text == 'they' and not contains_they(s_figurative_doc):
        new_s_figurative = _convert_to_they(s_figurative_doc, subj_fig.text)
        if new_s_figurative is not None \
                and f"they {ss_ambiguous}" in new_s_figurative:
            return (f"they {ss_ambiguous}", new_s_figurative, s_literal)

    return None

def contains_they(doc):
    """Return True if any token of `doc` is a form of 'they' (see THEYS).

    The comparison ignores case so that a sentence-initial "They" or "Their"
    is detected too; callers use this check to avoid rewriting he/she to
    'they' in a sentence that already has a they-form.

    Args:
        doc: iterable of tokens exposing a `text` attribute.

    Returns:
        bool
    """
    return any(token.text.lower() in THEYS for token in doc)
    

def convert(doc, dict1, dict2, list1, list2):
    """Rewrite the pronouns of a parsed sentence using the given tables.

    For every token the tables are consulted in this order: `dict1` with the
    token's dependency label, `dict1` with the '*' wildcard, `dict2` with the
    label, `dict2` with the wildcard.  Pronouns are matched case-insensitively
    and the replacement takes over the capitalisation of the original token.
    The sentence is rebuilt from each token's own trailing whitespace, so
    contractions, quotes and punctuation keep their original spacing.

    Args:
        doc: iterable of spaCy tokens (their `text`, `dep_` and `whitespace_`
            attributes are read).
        dict1: primary table mapping (pronoun, dep) -> (replacement, dep).
        dict2: secondary table of the same shape; may be empty.
        list1: pronouns that must be converted if they occur.
        list2: further pronouns that must be converted if they occur.

    Returns:
        The rewritten sentence, or None if a pronoun from `list1`/`list2`
        occurs with a dependency label that no table covers.
    """
    pieces = []
    for token in doc:
        original = token.text
        lowered = original.lower()

        replacement = None
        for table in (dict1, dict2):
            for key in ((lowered, token.dep_), (lowered, '*')):
                if key in table:
                    replacement = table[key][0]
                    break
            if replacement is not None:
                break

        if replacement is None:
            # A listed pronoun with no applicable rule: refuse to produce a
            # half-converted sentence.
            if lowered in list1 or lowered in list2:
                return None
            replacement = original
        elif len(original) > 1 and original.isupper():
            replacement = replacement.upper()
        elif original[:1].isupper():
            replacement = replacement.capitalize()

        pieces.append(replacement + token.whitespace_)
    return ''.join(pieces)
    

In [47]:
# Run `f` over every row (axis=1) with a progress bar; `tqdm.pandas()` is
# called first so that `progress_apply` is available on the frame.
# NOTE(review): `df` is overwritten in place, so re-running this cell applies
# `f` to already-processed rows.
from tqdm import tqdm
tqdm.pandas()
df = df.progress_apply(f, axis=1)

100%|██████████| 731/731 [00:07<00:00, 99.42it/s] 


In [48]:
# Save the processed instances; the frame index is not written.
df.to_csv('instances_chatgpt.csv', index=False)

In [61]:
# NOTE(review): scratch cell -- `doc` is not defined in any cell visible here
# and the execution count is out of order, so this fails on Restart & Run All.
# It inspects the dependency label of the last token of some parsed sentence.
doc[-1].dep_

'attr'

In [28]:
# NOTE(review): scratch cell relying on a `doc` variable that no visible cell
# defines; it checks that `doc` is a spaCy Doc.
print(type(doc) == spacy.tokens.doc.Doc)

True


In [20]:
# NOTE(review): scratch cell relying on a `doc` variable that no visible cell
# defines; it prints the first three tokens of that document.
for token in doc[0:3]:
    print(token)

She
looks
at
