In [20]:
import pandas as pd
import spacy
import re
# Input table of generated idiom sentences; per its file name this is the
# state before pronoun conversion.
df = pd.read_csv("gen_final_chatgpt_prePronounConversion.csv")
# Disabled column shuffle, kept for reference: it would archive the current
# `intended_*` columns under an `_old` suffix and promote the `*_retrial`
# columns to the plain `intended_*` names.
# df = df.rename(
#     columns={
#         'intended_ambiguous': 'intended_ambiguous_old',
#         'intended_figurative': 'intended_figurative_old',
#         'intended_literal': 'intended_literal_old',
#         'intended_ambiguous_retrial': 'intended_ambiguous', 
#         'intended_figurative_retrial': 'intended_figurative',
#         'intended_literal_retrial': 'intended_literal',
#     })
# Small English spaCy pipeline; the code below reads its tokenisation,
# part-of-speech tags (`pos_`) and dependency labels (`dep_`).
nlp = spacy.load("en_core_web_sm")

In [21]:
# Lowercase pronoun inventories, one list per pronoun family.
SHES = [
    'she', 'her', 'hers', 'herself',
]
HES = [
    'he', 'his', 'him', 'himself',
]
THEYS = [
    'they', 'their', 'them', 'theirs', 'themselves',
]

# Mapping tables are keyed by (word, dependency label); the label '*' is a
# wildcard that `convert` falls back to when the exact label has no entry.
# Values have the same (word, label) shape, which lets a table be inverted;
# `convert` only reads the word (element 0) of a value.
SHES_TO_HES = {
    ('she', '*'): ('he', '*'),
    ('her', 'poss'): ('his', 'poss'),
    ('her', 'dobj'): ('him', 'dobj'),
    ('her', 'pobj'): ('him', 'pobj'),
    ('hers', '*'): ('his', 'attr'),
    ('herself', '*'): ('himself', '*'),
}

# Reverse direction, obtained by swapping every key/value pair above.
HES_TO_SHES = {he_form: she_form for she_form, he_form in SHES_TO_HES.items()}

HES_TO_THEYS = {
    ('he', '*'): ('they', '*'),
    ('his', 'poss'): ('their', '*'),
    ('him', 'dobj'): ('them', 'dobj'),
    ('him', 'pobj'): ('them', 'pobj'),
    ('his', 'attr'): ('theirs', '*'),
    ('himself', '*'): ('themselves', '*'),
}

# Feminine -> plural table: re-key each masculine entry by its feminine
# counterpart and keep the same plural target.
SHES_TO_THEYS = {
    HES_TO_SHES[he_form]: they_form
    for he_form, they_form in HES_TO_THEYS.items()
}

def f(row):
    """Clean one row's three `intended_*` sentences and try to unify their subject.

    Angle brackets are stripped from the ambiguous, figurative and literal
    texts. The cleaned texts are then handed to `pronoun_conversion`; when it
    yields a result, the three columns are overwritten with it.

    Args:
        row: a row carrying string values under 'intended_ambiguous',
            'intended_figurative' and 'intended_literal'.

    Returns:
        The same row, with the cleaned (and possibly converted) texts and a
        boolean 'converted' entry telling whether a conversion was applied.
    """
    text_columns = ['intended_ambiguous', 'intended_figurative', 'intended_literal']

    for column in text_columns:
        row[column] = row[column].replace("<", "").replace(">", "")

    outcome = pronoun_conversion(*(row[column] for column in text_columns))

    was_converted = outcome is not None
    row['converted'] = was_converted
    if was_converted:
        row[text_columns] = outcome
    return row
    
def _find_phrase_start(sentence_doc, phrase_doc):
    """Return the index of the first token window in `sentence_doc` whose text equals `phrase_doc.text`, or -1."""
    width = len(phrase_doc)
    target = phrase_doc.text
    for start in range(len(sentence_doc) - width + 1):
        if sentence_doc[start:start + width].text == target:
            return start
    return -1


def _convert_to_they(doc, subject):
    """Rewrite `doc` with she- or he-pronouns turned into they-forms; None if `subject` is neither or conversion fails."""
    if subject == 'she':
        return convert(doc, SHES_TO_THEYS, {}, SHES, [])
    if subject == 'he':
        return convert(doc, HES_TO_THEYS, {}, HES, [])
    return None


def pronoun_conversion(ss_ambiguous, s_figurative, s_literal):
    """Prefix the ambiguous phrase with a subject shared by both sentences.

    The ambiguous phrase must start with a token tagged VERB and must occur,
    token for token, in both sentences with at least one token in front of it.
    The token directly in front of the phrase is treated as its subject.

    * Same subject in both sentences: the sentences are returned unchanged.
    * One 'she' and one 'he': the figurative sentence is gender-swapped to the
      literal subject; if that fails, the literal sentence is swapped instead.
    * One 'they' and the other sentence free of they-pronouns: the other
      sentence's she/he pronouns are rewritten to they-forms.

    Args:
        ss_ambiguous: the subject-less ambiguous phrase.
        s_figurative: sentence using the phrase figuratively.
        s_literal: sentence using the phrase literally.

    Returns:
        A tuple (ambiguous_with_subject, figurative, literal), or None when no
        conversion applies.
    """
    ss_ambiguous_doc = nlp(ss_ambiguous)

    # An empty phrase has no first token to inspect; treat it as "no conversion"
    # instead of letting the index lookup raise.
    # NOTE(review): current spaCy English models tag auxiliaries such as
    # "was"/"were" as AUX rather than VERB, so phrases beginning with them are
    # presumably rejected here -- confirm that this is intended.
    if len(ss_ambiguous_doc) == 0 or ss_ambiguous_doc[0].pos_ != 'VERB':
        return None

    # The sentences are only parsed once the cheaper checks have passed.
    s_figurative_doc = nlp(s_figurative)
    idx_fig = _find_phrase_start(s_figurative_doc, ss_ambiguous_doc)
    if idx_fig <= 0:  # phrase absent (-1) or nothing in front of it (0)
        return None

    s_literal_doc = nlp(s_literal)
    idx_lit = _find_phrase_start(s_literal_doc, ss_ambiguous_doc)
    if idx_lit <= 0:
        return None

    # The token immediately preceding the phrase is taken as the subject.
    subj_fig = s_figurative_doc[idx_fig - 1]
    subj_lit = s_literal_doc[idx_lit - 1]

    if subj_fig.text == subj_lit.text:
        return (f"{subj_lit.text} {ss_ambiguous}", s_figurative, s_literal)

    if {subj_fig.text, subj_lit.text} == {'she', 'he'}:
        # Passing both tables swaps genders in whichever direction applies.
        # A rewrite is accepted only if "<subject> <phrase>" survives in it.
        new_s_figurative = convert(s_figurative_doc, SHES_TO_HES, HES_TO_SHES, SHES, HES)
        if new_s_figurative is not None \
                and f"{subj_lit.text} {ss_ambiguous}" in new_s_figurative:
            return (f"{subj_lit.text} {ss_ambiguous}", new_s_figurative, s_literal)

        new_s_literal = convert(s_literal_doc, SHES_TO_HES, HES_TO_SHES, SHES, HES)
        if new_s_literal is not None \
                and f"{subj_fig.text} {ss_ambiguous}" in new_s_literal:
            return (f"{subj_fig.text} {ss_ambiguous}", s_figurative, new_s_literal)

    elif subj_fig.text == 'they' and not contains_they(s_literal_doc):
        new_s_literal = _convert_to_they(s_literal_doc, subj_lit.text)
        if new_s_literal is not None \
                and f"they {ss_ambiguous}" in new_s_literal:
            return (f"they {ss_ambiguous}", s_figurative, new_s_literal)

    elif subj_lit.text == 'they' and not contains_they(s_figurative_doc):
        new_s_figurative = _convert_to_they(s_figurative_doc, subj_fig.text)
        if new_s_figurative is not None \
                and f"they {ss_ambiguous}" in new_s_figurative:
            return (f"they {ss_ambiguous}", new_s_figurative, s_literal)

    return None

def contains_they(doc):
    """Tell whether any token of `doc` has a text listed in `THEYS` (exact, case-sensitive match)."""
    return any(token.text in THEYS for token in doc)
    

def _match_case(replacement, original):
    """Give `replacement` the capitalisation pattern (upper / capitalised / as-is) of `original`."""
    if len(original) > 1 and original.isupper():
        return replacement.upper()
    if original[:1].isupper():
        return replacement.capitalize()
    return replacement


def convert(doc, dict1, dict2, list1, list2):
    """Rewrite the pronouns of `doc` according to the given mapping tables.

    Each token is looked up case-insensitively, first in `dict1` and then in
    `dict2`, under (word, dependency label) and then under the wildcard key
    (word, '*'). A hit contributes the mapped word, re-capitalised like the
    original token; any other token is copied unchanged.

    The text is reassembled with every token's own trailing whitespace, so
    spacing around contractions, possessives, hyphens and punctuation is
    exactly that of the input rather than a space-joined approximation.

    Args:
        doc: iterable of tokens exposing `text`, `dep_` and `whitespace_`.
        dict1: primary mapping {(word, dep): (new_word, dep)}.
        dict2: secondary mapping of the same shape (may be empty).
        list1: words that must be converted if they occur.
        list2: further such words (may be empty).

    Returns:
        The rewritten text, or None when a word from `list1`/`list2` occurs
        with a dependency label that neither mapping covers.
    """
    pieces = []
    for token in doc:
        word = token.text.lower()
        candidate_keys = ((word, token.dep_), (word, '*'))

        replacement = None
        for mapping in (dict1, dict2):
            hit = next((key for key in candidate_keys if key in mapping), None)
            if hit is not None:
                replacement = mapping[hit][0]
                break

        if replacement is not None:
            piece = _match_case(replacement, token.text)
        elif word in list1 or word in list2:
            # A pronoun that should change but has no applicable rule: give up
            # rather than emit a sentence with mixed pronouns.
            return None
        else:
            piece = token.text

        pieces.append(piece + token.whitespace_)
    return ''.join(pieces)
    

In [22]:
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects so the row-wise pass
# below reports progress.
tqdm.pandas()
# Run `f` on every row (axis=1).
# NOTE(review): this rebinds `df`, so re-running the cell feeds the already
# processed rows back through `f`, which resets `converted` on every row.
# Re-run the loading cell first if this cell has to be executed again.
df = df.progress_apply(f, axis=1)

100%|██████████| 717/717 [00:06<00:00, 103.49it/s]


In [24]:
# Persist the processed table; index=False keeps the row index out of the file.
df.to_csv('gen_final_chatgpt.csv', index=False)

In [25]:
# Show only the rows whose `converted` flag is set.
df[df['converted']]

Unnamed: 0,idiom,meaning,intended_ambiguous,intended_figurative,intended_literal,iter,converted,intended_ambiguous_old,intended_figurative_old,intended_literal_old,annotations_ambiguous,annotations_figurative,annotations_literal,well_formed,maxvote - for sort,maxvote_ambiguous,maxvote_figurative,maxvote_literal
7,a lot on my plate,to have so many things to do at one time,was having a lot on the plate,"juggling multiple projects at work, she was ha...","at the potluck dinner, he was having a lot on ...",1,True,I have a lot on my plate,I have a lot on my plate with all the tasks I ...,I have a lot on my plate because I loaded it up,"['ambiguous', 'ambiguous', 'ambiguous']","['figurative', 'figurative', 'figurative']","['literal', 'literal', 'literal']",True,ambiguous : figurative : literal,ambiguous,figurative,literal
27,against the clock,in a great hurry to finish something before a ...,were racing against the clock,"with the deadline approaching, we were racing ...","in the unique competition, participants were r...",0,True,We are working against the clock,We are working against the clock to make sure ...,We are working against the clock because it's ...,"['figurative', 'ambiguous', 'ambiguous']","['literal', 'literal', 'figurative']","['literal', 'literal', 'literal']",False,ambiguous : literal : literal,ambiguous,literal,literal
30,ahead of the curve,better than others,was staying ahead of the curve,"by constantly learning new skills, she was sta...",the race car driver was staying ahead of the c...,0,True,She's ahead of the curve,"She's ahead of the curve, always thinking abou...","She's ahead of the curve, as her car was in fr...","['ambiguous', 'ambiguous', 'ambiguous']","['figurative', 'figurative', 'figurative']","['literal', 'literal', 'literal']",True,ambiguous : figurative : literal,ambiguous,figurative,literal
32,air their dirty laundry,to make public something embarrassing that sho...,they aired their dirty laundry,"during the family reunion, they aired their di...",they aired their dirty laundry outside on the ...,0,True,We air our dirty laundry,"We air our dirty laundry to other people, mean...",We air our dirty laundry on the clothesline,"['ambiguous', 'ambiguous', 'ambiguous']","['figurative', 'figurative', 'figurative']","['literal', 'literal', 'literal']",True,ambiguous : figurative : literal,ambiguous,figurative,literal
44,apples to apples,having a fair comparison between the two things,to compare apples to apples,"to make a fair judgment, we need to compare ap...","in the fruit market, we had to compare apples ...",0,True,Let's compare apples to apples,Let's compare apples to apples to see which pr...,Let's compare apples to apples to see which on...,"['ambiguous', 'ambiguous', 'ambiguous']","['figurative', 'ambiguous', 'ambiguous']","['literal', 'literal', 'literal']",False,ambiguous : ambiguous : literal,ambiguous,ambiguous,literal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674,wag the dog,to divert attention from something that is bad,was wagging the dog,the politician was wagging the dog by creating...,the playful puppy was wagging the dog by grabb...,0,True,He was wagging the dog,He was wagging the dog by trying to distract p...,He was wagging the dog when playing with his pet,"['ambiguous', 'ambiguous', 'ambiguous']","['figurative', 'figurative', 'figurative']","['literal', 'ambiguous', 'literal']",True,ambiguous : figurative : literal,ambiguous,figurative,literal
679,wash dirty linen in public,discuss publicly matters that one should keep ...,they washed their dirty linen in public,they washed their dirty linen in public by arg...,they washed their dirty linen in public at the...,0,True,She washed her dirty linen in public,She washed her dirty linen in public by arguin...,She washed her dirty linen in public at a laun...,"['ambiguous', 'ambiguous', 'ambiguous']","['figurative', 'figurative', 'figurative']","['literal', 'literal', 'literal']",True,ambiguous : figurative : literal,ambiguous,figurative,literal
683,way around,"to find an alternative for something, someone ...",he knows his way around,he knows his way around computers and can fix ...,"as a local, he knows his way around the city a...",0,True,"They found a way around it""",They found a way around it by doing the opposite,They found a way around it by going to the oth...,"['ambiguous', 'ambiguous', 'figurative']","['figurative', 'figurative', 'figurative']","['literal', 'ambiguous', 'literal']",True,ambiguous : figurative : literal,ambiguous,figurative,literal
694,whistle in the dark,to try to hide your fear in a scary or dangero...,was whistling in the dark,he was whistling in the dark when he tried to ...,"while walking through the forest at night, she...",0,True,We are whistling in the dark,We are whistling in the dark to try to hide ou...,We are whistling in the dark to help us find o...,"['ambiguous', 'ambiguous', 'ambiguous']","['figurative', 'figurative', 'figurative']","['ambiguous', 'literal', 'literal']",True,ambiguous : figurative : literal,ambiguous,figurative,literal


In [28]:
# Scratch check that `doc` is a spaCy Doc; isinstance is the idiomatic type
# test and also accepts subclasses.
# NOTE(review): `doc` is not assigned in any cell visible here -- presumably a
# leftover from an earlier session; define it before this cell or drop the cell.
print(isinstance(doc, spacy.tokens.doc.Doc))

True


In [20]:
# Scratch cell: print the first three tokens of `doc`, one per line.
# NOTE(review): `doc` is not assigned in any cell visible here, and this
# cell's execution count is out of sequence -- it will likely fail after a
# kernel restart.
for token in doc[0:3]:
    print(token)

She
looks
at
