In [2]:
import openai
import re
import config
import os
openai.organization = config.openai_organization
openai.api_key = config.openai_api_key
import pandas as pd
RANDOM_STATE = 42
idioms = pd.read_csv("first_batch_noBadIdioms.csv")

In [3]:
os.chdir('./chatgpt_prompts/')
with open('msg_0_system.txt', 'r') as msg_0_system:
    msg_0_system = msg_0_system.read()

with open('msg_1_user.txt', 'r') as msg_1_user:
    msg_1_user = msg_1_user.read()

with open('msg_1_assistant.json', 'r') as msg_1_assistant:
    msg_1_assistant = msg_1_assistant.read()

with open('msg_2_user.txt', 'r') as msg_2_user:
    msg_2_user = msg_2_user.read()

with open('msg_2_assistant.json', 'r') as msg_2_assistant:
    msg_2_assistant = msg_2_assistant.read()

with open('msg_3_user.txt', 'r') as msg_3_user:
    msg_3_user = msg_3_user.read()

with open('msg_3_assistant.json', 'r') as msg_3_assistant:
    msg_3_assistant = msg_3_assistant.read()

default_messages = [
    {
        "role": "system",
        "content": msg_0_system,
    },
    {
        "role": "user",
        "content": msg_1_user,
    },
    {
        "role": "assistant",
        "content": msg_1_assistant,
    },
    {
        "role": "user",
        "content": msg_2_user,
    },
    {
        "role": "assistant",
        "content": msg_2_assistant,
    },
    {
        "role": "user",
        "content": msg_3_user,
    },
    {
        "role": "assistant",
        "content": msg_3_assistant,
    },
]
os.chdir("../")

In [9]:
import json

def generate(row):
    if not row['error'] and not row['well_formed']:
        return row
    # if row['well_formed']: 
    #     return row
    try:
        idiom = row['idiom']
        messages = default_messages + [{
            "role": "user",
            "content": f"IDIOM: {idiom}"
        }]
        return generate_helper(row, messages)
    except: 
        row['error'] = True
        return row
    # idiom = row['idiom']
    # messages = default_messages + [{
    #     "role": "user",
    #     "content": f"IDIOM: {idiom}"
    # }]
    # return generate_helper(row, messages)

def generate_helper(row, messages, i=0):
    if i == 5: return row
    text = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
        max_tokens=512,
        temperature=0.1, # the higher this value, the less deterministic
        top_p=1, # the higher this value, the wider range of vocab is used
    ).choices[0].message.content.strip()

    try:
        response = json.loads(text)
        row['intended_ambiguous_retrial'] = response['AMBIGUOUS'].strip()
        row['intended_figurative_retrial'] = response['FIGURATIVE'].strip()
        row['intended_literal_retrial'] = response['LITERAL'].strip()
    except:
        row['not_found'] = True
        return row

    # Check ambiguous subsentence inclusion
    if row['intended_ambiguous_retrial'] not in row['intended_figurative_retrial']:
        messages = messages + [
            { "role": "assistant", "content": text },
            {
                "role": "user",
                "content": "The AMBIGUOUS phrase must be an exact substring of the FIGURATIVE sentence, but yours isn't. Rewrite your AMBIGUOUS phrase, FIGURATIVE sentence, and LITERAL sentence accordingly.",
            },
        ]
        return generate_helper(row, messages, i+1)
    elif row['intended_ambiguous_retrial'] not in row['intended_literal_retrial']:
        messages = messages + [
            { "role": "assistant", "content": text },
            {
                "role": "user",
                "content": "The AMBIGUOUS phrase must be an exact substring of the LITERAL sentence, but yours isn't. Rewrite your AMBIGUOUS phrase, FIGURATIVE sentence, and LITERAL sentence accordingly.",
            },
        ]
        return generate_helper(row, messages, i+1)
    
    # Ban the metnions of certain substrings
    banned_substrings = ["literal", "figurative", "ambiguous", "physical", "idiomatic"]
    for substring in banned_substrings:
        if substring in row['intended_ambiguous_retrial']:
            print(row['intended_ambiguous_retrial'])
            messages = messages + [
                { "role": "assistant", "content": text },
                {
                    "role": "user",
                    "content": f"One of the banned words \"{substring}\" was mentioned in your AMBIGUOUS phrase; you should never explicitly state these labels in your sentences. Rewrite your AMBIGUOUS phrase, FIGURATIVE sentence, and LITERAL sentence accordingly.",
                },
            ]
            return generate_helper(row, messages, i+1)
        
        elif substring in row['intended_figurative_retrial']:
            print(row['intended_figurative_retrial'])
            messages = messages + [
                { "role": "assistant", "content": text },
                {
                    "role": "user",
                    "content": f"One of the banned words \"{substring}\" was mentioned in your FIGURATIVE sentence; you must not explicitly state these labels in your sentences. Rewrite your AMBIGUOUS phrase, FIGURATIVE sentence, and LITERAL sentence accordingly.",
                },
            ]
            return generate_helper(row, messages, i+1)
        
        elif substring in row['intended_literal_retrial']:
            print(row['intended_literal_retrial'])
            messages = messages + [
                { "role": "assistant", "content": text },
                {
                    "role": "user",
                    "content": f"One of the banned words \"{substring}\" was mentioned in your LITERAL sentence; you must not explicitly state these labels in your sentences. Rewrite your AMBIGUOUS phrase, FIGURATIVE sentence, and LITERAL sentence accordingly.",
                },
            ]
            return generate_helper(row, messages, i+1)
        
    # Ban the word "because"
    if "because" in row['intended_figurative_retrial']:
        print(row['intended_figurative_retrial'])
        messages = messages + [
            { "role": "assistant", "content": text },
            {
                "role": "user",
                "content": f"In your FIGURATIVE sentence, avoid using the word \"because\" since using it tends to create a FIGURATIVE sentence that simply re-states the figurative definition of the idiom without putting much effort in creating a good sentence. Rewrite your AMBIGUOUS phrase, FIGURATIVE sentence, and LITERAL sentence accordingly.",
            },
        ]
        return generate_helper(row, messages, i+1)
    elif "because" in row['intended_literal_retrial']:
        print(row['intended_literal_retrial'])
        messages = messages + [
            { "role": "assistant", "content": text },
            {
                "role": "user",
                "content": f"In your LITERAL sentence, avoid using the word \"because\" since using it tends to create a LITERAL sentence that simply re-states the literal definition of the idiom without putting much effort in creating a good sentence. Rewrite your AMBIGUOUS phrase, FIGURATIVE sentence, and LITERAL sentence accordingly.",
            },
        ]
        return generate_helper(row, messages, i+1)

    row["iter"] = i
    return row

In [10]:
from tqdm import tqdm
tqdm.pandas()
idioms = pd.read_csv("instances_retrial.csv")
# result = idioms[idioms["well_formed"]
#     ].sample(frac=1, random_state=RANDOM_STATE)[:50
#     ].progress_apply(generate, axis=1)
result = idioms.progress_apply(generate, axis=1)

  1%|          | 9/731 [01:31<2:00:28, 10.01s/it]

I <have a lot on my plate> because I took too much food at the buffet


  6%|▌         | 42/731 [06:27<1:29:32,  7.80s/it]

he couldn't sit still during the meeting because he <had ants in his pants>


 11%|█▏        | 83/731 [11:04<56:52,  5.27s/it]  

people thought he <had bats in the belfry> because of his eccentric ideas


 16%|█▋        | 119/731 [15:51<1:08:11,  6.69s/it]

he was always considered <the black sheep of the family> because of his rebellious nature


 23%|██▎       | 170/731 [24:16<1:23:02,  8.88s/it]

he <can't stand the sight of that> person because of their past disagreements


 23%|██▎       | 171/731 [24:43<1:56:32, 12.49s/it]

as a magician, he literally <had a card up his sleeve> for his next trick


 43%|████▎     | 317/731 [41:57<59:58,  8.69s/it]  

after the test, he <got a goose egg> because he didn't study


 45%|████▌     | 329/731 [43:12<39:29,  5.89s/it]  

I can't pick up the phone because <my hands are tied> with this rope


 46%|████▌     | 334/731 [44:34<1:19:24, 12.00s/it]

he <had clean hands after the incident> because he wasn't involved in the wrongdoing


 49%|████▉     | 359/731 [48:59<46:55,  7.57s/it]  

the two athletes, who were also roommates, were literally <in bed with the competition>


 56%|█████▌    | 406/731 [55:41<53:53,  9.95s/it]  

he always agrees with the boss because he <knows which side his bread is buttered>


 62%|██████▏   | 453/731 [1:02:46<31:50,  6.87s/it]  

I have <miles to go before I sleep> because my destination is still far away


 63%|██████▎   | 459/731 [1:04:21<49:56, 11.02s/it]  

I can't help you with the project because <my hands are full> right now with other tasks


 64%|██████▍   | 470/731 [1:06:34<48:31, 11.16s/it]  

during the card game, it <seemed like he wasn't playing with a full deck> because some cards were missing


 67%|██████▋   | 487/731 [1:08:55<25:54,  6.37s/it]

the chair was so small that they were literally <on the edge of their seat>


 72%|███████▏  | 523/731 [1:14:34<27:12,  7.85s/it]

during the physical therapy session, the therapist was <pulling my leg> to stretch my muscles


 84%|████████▍ | 614/731 [1:32:47<28:37, 14.68s/it]

everyone trusts him because he's <a straight shooter> and always tells the truth


 89%|████████▊ | 647/731 [1:39:50<16:56, 12.10s/it]

she always wakes up at 5 am because she believes that <the early bird catches the worm>


 95%|█████████▍| 691/731 [1:52:04<26:02, 39.07s/it]

he <walked out on the meeting> because he couldn't handle the pressure


 99%|█████████▉| 725/731 [1:58:50<01:05, 10.88s/it]

the dog trainer struggled with <teaching an old dog new tricks> because the dog was stubborn


100%|██████████| 731/731 [2:00:14<00:00,  9.87s/it]


In [25]:
result.to_csv('instances_retrial_2.csv', index=False)

In [23]:
result[result.isna().any(axis=1)]

Unnamed: 0,annotations_ambiguous,annotations_figurative,annotations_literal,error,idiom,intended_ambiguous,intended_ambiguous_retrial,intended_figurative,intended_figurative_retrial,intended_literal,intended_literal_retrial,iter,maxvote - for sort,maxvote_ambiguous,maxvote_figurative,maxvote_literal,meaning,not_found,well_formed


In [21]:
result.loc[686] = generate(result.loc[686])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['intended_ambiguous_retrial'] = response['AMBIGUOUS'].strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['intended_figurative_retrial'] = response['FIGURATIVE'].strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['intended_literal_retrial'] = response['LITERAL'].strip()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

In [22]:
result.loc[686]

annotations_ambiguous                    ['ambiguous', 'ambiguous', 'ambiguous']
annotations_figurative                ['figurative', 'figurative', 'figurative']
annotations_literal                            ['literal', 'literal', 'literal']
error                                                                       True
idiom                                                    voice in the wilderness
intended_ambiguous                              He was a voice in the wilderness
intended_ambiguous_retrial                           <a voice in the wilderness>
intended_figurative            He was a voice in the wilderness offering advi...
intended_figurative_retrial    his environmental concerns were <a voice in th...
intended_literal               He was a voice in the wilderness shouting from...
intended_literal_retrial       while hiking alone, she suddenly heard <a voic...
iter                                                                         0.0
maxvote - for sort          