# Replicate Bott & Noveck (2004) with GPT 3 and 4

### Compare human reaction-time data with LLM outputs to see which meaning LLMs assess first when processing scalar terms.

Exp1 (zero-shot)

"Evaluate whether this sentence is True or False"
-> decide whether to provide category membership or not.

Exp2 (one-shot: semantic/pragmatic instruction) 

pragmatic prompt: 
"The word 'some' can be understood in several ways. One way is to understand it as some but not all. Thus, a sentence like 'Some daffodils are flowers' should be considered false because, in fact, all daffodils are flowers. Now evaluate whether the following sentence is true or false."

semantic prompt:
"The word 'some' can be understood in several ways. One way is to understand it as some and possibly all. Thus, a sentence like 'Some daffodils are flowers' should be considered true, even though all daffodils are flowers. Now evaluate whether the following sentence is true or false."

-> get perplexity (correct answers) and compare surprisal

Exp3: Give true category membership and run base/sem/prag instructions.

In [1]:
import os
import openai
import json

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the OpenAI credentials from a two-line text file:
# line 1 is the organization id, line 2 is the API key.
# The context manager closes the file handle once the lines are read.
with open('../openai-keys.txt') as keys:
    lines = keys.readlines()
openai.organization = lines[0].rstrip()
openai.api_key = lines[1].rstrip()

In [3]:
#prompt format from Bott & Noveck (2004)

#base_prompt
#Evaluate whether the following sentence is true or false.
#prag_prompt
#The word 'some' can be understood in several ways. 
#One way is to understand it as some but not all. 
#Thus, a sentence like 'Some daffodils are flowers' should be considered false because, 
#in fact, all daffodils are flowers. 
#Now evaluate whether the following sentence is true or false. 

# sem_prompt
# The word 'some' can be understood in several ways.
# One way is to understand it as some and possibly all.
# Thus, a sentence like 'Some daffodils are flowers' should be
# considered true, even though we know that all daffodils are flowers.
# Now evaluate whether the following sentence is true or false.

In [4]:
def base_prompt(sentence):
    """
    Build the baseline (no extra instruction) true/false prompt.

    The fixed instruction text is followed by `sentence` wrapped in
    double quotation marks. Returns the full prompt as one string.
    """
    instructions = (
        "Evaluate whether the following sentence "
        "in the double quotation marks is true or false. "
        "Respond with only one word, either true or false. "
        "Do not include any new lines or empty spaces. "
    )
    # Only the final piece carries a placeholder, so format just that piece.
    quoted = "\"{sentence}\"".format(sentence=sentence)
    return instructions + quoted

In [5]:
def sem_prompt(sentence):
    """
    Build the true/false prompt with the semantic reading of 'some'.

    The prompt first explains 'some' as "some and possibly all", then
    gives the same answer-format instructions as the other prompts, and
    ends with `sentence` wrapped in double quotation marks.
    """
    # Explanation of the semantic ("some and possibly all") reading.
    reading = (
        "The word 'some' can be understood in several ways. " 
        "One way is to understand it as some and possibly all. " 
        "Thus, a sentence like 'Some daffodils are flowers' " 
        "should be considered true, even though all daffodils are flowers. " 
    )
    # Task and answer-format instructions.
    task = (
        "Now evaluate whether the following sentence in the "
        "double quotation marks is true or false. "
        "Respond with only one word, either true or false. "
        "Do not include any new lines or empty spaces. "
    )
    quoted = "\"{sentence}\"".format(sentence=sentence)
    return reading + task + quoted

In [6]:
def prag_prompt(sentence):
    """
    Build the true/false prompt with the pragmatic reading of 'some'.

    The prompt first explains 'some' as "some but not all", then gives
    the same answer-format instructions as the other prompts, and ends
    with `sentence` wrapped in double quotation marks.
    """
    # Explanation of the pragmatic ("some but not all") reading.
    reading = (
        "The word 'some' can be understood in several ways. " 
        "One way is to understand it as some but not all. " 
        "Thus, a sentence like 'Some daffodils are flowers' " 
        "should be considered false because, in fact, all daffodils are flowers. " 
    )
    # Task and answer-format instructions.
    task = (
        "Now evaluate whether the following sentence in the "
        "double quotation marks is true or false. "
        "Respond with only one word, either true or false. "
        "Do not include any new lines or empty spaces. "
    )
    quoted = "\"{sentence}\"".format(sentence=sentence)
    return reading + task + quoted

In [7]:
# Sanity check: render the pragmatic prompt for one example sentence.
prag_prompt('All lobsters are fish.')

'The word \'some\' can be understood in several ways. One way is to understand it as some but not all. Thus, a sentence like \'Some daffodils are flowers\' should be considered false because, in fact, all daffodils are flowers. Now evaluate whether the following sentence in the double quotation marks is true or false. Respond with only one word, either true or false. Do not include any new lines or empty spaces. "All lobsters are fish."'

In [8]:
def insert_gpt4(item):
    """
    Send `item` to GPT-4 as a single user message and return the first choice.

    The request goes to the "gpt-4-0613" chat model with temperature 0,
    one completion, at most 30 tokens, and "." and newline as stop
    sequences.

    Returns the first element of the response's 'choices' as a plain
    dictionary (nothing is printed).
    """
    request = {
        "model": "gpt-4-0613",
        "messages": [{"role": "user", "content": item}],
        "max_tokens": 30,
        "temperature": 0,  # argmax
        "n": 1,
        "stop": [".", "\n"],
    }
    response = openai.ChatCompletion.create(**request)

    first_choice = response.to_dict()['choices'][0]
    return first_choice.to_dict()

In [9]:
def insert_gpt3(item):
    """
    Send `item` to GPT-3 as a completion prompt and return the first choice.

    The request goes to "text-davinci-003" with "." as the suffix,
    temperature 0, one completion, at most 30 tokens and logprobs=1.
    No stop sequence is passed here.

    Returns the first element of the response's 'choices' as a plain
    dictionary (nothing is printed).
    """
    request = {
        "model": "text-davinci-003",
        "prompt": item,
        "suffix": ".",
        "max_tokens": 30,
        "temperature": 0,  # argmax
        "n": 1,
        "logprobs": 1,
    }
    response = openai.Completion.create(**request)

    first_choice = response.to_dict()['choices'][0]
    return first_choice.to_dict()

In [10]:
def inserted_prob_gpt3(experiment_item):
    """
    Query GPT-3 and return the log probabilities of its true/false tokens.

    `experiment_item` is sent as the prompt to "text-davinci-003" with
    "." as both suffix and stop sequence, temperature 0 and logprobs=1.

    Returns a list of dictionaries, one per generated position whose
    token is "true" or "false". Matching ignores letter case and
    surrounding whitespace, so tokens such as " True" or "FALSE" are
    kept. Each dictionary maps the token to its log probability. The
    list is empty when no such token was generated.
    """

    output = openai.Completion.create(
        model = "text-davinci-003",
        prompt = experiment_item,
        suffix = ".",
        max_tokens = 30,
        temperature = 0,
        n = 1,
        logprobs = 1,
        stop = ["."]
        )

    # ["top_logprobs"] pairs each output token with its logprob: every entry
    # is a mapping with the output token as key and the logprob as value.
    toplogprob = output.to_dict()['choices'][0].to_dict()['logprobs'].to_dict()["top_logprobs"]

    answers = {'true', 'false'}
    response = []

    for entry in toplogprob:
        # Normalise each token before comparing. The previous exact-key test
        # (with a no-op 'false'.casefold()) dropped tokens that carried
        # leading whitespace or unexpected capitalisation.
        if any(token.strip().casefold() in answers for token in entry):
            response.append(entry.to_dict())

    return response

In [11]:
def run_gpt4(filename):
    """
    Run every sentence of ``<filename>.csv`` through GPT-4 under all prompts.

    The CSV must contain a 'sentence' column. Each sentence is sent to
    GPT-4 three times: with the base, the semantic and the pragmatic
    prompt, in that order. The replies are stored in the new columns
    'base_res', 'sem_res' and 'prag_res'.

    The augmented table is written to ``<filename>_gpt4_output.csv`` and
    also returned as a DataFrame.
    """
    data_path = "{FILE}.csv".format(FILE=filename)
    df_item = pd.read_csv(data_path)

    # Output column -> prompt builder, in the order the requests are made.
    conditions = (
        ("base_res", base_prompt),
        ("sem_res", sem_prompt),
        ("prag_res", prag_prompt),
    )
    responses = {column: [] for column, _ in conditions}

    # The earlier version also read row['type'] into a list that was appended
    # to itself and never used; that dead bookkeeping is gone. The 'type'
    # column, if present in the CSV, is still carried through in df_item.
    for _, row in tqdm(df_item.iterrows(), total=df_item.shape[0]):
        for column, build_prompt in conditions:
            reply = insert_gpt4(build_prompt(row['sentence']))
            responses[column].append(reply['message']['content'])

    for column, _ in conditions:
        df_item[column] = responses[column]

    df_item.to_csv("{TASK}_gpt4_output.csv".format(TASK=filename))

    return df_item


In [12]:
def run_gpt3(filename):
    """
    Run every sentence of ``<filename>.csv`` through GPT-3 under all prompts.

    The CSV must contain a 'sentence' column. For each sentence and each
    prompt (base, semantic, pragmatic, in that order) two requests are
    made: `insert_gpt3` for the text reply and `inserted_prob_gpt3` for
    the log probability of the true/false token.

    New columns:
      * 'base_res', 'sem_res', 'prag_res': the stripped text replies.
      * 'base_prob', 'sem_prob', 'prag_prob': the values view of the first
        true/false token entry, or None when the reply contained no such
        token.

    The augmented table is written to ``<filename>_gpt3_output.csv`` and
    also returned as a DataFrame.
    """
    data_path = "{FILE}.csv".format(FILE=filename)
    df_item = pd.read_csv(data_path)

    # Column prefix -> prompt builder, in the order the requests are made.
    conditions = (
        ("base", base_prompt),
        ("sem", sem_prompt),
        ("prag", prag_prompt),
    )
    responses = {prefix: [] for prefix, _ in conditions}
    logprobs = {prefix: [] for prefix, _ in conditions}

    # The earlier version also read row['type'] into a list that was appended
    # to itself and never used; that dead bookkeeping is gone. The 'type'
    # column, if present in the CSV, is still carried through in df_item.
    for _, row in tqdm(df_item.iterrows(), total=df_item.shape[0]):
        for prefix, build_prompt in conditions:
            prompt = build_prompt(row['sentence'])
            reply = insert_gpt3(prompt)
            matches = inserted_prob_gpt3(prompt)

            responses[prefix].append(reply['text'].strip())
            # Record None instead of raising IndexError when no true/false
            # token was found, so one odd reply does not discard the results
            # already collected. The stored value is still the dict's values
            # view, which keeps the CSV cell format unchanged.
            logprobs[prefix].append(matches[0].values() if matches else None)

    for prefix, _ in conditions:
        df_item[prefix + '_res'] = responses[prefix]
        df_item[prefix + '_prob'] = logprobs[prefix]

    df_item.to_csv("{TASK}_gpt3_output.csv".format(TASK=filename))

    return df_item