In [1]:
import os
import openai

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
openai.organization = #insert yours
openai.api_key = #insert yours

In [3]:
def insert_output_gpt3(experiment_item):
    """Print out the whole output entry."""
    output = openai.Completion.create(
        model = "text-davinci-003",
        prompt = experiment_item,
        suffix = ".",
        max_tokens=20,
        temperature=0,
        top_p= 1,
        n= 1,
        frequency_penalty=0,
        presence_penalty=0,
        stream= False,
        logprobs= 1,
        stop= "\n"
        )
        
    return output

In [4]:
def inserted_prob_gpt3(experiment_item):
    """Print out the log probability for the inserted tokens."""
    output = openai.Completion.create(
        model = "text-davinci-003",
        prompt = experiment_item,
        suffix = ".",
        max_tokens=20,
        temperature=0,
        top_p= 1,
        n= 1,
        frequency_penalty=0,
        presence_penalty=0,
        stream= False,
        logprobs= 1,
        stop= "\n"
        )
    
    toplogprob = output.to_dict()['choices'][0].to_dict()['logprobs'].to_dict()["top_logprobs"]
    
    response = []
    
    for i in range(0,len(toplogprob)):
        response.append(toplogprob[i].to_dict())
        
    return response

In [5]:
def entropy(q):
    entropy = -(q * np.log2(q) + (1-q) * np.log2(1-q))
    return entropy

In [6]:
#GPT-3 uses BPE and unknown characters (usually non-English languages, special characters, etc.) appear opaque.
#This function extracts inserted words by grabbing the BPE tokens, then calculates the entropy of the inserted token. 

def token_entropy_gpt3(toplogprob):
    
    token_prob = []
    
    i = 0 
    while i < len(toplogprob):
        if r'.' in toplogprob[i].keys():
            break    
            i +=1
        else:
            token_prob.append(*toplogprob[i].values())
            i +=1
    
    token_entropy = []
    
    for logprob in token_prob:
        token_entropy.append(entropy(np.e**(logprob)))
    
    return sum(token_entropy)

In [7]:
def inserted_text_gpt3(experiment_item):
    """Print out the inserted tokens in a decoded text format."""
    output = openai.Completion.create(
        model = "text-davinci-003",
        prompt = experiment_item,
        suffix = ".",
        max_tokens=20,
        temperature=0,
        top_p= 1,
        n= 1,
        frequency_penalty=0,
        presence_penalty=0,
        stream= False,
        logprobs= 1,
        stop= "\n"
        )
    
    return output.choices[0]['text']

In [8]:
example = inserted_prob_gpt3("민수가 라면을 먹었다. 소희가 라면을 ")

In [9]:
example

[{'bytes:\\xeb': -0.012674714},
 {'bytes:\\xa8': -0.010898419},
 {'bytes:\\xb9': -4.573365e-05},
 {'bytes:\\xec': -0.021624932},
 {'bytes:\\x97': -0.06579405},
 {'bytes:\\x88': -0.00019869342},
 {'bytes:\\xeb\\x8b': -0.029567312},
 {'bytes:\\xa4': -4.277735e-07},
 {'.': -0.027864005},
 {'\n': -0.019333899},
 {'Min': -0.8997774},
 {'-': -0.4776559},
 {'su': -0.48117903},
 {' ate': -0.046376396},
 {' ram': -0.18633811}]

In [10]:
token_entropy_gpt3(example)

0.8684050250428971

In [11]:
def main(filename):
    """Run GPT-3 on experiment items."""
    
    data_path = "{FILE}.csv".format(FILE=filename)
    df_output = pd.read_csv(data_path)
    
    prob = []
    text = []
    cond = []
    item = []
    entropy = []

    with tqdm(total=df_output.shape[0]) as pbar:   
        for index, row in df_output.iterrows():
        
            condition = row['Condition name']
            cond.append(condition)
        
            itemindex = row['Lexicalization']
            item.append(itemindex)
        
            sentence = row['Item']
            p = inserted_prob_gpt3(sentence)
            prob.append(p)
    
            t = inserted_text_gpt3(sentence)
            text.append(t)
            
            ent = token_entropy_gpt3(p)
            entropy.append(ent)
        
            pbar.update(1)
        
    df_output['log_prob'] = prob
    df_output['response'] = text
    df_output['entropy'] = entropy
    
    df_output.to_csv("{TASK}_output.csv".format(TASK=filename))
    
    return df_output

In [12]:
sample = main("sample_gpt3_input")

  entropy = -(q * np.log2(q) + (1-q) * np.log2(1-q))
  entropy = -(q * np.log2(q) + (1-q) * np.log2(1-q))
100%|█████████████████████████████████████████████| 5/5 [00:08<00:00,  1.73s/it]


In [13]:
sample['log_prob'][2]

[{'bytes:\\xeb': -0.706179},
 {'bytes:\\x81': -1.0260508},
 {'bytes:\\x84': -0.3326238},
 {'bytes:\\xeb': -0.8584608},
 {'bytes:\\x8a': -0.45750463},
 {'bytes:\\x94': -0.0057951147},
 {'bytes: \\xeb': -0.9150861},
 {'bytes:\\xb0': -0.2612375},
 {'bytes:\\xa9': -0.00036503928},
 {'bytes:\\xeb': -0.0015714729},
 {'bytes:\\xb2': -0.00023535996},
 {'bytes:\\x95': -0.00033534507},
 {'\n': -0.26113656},
 {'\n': -0.020923948},
 {'1': -0.6555963},
 {'.': -0.009052616}]