### Checking model calibration on biased random few-shot contexts

In [28]:
from abc import ABC, abstractmethod
import os
from typing import Any, Dict, List, Set, Tuple, Union
import time
import openai
import requests
import random
import math

# OpenAI organization IDs, keyed by a short human-readable label.
_org_ids = {
    "Isaac": "org-m4JAuDSjZRa4yOfHtmmsUvsh",
    "Kei": "org-BrCtJvxjttlWgcJ2C5nWKQx3",
    "NYU": "org-rRALD2hkdlmLWNVCKk9PG5Xq",
    "FAR": "org-AFgHGbU3MeFr5M5QFwrBET31",
}
# Module-level side effect: sets the organization on the imported `openai`
# module. Change the key here to select a different entry from `_org_ids`.
openai.organization = _org_ids["FAR"] # which to use

In [32]:
def _get_completion_single_call(prompt: str, model_name: str, openai_kwargs: Union[Dict[str, Any], None] = None) -> Any:
    """Issue a single ``openai.Completion.create`` request and return its result.

    No retrying or error handling happens here; any exception raised by the
    call propagates to the caller.

    Args:
        prompt: Text passed as the ``prompt`` argument.
        model_name: Passed as the ``model`` argument.
        openai_kwargs: Extra keyword arguments forwarded verbatim to
            ``openai.Completion.create``. ``None`` (the default) forwards
            nothing extra.

    Returns:
        Whatever ``openai.Completion.create`` returns (the raw response
        object, not a plain string).
    """
    # A `None` default avoids sharing one mutable dict across all calls.
    extra_kwargs = {} if openai_kwargs is None else openai_kwargs
    return openai.Completion.create(
        model=model_name,
        prompt=prompt,
        **extra_kwargs
    )

def get_completion_with_retry(prompt: str, model_name: str, openai_kwargs: Union[Dict[str, Any], None] = None) -> Any:
    """Request a completion, retrying on transient failures until one succeeds.

    Timeouts, service-unavailable errors and rate-limit errors are retried
    indefinitely with a growing sleep between attempts. Any other exception
    propagates to the caller.

    Args:
        prompt: Text passed through to ``_get_completion_single_call``.
        model_name: Model identifier passed through unchanged.
        openai_kwargs: Extra keyword arguments for the underlying request.
            ``None`` (the default) means no extra arguments.

    Returns:
        The response returned by ``_get_completion_single_call``.
    """
    # Normalize here so the helper always receives a real dict, and so no
    # mutable default is shared across calls.
    request_kwargs = {} if openai_kwargs is None else openai_kwargs

    completion = None
    backoff_time = 0.1
    backoff_factor = 1.5
    while completion is None:
        try:
            completion = _get_completion_single_call(prompt, model_name, request_kwargs)
        except (
            requests.exceptions.Timeout,
            openai.error.ServiceUnavailableError,
            openai.error.RateLimitError,
        ) as err:
            if isinstance(err, openai.error.RateLimitError):
                # Progress marker: one "R" per rate-limited attempt.
                print("R", end="", flush=True)
            time.sleep(backoff_time)
            # Stop growing the delay once it has reached 3 seconds or more.
            if backoff_time < 3:
                backoff_time *= backoff_factor
    return completion

In [35]:
def generate_random_context(num_a, num_b):
    """Build a few-shot prompt whose example answers are a shuffled mix of A and B.

    The prompt contains ``num_a`` examples answered "A" and ``num_b`` examples
    answered "B" in random order (via ``random.shuffle``), followed by a final
    question whose answer is left blank for the model to complete.

    Args:
        num_a: Number of few-shot examples answered "A".
        num_b: Number of few-shot examples answered "B".

    Returns:
        The full prompt string, ending with "Answer:" and no trailing newline.
    """
    answers = ['A'] * num_a + ['B'] * num_b
    random.shuffle(answers)

    shots = [
        "Question: What is the correct answer?\n" + "Answer: " + answer + "\n"
        for answer in answers
    ]
    final_query = "Question: Is the answer A or B?\n" + "Answer:"
    return "".join(shots) + final_query

def calc_total_a_b_probs(logprobs_dict):
    """Sum the probability mass assigned to answer tokens A and B.

    Each letter is looked up both with a leading space (' A', ' B') and
    without one ('A', 'B'); the log-probabilities of whichever variants are
    present are exponentiated and added per letter.

    Args:
        logprobs_dict: Mapping from token string to log-probability.

    Returns:
        A tuple ``(a_prob, b_prob)``. A letter with no matching token
        contributes the integer 0.
    """
    totals = {'A': 0, 'B': 0}
    # (token, letter) pairs: space-prefixed variants first, then bare letters.
    token_to_letter = ((' A', 'A'), (' B', 'B'), ('A', 'A'), ('B', 'B'))
    for token, letter in token_to_letter:
        if token not in logprobs_dict:
            continue
        totals[letter] += math.exp(logprobs_dict[token])

    return totals['A'], totals['B']

def test_model_incontext_calibration(model_name, num_episodes, context_size=20, step=2):
    """Measure how a model's P(A) tracks the fraction of "A" answers in its context.

    For each count of "A" examples from 0 to ``context_size`` (in increments
    of ``step``), this builds ``num_episodes`` random few-shot prompts,
    queries the model for the top-10 log-probabilities of the first completion
    token, and averages the normalized probability of "A",
    ``a_prob / (a_prob + b_prob)``.

    Args:
        model_name: Model identifier passed to ``get_completion_with_retry``.
        num_episodes: Number of random prompts sampled per A/B mix.
        context_size: Total number of few-shot examples in each prompt.
        step: Increment applied to the number of "A" examples between mixes.

    Returns:
        A list with one mean normalized A-probability per A/B mix, ordered by
        increasing number of "A" examples. An entry is NaN when no episode for
        that mix assigned any probability to A or B among the returned
        tokens (including when ``num_episodes`` is 0).
    """
    a_frac_prob_list = []
    for num_a in range(0, context_size + 1, step):
        print(num_a)  # progress indicator: one line per A/B mix
        num_b = context_size - num_a
        a_frac_prob_sum = 0
        scored_episodes = 0
        for _ in range(num_episodes):
            my_context = generate_random_context(num_a, num_b)
            completion = get_completion_with_retry(my_context, model_name, openai_kwargs={"logprobs":10})
            # Log-probabilities of the candidates for the first generated token.
            top_logprobs = completion["choices"][0]["logprobs"]["top_logprobs"][0]
            a_prob, b_prob = calc_total_a_b_probs(top_logprobs)
            ab_mass = a_prob + b_prob
            if ab_mass == 0:
                # Neither A nor B was among the returned tokens; the ratio is
                # undefined, so leave this episode out of the average.
                continue
            a_frac_prob_sum += a_prob / ab_mass
            scored_episodes += 1

        if scored_episodes:
            a_frac_prob_list.append(a_frac_prob_sum / scored_episodes)
        else:
            a_frac_prob_list.append(float("nan"))
    return a_frac_prob_list


In [39]:
# Run the in-context calibration experiment for three models, with 300
# episodes per A/B mix. Each call prints the current number of "A" examples
# as it progresses and issues one API request per episode.
text_davinci_003_list = test_model_incontext_calibration("text-davinci-003", 300)

text_davinci_002_list = test_model_incontext_calibration("text-davinci-002", 300)

davinci_list = test_model_incontext_calibration("davinci", 300)

0
