### VE Implementation Testing

### Module 0: Installing Libraries

In [2]:
import pandas as pd
import pprint
import os 
from time import time 
from dotenv import load_dotenv
import json
import argparse

from openai import OpenAI
import certifi

In [3]:
# Override bad SSL_CERT_FILE if set
os.environ["SSL_CERT_FILE"] = certifi.where()

In [4]:
load_dotenv()

True

In [5]:
# !pip3 install textblob

In [6]:
## Importing VE libraries
from utils import *
from dataset_utils import read_wikiqa_data

### Module 1: Few Shot answer generations

(Using Code from few_shot.py)

In [72]:
def in_context_prediction(ex, shots, engine, style="standard", length_test_only=False, n=1):

    if style == "standard":
        showcase_examples = [
            "Q: {}\nA: {}\n".format(s["question"], s["answer"]) for s in shots
        ]
        input_example = "Q: {}\nA:".format(ex["question"])
        prompt = "\n".join(showcase_examples + [input_example])
    else:
        raise RuntimeError("Unsupported prompt style")

    temp = 0.7 if n > 1 else 0.0

    try:
        resp = openai.Completion.create(
            engine=engine, prompt=prompt, max_tokens=32, stop='\n',
            temperature=temp, logprobs=5, echo=True, n=n
        )

        pred = resp if n > 1 
    except Exception as e:
        print(f'Encountered Error {e}')

    if n > 1:
        return pred
    else:
        return resp["choices"][0]

    # pred["id"] = ex["id"]
    # pred["prompt"] = prompt
    # try:
    #     if len(pred["text"]) > len(prompt):
    #         pred["text"] = pred["text"][len(prompt):]
    #     else:
    #         pred["text"] = "null"
    #     return pred
    # except:
    #     return None

In [4]:
### Original Function, will dissect it into multiple 

def test_few_shot_performance(args):
    print("Running prediction")
    train_set = read_wikiqa_data(f"data/train_subset.json")
    train_set = train_set[args.train_slice:(args.train_slice + args.num_shot)]
    dev_set = read_wikiqa_data(f"data/dev_sampled.json")
    dev_set = dev_set[args.dev_slice:(args.num_dev)]

    showcase_examples = [
        "Q: {}\nA: {}\n".format(s["question"], s["answer"]) for s in train_set
    ]
    prompt = "\n".join(showcase_examples)
    print('prompt: ')
    print(prompt)
    
    if os.path.exists(result_cache_name(args)) and not args.run_length_test:
        predictions = read_json(result_cache_name(args))
    else:
        predictions = []
        for x in tqdm(dev_set, total=len(dev_set), desc="Predicting"):
            pred = in_context_prediction(x, train_set, engine=args.engine, \
                style=args.style, length_test_only=args.run_length_test)
            if pred == None:
                args.num_dev = len(predictions)
                break
            else:
                predictions.append(pred)

        if args.run_length_test:
            print(result_cache_name(args))
            print('MAX', max(predictions), 'COMP', 32)
            return
        # save
        dump_json(predictions, result_cache_name(args))
    # acc
    for p in predictions:
        p['answer_prob'] = calc_fewshot_pred_with_prob(p, args.style) 
    evaluate_few_shot_predictions(dev_set, predictions, do_print=True)


### Sub-sampling 500 records from the VE test data 

In [41]:
df =pd.read_json("data/dev_sampled.json")
df.shape

(1000, 10)

In [40]:
df.head()

Unnamed: 0,_id,type,question,context,entity_ids,supporting_facts,evidences,answer,evidences_id,answer_id
284,604cd3220bdd11eba7f7acde48001122,compositional,What is the award that the director of film Pu...,"[[Etan Boritzer, [Etan Boritzer( born 1950) is...",Q7258905_Q3291382,"[[Pugachev (1978 film), 0], [Alexey Saltykov (...","[[Pugachev, director, Aleksei Saltykov], [Alek...",People's Artist of the RSFSR,"[[Q7258905, director, Q3291382], [Q3291382, aw...",Q47024
785,6108c3a7097e11ebbdb0ac1f6bf848b6,comparison,Are both Les Tortillards and Trivial (Film) fr...,"[[One and Five, [One and Five is a 1969 short ...",Q3235772_Q2987804,"[[Les Tortillards, 0], [Trivial (film), 0]]","[[Les Tortillards, country of origin, French],...",yes,"[[Q3235772, country of origin, Q142], [Q298780...",
292,c5d56fe60bd911eba7f7acde48001122,compositional,"What is the date of death of Lancelot Lowther,...","[[Humphrey de Bohun, 7th Earl of Hereford, [Hu...",Q6483626_Q5725106,"[[Lancelot Lowther, 6th Earl of Lonsdale, 0], ...","[[Lancelot Edward Lowther, 6th Earl of Lonsdal...",15 August 1876,"[[Q6483626, father, Q5725106], [Q5725106, date...",
746,eb63d1060bae11ebab90acde48001122,inference,Who is the father-in-law of Marcus Vinicius (C...,"[[Marcus Vinicius (consul 30), [Marcus Viniciu...",Q1247301_Q237629,"[[Marcus Vinicius (consul 30), 0], [Julia Livi...","[[Marcus Vinicius, spouse, Julia Livilla], [Ju...",Germanicus,"[[Q1247301, spouse, Q237629], [Q237629, father...",Q191039
624,728d56420bdd11eba7f7acde48001122,compositional,What is the place of birth of the director of ...,"[[Jesse E. Hobson, [Jesse Edward Hobson( May 2...",Q7737654_Q28998,"[[The Great Awakening (film), 0], [Reinhold Sc...","[[The Great Awakening, director, Reinhold Schü...",Hamburg,"[[Q7737654, director, Q28998], [Q28998, place ...",Q1055


In [35]:
df['type'].value_counts()

type
compositional        446
bridge_comparison    226
comparison           218
inference            110
Name: count, dtype: int64

In [37]:
df_sample = df.sample(500)
df_sample.shape

(500, 10)

In [38]:
df_sample.to_json("data/dev_sampled_500.json")

### Executing Few shot function

In [9]:
''' From VE code'''

# def _parse_args():
#     parser = argparse.ArgumentParser()
#     add_engine_argumenet(parser)
#     # standard, instruction, etc
#     parser.add_argument('--style', type=str, default="standard")
#     parser.add_argument('--annotation', type=str, default="std")
#     parser.add_argument('--run_prediction', default=False, action='store_true')
#     parser.add_argument('--run_length_test', default=False, action='store_true')
#     parser.add_argument('--num_shot', type=int, default=6)
#     parser.add_argument('--train_slice', type=int, default=0)
#     parser.add_argument('--num_dev', type=int, default=1000) # firs 58 for calibrating, last 250 for testing
#     parser.add_argument('--dev_slice', type=int, default=0)
#     parser.add_argument('--show_result',  default=False, action='store_true')
#     parser.add_argument('--model', type=str, default="gpt3")
#     parser.add_argument('--show_prompt',  default=False, action='store_true')
#     args = parser.parse_args()    
#     specify_engine(args)

' From VE code'

In [7]:
# Defining args parameter
args = argparse.Namespace(
    style="standard",
    annotation="std",
    run_prediction=False,
    run_length_test=False,
    num_shot=5,
    train_slice=0,
    num_dev=500,
    dev_slice=0,
    show_result=False,
    model="gpt3.5",
    show_prompt=False,
    engine = "gpt-3.5-turbo-0125"
)

print(args)

Namespace(style='standard', annotation='std', run_prediction=False, run_length_test=False, num_shot=5, train_slice=0, num_dev=500, dev_slice=0, show_result=False, model='gpt3.5', show_prompt=False, engine='gpt-3.5-turbo-0125')


In [18]:
train_set = read_wikiqa_data(f"data/train_subset.json", manual_annotation_style= True)

0 not found


In [19]:
train_set = train_set[args.train_slice:(args.train_slice + args.num_shot)]
dev_set = read_wikiqa_data(f"data/dev_sampled.json")
dev_set = dev_set[args.dev_slice:(args.num_dev)]

7 not found


In [24]:
showcase_examples = [
    "Q: {}\nA: {}\n".format(s["question"], s["answer"]) for s in train_set
]
prompt = "\n".join(showcase_examples)
print('prompt: ')
print(prompt)

prompt: 
Q: Which film was released earlier, Kistimaat or I'M Taraneh, 15?
A: I'M Taraneh, 15

Q: What is the date of death of the composer of film Baalaraajana Kathe?
A: 27 May 1980

Q: Who is the spouse of the director of film Alive (1993 Film)?
A: Kathleen Kennedy

Q: Who lived longer, Edward Frederick Sanderson or Forrest Towns?
A: Edward Frederick Sanderson

Q: Which country the director of film Battle Circus (Film) is from?
A: American



In [15]:
x = dev_set[0]

In [16]:
def in_context_prediction(ex, shots, engine, style="standard", length_test_only=False, n=1):

    if style == "standard":
        showcase_examples = [
            "Q: {}\nA: {}\n".format(s["question"], s["answer"]) for s in shots
        ]
        input_example = "Q: {}\nA:".format(ex["question"])
        prompt = "\n".join(showcase_examples + [input_example])
    else:
        raise RuntimeError("Unsupported prompt style")
    
    temp = 0.7 if n > 1 else 0.0

    try:
        resp = openai.Completion.create(
            engine=engine, prompt=prompt, max_tokens=32, stop='\n',
            temperature=temp, logprobs=5, echo=True, n=n
        )

        pred = resp if n > 1 
    except Exception as e:
        print(f'Encountered Error {e}')

    if n > 1:
        return pred
    else:
        return resp["choices"][0]


In [20]:
in_context_prediction(x, train_set, engine=args.engine, \
                style=args.style, length_test_only=args.run_length_test)

Q: Which film was released earlier, Kistimaat or I'M Taraneh, 15?
A: I'M Taraneh, 15

Q: What is the date of death of the composer of film Baalaraajana Kathe?
A: 27 May 1980

Q: Who is the spouse of the director of film Alive (1993 Film)?
A: Kathleen Kennedy

Q: Who lived longer, Edward Frederick Sanderson or Forrest Towns?
A: Edward Frederick Sanderson

Q: Which country the director of film Battle Circus (Film) is from?
A: American

Q: Where was the performer of song Get A Life – Get Alive born?
A:


In [73]:
pred = in_context_prediction(x, train_set, engine=args.engine,
    style=args.style, length_test_only=args.run_length_test)

Encountered Error 

You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742



UnboundLocalError: cannot access local variable 'pred' where it is not associated with a value

In [9]:
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [33]:
from openai import OpenAI
client = OpenAI()

response = client.responses.create(
    model="gpt-4.1",
    input="Write a one-sentence bedtime story about a unicorn."
)

print(response.output_text)

Under a starry sky, a gentle unicorn danced through a shimmering forest, leaving trails of moonlight that whispered sweet dreams to all who slept.


In [23]:
input = x['question']
input

'Where was the performer of song Get A Life – Get Alive born?'

In [25]:
instructions = prompt
print(instructions)

Q: Which film was released earlier, Kistimaat or I'M Taraneh, 15?
A: I'M Taraneh, 15

Q: What is the date of death of the composer of film Baalaraajana Kathe?
A: 27 May 1980

Q: Who is the spouse of the director of film Alive (1993 Film)?
A: Kathleen Kennedy

Q: Who lived longer, Edward Frederick Sanderson or Forrest Towns?
A: Edward Frederick Sanderson

Q: Which country the director of film Battle Circus (Film) is from?
A: American



In [30]:
response = client.responses.create(
    model=args.engine,
    input= input,
    instructions= prompt
)

In [31]:
response.output_text

'The performer of the song "Get A Life - Get Alive" is Peter John Cox, and he was born in Kingston upon Thames, England.'

In [37]:
response

Response(id='resp_681331e0cedc8191af805b63e4021d0a0b08a7d788721748', created_at=1746088416.0, error=None, incomplete_details=None, instructions='Only return the answer and not complete sentence.', metadata={}, model='gpt-3.5-turbo-0125', object='response', output=[ResponseOutputMessage(id='msg_681331e163e88191a25f3410650f5d510b08a7d788721748', content=[ResponseOutputText(annotations=[], text='Kingston, Jamaica', type='output_text')], role='assistant', status='completed', type='message')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[], top_p=1.0, max_output_tokens=None, previous_response_id=None, reasoning=Reasoning(effort=None, generate_summary=None, summary=None), service_tier='default', status='completed', text=ResponseTextConfig(format=ResponseFormatText(type='text')), truncation='disabled', usage=ResponseUsage(input_tokens=165, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=5, output_tokens_details=OutputTokensDetails(reasoning_tok

In [26]:
input_example = "Q: {}\nA:".format(x["question"])

In [27]:
input = "\n".join([prompt] + [input_example])

In [28]:
print(input)

Q: Which film was released earlier, Kistimaat or I'M Taraneh, 15?
A: I'M Taraneh, 15

Q: What is the date of death of the composer of film Baalaraajana Kathe?
A: 27 May 1980

Q: Who is the spouse of the director of film Alive (1993 Film)?
A: Kathleen Kennedy

Q: Who lived longer, Edward Frederick Sanderson or Forrest Towns?
A: Edward Frederick Sanderson

Q: Which country the director of film Battle Circus (Film) is from?
A: American

Q: Where was the performer of song Get A Life – Get Alive born?
A:


In [35]:
response = client.responses.create(
    model=args.engine,
    input= input,
    instructions= "Only return the answer and not complete sentence."
)

In [36]:
response.output_text

'Kingston, Jamaica'

In [30]:
import openai

In [31]:
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": input}],
    n=5,
    temperature=0.7
)

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [50]:
completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Tell me a joke."
        }
    ],
    model="gpt-3.5-turbo",
    temperature= 0.7,
    n=3
)

pred = {}
responses =[]
# 'completion.choices' will contain a list of 5 responses
for choice in completion.choices:
    responses.append(choice.message.content)

pred["responses"] = responses

In [51]:
pred

{'responses': ["Why couldn't the bicycle stand up by itself?\n\nBecause it was two-tired!",
  'Why did the scarecrow win an award? Because he was outstanding in his field!',
  "Why don't scientists trust atoms?\n\nBecause they make up everything!"]}

In [40]:
completion

ChatCompletion(id='chatcmpl-BSJr11MIVubI5D0pPIdTw5UhCRqvN', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Why don't scientists trust atoms?\n\nBecause they make up everything!", refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None)), Choice(finish_reason='stop', index=1, logprobs=None, message=ChatCompletionMessage(content='Why did the scarecrow win an award? Because he was outstanding in his field!', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None)), Choice(finish_reason='stop', index=2, logprobs=None, message=ChatCompletionMessage(content="Why don't scientists trust atoms?\n\nBecause they make up everything!", refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None)), Choice(finish_reason='stop', index=3, logprobs=None, message=ChatCompletionMessage(content='Why did the scarecrow win an award?\nBecaus

In [49]:
completion.choices[0].message.content

"Why couldn't the bicycle find its way home? Because it lost its bearings!"

In [44]:
completion["id"] = 1

TypeError: 'ChatCompletion' object does not support item assignment