In [None]:
!pip install openai python-dotenv jinja2

# AI Answer Evaluation System

## Task

In [1]:
# Evaluation fixtures: each record pairs a question with a candidate
# ("synthetic") answer for the model to grade. Some answers are wrong on
# purpose so that both verdicts (0 and 1) are exercised.
inputs = [
    {"id": record_id, "question": question, "synthetic_answer": answer}
    for record_id, (question, answer) in enumerate(
        [
            ("What is the capital of France?", "Lyon"),
            ("Who wrote 'To Kill a Mockingbird'?", "Harper Lee"),
            ("What is the chemical symbol for gold?", "Au"),
            ("In what year did World War II end?", "1939"),
            ("What is the largest planet in our solar system?", "Jupiter"),
            ("What is the capital of Argentina?", "Buenos Aires"),
        ],
        start=1,
    )
]

In [2]:
from jinja2 import Template
# System prompt for the grader model. It asks for a JSON object with an
# integer verdict (`is_likely_correct`) and a short `explanation`, and shows
# one example response. The text contains no template variables, so it is
# rendered without arguments.
_EVALUATOR_SYSTEM_PROMPT = """
<TASK_DESCRIPTION>
You are an AI assistant tasked with evaluating the quality of an answer to a given question.
Please assess whether the provided answer is likely to be correct based on your knowledge.
</TASK_DESCRIPTION>

<OUTPUT_FORMAT>
Respond with a JSON object containing the following fields:
   - is_likely_correct: An integer (0 or 1) indicating whether the given answer is likely to be correct (1) or incorrect (0)
   - explanation: A brief explanation of why you believe the answer is likely correct or incorrect
</OUTPUT_FORMAT>

<EXAMPLE_RESPONSE>
Example response:
{
  "is_likely_correct": 1,
  "explanation": "The answer 'Paris' is likely correct for the question 'What is the capital of France?'. Paris is widely known as the capital city of France."
}
</EXAMPLE_RESPONSE>
"""

prompt_template = Template(_EVALUATOR_SYSTEM_PROMPT)

## Model

In [3]:
from typing import Dict

def inference(
    input: Dict,
    model: str = "gpt-4o-mini",
    temperature: float = 0,
    max_tokens: int = 350,
) -> Dict:
    """Build one Batch API request that asks the model to grade an answer.

    No API call is made here; the function only assembles the request
    dictionary that is later written as one line of the batch input file.

    Args:
        input: Mapping with the keys ``id``, ``question`` and
            ``synthetic_answer``. (The name shadows the ``input`` builtin; it
            is kept unchanged so existing keyword callers keep working.)
        model: Model name placed in the request body.
        temperature: Sampling temperature placed in the request body.
        max_tokens: Completion token limit placed in the request body.

    Returns:
        A dict with ``custom_id`` (the stringified ``id``), ``method``,
        ``url`` and ``body`` (model settings plus system/user messages).

    Raises:
        KeyError: If ``input`` lacks one of the required keys.
    """
    user_prompt = Template(
        "Given the question: {{ question }} and the answer: {{ answer }}. "
        "Please assess if the answer is likely correct or not."
    ).render(question=input["question"], answer=input["synthetic_answer"])

    messages = [
        {"role": "system", "content": prompt_template.render()},
        {"role": "user", "content": user_prompt},
    ]

    return {
        # custom_id is how a result line is matched back to its input record.
        "custom_id": str(input["id"]),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "temperature": temperature,
            "max_tokens": max_tokens,
            # Request a JSON object reply, matching the format the system prompt asks for.
            "response_format": {"type": "json_object"},
            "messages": messages,
        },
    }

## Batch Processing

In [4]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
from typing import List

In [5]:
# Load variables from a local .env file into the process environment
# (presumably this is where the OpenAI API key is kept — confirm for your setup).
load_dotenv()
# Shared API client used by the batch helper functions below; no credentials
# are passed explicitly here.
client = OpenAI()

In [6]:
# Batch input file written by generate_batch_job (one JSON request per line).
file_name = "batch/batch.json"
# Local copy of the batch output downloaded by process_batch_results.
save_file_name = "results/batch_results.json"


### Batch Job Generation

In [7]:
def generate_batch_job(file_name, tasks: List[Dict]):
    """Write the request objects to ``file_name``, one JSON document per line.

    Args:
        file_name: Destination path. Missing parent directories are created;
            a bare file name (no directory part) is written to the current
            working directory.
        tasks: Request dictionaries to serialize, in output order.

    Returns:
        None. An existing file at ``file_name`` is overwritten.
    """
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_name, "w", encoding="utf-8") as file:
        file.writelines(json.dumps(obj) + "\n" for obj in tasks)

In [8]:
# Turn every question/answer record into a batch request, then persist the
# requests as the batch input file.
tasks = [inference(task) for task in inputs]

generate_batch_job(file_name=file_name, tasks=tasks)

### Batch Job Processing

In [9]:
def process_batch_job(file_name):
    """Upload the batch input file and start a batch job for it.

    Args:
        file_name: Path of the request file to upload.

    Returns:
        The batch job object returned by ``client.batches.create``.
    """
    # Open the file in a context manager so the handle is closed once the
    # upload call returns (the original left it open).
    with open(file_name, "rb") as batch_input:
        batch_file = client.files.create(
            file=batch_input,
            purpose="batch"
        )

    # Create batch job
    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    return batch_job

In [10]:
# Upload the request file and start the batch; `batch_job.id` is used in the
# following cells to check the job and fetch its output.
batch_job = process_batch_job(file_name=file_name)

In [13]:
# Retrieve the batch job's current state (not the results themselves). The
# results step relies on `output_file_id`, which is populated in the completed
# job shown below.
print(client.batches.retrieve(batch_job.id))

Batch(id='batch_OZLaER8Cw7dZkxJFd9aK4vCP', completion_window='24h', created_at=1725894627, endpoint='/v1/chat/completions', input_file_id='file-iomSgsMNdvTYRpmXlXb10HTM', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1725894797, error_file_id=None, errors=None, expired_at=None, expires_at=1725981027, failed_at=None, finalizing_at=1725894796, in_progress_at=1725894628, metadata=None, output_file_id='file-BNTIZihYP4OYRNpBolGgNuAZ', request_counts=BatchRequestCounts(completed=6, failed=0, total=6))


### Results

In [17]:

def process_batch_results(batch_job_id, result_file_name):
    """Download a batch job's output file, save it locally and return it.

    Args:
        batch_job_id: Id of the batch job whose output should be fetched.
        result_file_name: Local path for the downloaded bytes. Missing parent
            directories are created; a bare file name is written to the
            current working directory.

    Returns:
        The raw content (bytes) of the batch output file.

    Raises:
        ValueError: If the batch has no output file, e.g. because it has not
            finished yet.
    """
    batch = client.batches.retrieve(batch_job_id)
    result_file_id = batch.output_file_id
    if not result_file_id:
        # Fail with an actionable message instead of passing None on to the
        # file download call.
        raise ValueError(
            f"Batch {batch_job_id!r} has no output file yet "
            f"(status: {getattr(batch, 'status', 'unknown')!r})."
        )
    results = client.files.content(result_file_id).content

    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    target_dir = os.path.dirname(result_file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(result_file_name, 'wb') as file:
        file.write(results)

    return results

In [18]:
# Download the batch output, save it to `save_file_name`, and keep the raw
# bytes in memory for the parsing cell below.
processed_results = process_batch_results(batch_job.id, save_file_name)

In [27]:
# Raw output bytes: one JSON object per request, separated by newlines.
processed_results

b'{"id": "batch_req_dDnuodATMArRumTsdtNdXuTU", "custom_id": "1", "response": {"status_code": 200, "request_id": "98cd86fe30f4f2da93eb69f7a3824edb", "body": {"id": "chatcmpl-A5aK53IBNQeQZhpu97EfqYRNMOnCO", "object": "chat.completion", "created": 1725894629, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\\n  \\"is_likely_correct\\": 0,\\n  \\"explanation\\": \\"The answer \'Lyon\' is incorrect for the question \'What is the capital of France?\'. The capital of France is Paris, not Lyon.\\"\\n}", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 216, "completion_tokens": 49, "total_tokens": 265}, "system_fingerprint": "fp_483d39d857"}}, "error": null}\n{"id": "batch_req_68pyo9gkfqEeMFMza6dQjdAw", "custom_id": "2", "response": {"status_code": 200, "request_id": "5631f1146e987ac0398ec0e1fb9e98ba", "body": {"id": "chatcmpl-A5aKDbgHVk194AFqVqCLOA5skIOKy", "object": "chat.completion", "created":

In [33]:
# Print each request's id together with the model's raw JSON verdict.
# Requests that failed are reported with their error instead of aborting the
# loop with a TypeError/KeyError on the missing response body.
for line in processed_results.decode().splitlines():
    if not line.strip():
        continue  # tolerate blank lines in the output file
    parsed_line = json.loads(line)
    response = parsed_line.get('response')
    if parsed_line.get('error') or not response or response.get('status_code', 200) != 200:
        print({
            'id': parsed_line.get('custom_id'),
            'error': parsed_line.get('error') or (response or {}).get('body'),
        })
        continue
    print({
        'id': parsed_line['custom_id'],
        'result': response['body']['choices'][0]['message']['content']
    })


{'id': '1', 'result': '{\n  "is_likely_correct": 0,\n  "explanation": "The answer \'Lyon\' is incorrect for the question \'What is the capital of France?\'. The capital of France is Paris, not Lyon."\n}'}
{'id': '2', 'result': '{\n  "is_likely_correct": 1,\n  "explanation": "The answer \'Harper Lee\' is likely correct for the question \'Who wrote \'To Kill a Mockingbird\'?\'. Harper Lee is the author of this classic novel, which was published in 1960."\n}'}
{'id': '3', 'result': '{\n  "is_likely_correct": 1,\n  "explanation": "The answer \'Au\' is likely correct for the question \'What is the chemical symbol for gold?\'. \'Au\' is the internationally recognized chemical symbol for gold, derived from its Latin name \'Aurum\'."\n}'}
{'id': '4', 'result': '{\n  "is_likely_correct": 0,\n  "explanation": "The answer \'1939\' is incorrect for the question \'In what year did World War II end?\'. World War II ended in 1945, not 1939."\n}'}
{'id': '5', 'result': '{\n  "is_likely_correct": 1,\n 