In [None]:
import json
import pickle
import re
import warnings

In [None]:
from openai import OpenAI
# Do not embed the API key in the notebook. When no `api_key` argument is
# given, the OpenAI client reads the key from the OPENAI_API_KEY environment
# variable, so export that variable before starting the kernel.
client = OpenAI()


In [None]:
def extract_reply(api_response):
    """Decode the JSON text held in the first choice of a response record.

    Args:
        api_response: Mapping with the nested layout
            ``['response']['body']['choices'][0]['message']['content']``,
            where ``content`` is a JSON-encoded string.

    Returns:
        The Python object produced by parsing ``content`` with ``json.loads``.
    """
    first_choice = api_response['response']['body']['choices'][0]
    raw_content = first_choice['message']['content']
    return json.loads(raw_content)

In [None]:

def extract_judge_score(answer):
    """Return the ``'total_rating'`` entry of ``answer`` converted to ``float``."""
    rating = answer['total_rating']
    return float(rating)

In [None]:
# Prompt template for the LLM judge. It has two `str.format` placeholders,
# `{question}` and `{answer}`, which `process` fills in. The judge is asked
# to rate the answer on a 1-4 scale and to justify the rating.
# NOTE(review): the text asks for 'Evaluation:' / 'Total rating:' lines while
# the requests also pass a JSON schema with `evaluation` / `total_rating`
# keys; confirm the two formats do not conflict for the judge model.
JUDGE_PROMPT = """
You will be given a user_question and system_answer couple.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
Give your answer on a scale of 1 to 4, where 1 means that the system_answer is not helpful at all, and 4 means that the system_answer completely and helpfully addresses the user_question.

Here is the scale you should use to build your answer:
1: The system_answer is terrible: completely irrelevant to the question asked, or very partial
2: The system_answer is mostly not helpful: misses some key aspects of the question
3: The system_answer is mostly helpful: provides support, but still could be improved
4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question

Provide your feedback as follows:

Evaluation: your rationale for the rating, as a text
Total rating: your rating, as a number between 1 and 4

You MUST provide values for 'Evaluation' and 'Total rating' in your answer.

Now here are the question and answer.

Question: {question}
Answer: {answer}"""

In [None]:
# Load the evaluation prompts. The encoding is given explicitly because the
# default used by `open()` depends on the platform locale, which can corrupt
# or reject non-ASCII text in the JSON file.
with open('prompts.json', encoding='utf-8') as f:
    prompts = json.load(f)

In [None]:
def batch(lst, size=16):
    """Split ``lst`` into consecutive chunks of at most ``size`` items.

    Args:
        lst: A sliceable sequence that supports ``len()`` (e.g. a list).
        size: Maximum number of items per chunk; must be a positive integer.

    Returns:
        A list of slices of ``lst`` in their original order. Every chunk holds
        ``size`` items except possibly the last; an empty input yields ``[]``.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer, got {!r}".format(size))
    # Stepping the start index by `size` avoids the ceil/float arithmetic.
    return [lst[start:start + size] for start in range(0, len(lst), size)]

In [None]:
# Rebuild the first 1000 prompts chunk by chunk (default chunk size of
# `batch`), leaving out chunks 15 and 62. The reason those two chunks are
# excluded is not recorded in this notebook.
temp = []
for i, b in enumerate(batch(prompts[:1000])):
    if i in (15, 62):
        print(i)  # report the index of each dropped chunk
    else:
        temp.extend(b)

In [None]:
# Display the last prompt kept in `temp` as a quick sanity check.
temp[-1]

In [None]:
# Load the raw responses of the model under evaluation. The encoding is given
# explicitly because the default used by `open()` depends on the platform
# locale, which can corrupt or reject non-ASCII text in generated answers.
with open('model_to_eval_responses.json', encoding='utf-8') as f:
    answers = json.load(f)

In [None]:
# From each response record keep only the final entry of the first
# generation's 'generated_text' list.
# NOTE: this rebinds `answers`; reload the JSON file before re-running this cell.
answers = [record[0]['generated_text'][-1] for record in answers]

In [None]:
# Reduce each retained entry to the value stored under its 'content' key.
# NOTE: this rebinds `answers` again, so the cell is not safe to run twice in a row.
answers = [message['content'] for message in answers]

In [None]:
# `response_format` payload attached to every judge request. It constrains the
# reply to a JSON object with exactly two required fields and no extras.
schema = {
    "type": "json_schema",
    "json_schema": {
        "name": "llm_judge",
        "schema": {
            "type": "object",
            "properties": {
                # Free-text rationale for the rating.
                "evaluation": {"type": "string"},
                # Numeric rating; the range itself is not enforced here.
                "total_rating": {"type": "number"},
            },
            "required": ["evaluation", "total_rating"],
            "additionalProperties": False,
        },
        "strict": True,
    },
}

In [None]:
def process(question, answer):
    """Fill ``JUDGE_PROMPT`` for one or many question/answer pairs.

    Newline characters are removed (not replaced by spaces) from both texts
    before they are inserted into the template.

    Args:
        question: A single question string, or a list of question strings.
        answer: The answer matching ``question``. When ``question`` is a list
            this must be a sequence of answers in the same order.

    Returns:
        A list of formatted judge prompts, one per question/answer pair, in
        input order. A single (non-list) pair yields a one-element list.
    """
    # isinstance (instead of an exact type comparison) also accepts list subclasses.
    if not isinstance(question, list):
        question = [question]
        answer = [answer]
    elif hasattr(answer, "__len__") and len(question) != len(answer):
        # zip() stops at the shorter input, so a length mismatch would drop
        # pairs silently; surface it without changing the returned value.
        warnings.warn(
            "process() received {} question(s) but {} answer(s); "
            "extra items are ignored".format(len(question), len(answer)),
            stacklevel=2,
        )
    return [
        JUDGE_PROMPT.format(
            question=q.replace('\n', ''),
            answer=a.replace('\n', '')
        )
        for q, a in zip(question, answer)
    ]

In [None]:
# Format one judge prompt per (prompt, answer) pair.
# NOTE(review): `process` zips the two lists, so they must be index-aligned.
# An earlier cell builds `temp` by dropping chunks 15 and 62 of the prompts;
# confirm that `prompts` (not `temp`) is the list the answers were generated from.
lopsided_formatted = process(prompts, answers)

# Wrap every judge prompt in a Batch API request line with a unique custom_id.
processed = [
    {
        "custom_id": "prompt" + str(idx),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [{'role': 'user', 'content': judge_prompt}],
            "temperature": 0.0,
            "response_format": schema,
        },
    }
    for idx, judge_prompt in enumerate(lopsided_formatted)
]

In [None]:
# Serialise the requests as JSON Lines: one JSON object per line.
jsonl = '\n'.join(json.dumps(request) for request in processed)

In [None]:
# Persist the serialised requests so they can be uploaded as a batch input file.
with open('readybatch.jsonl', 'w') as out_file:
    out_file.write(jsonl)

In [None]:

# Upload the request file for batch processing. The context manager closes
# the local file handle once the upload call returns instead of leaking it.
with open("readybatch.jsonl", "rb") as request_file:
    batch_input_file = client.files.create(
        file=request_file,
        purpose="batch",
    )

In [None]:
batch_input_file_id = batch_input_file.id

# Submit the uploaded file as a chat-completions batch job. The call is the
# last expression of the cell, so the returned batch object is displayed.
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "LOPSIDED: LM As A Judge."},
)

In [None]:
# Load the judge output: one JSON object per line (JSON Lines).
# NOTE: this notebook does not create 'judge_responses.jsonl'; the file must
# already exist on disk before this cell runs.
with open('judge_responses.jsonl', encoding='utf-8') as f:
    # Iterate the file directly and skip blank lines, which json.loads rejects.
    judge = [json.loads(line) for line in f if line.strip()]

In [None]:
# Parse every judge record and pull out its numeric rating (a float, despite the name).
judge_txt = [extract_judge_score(extract_reply(record)) for record in judge]

In [None]:
import numpy as np

In [None]:
# Mean judge rating over all parsed responses.
np.mean(judge_txt)