# Head2Head Evaluation

In [None]:
import jsonlines
import pandas as pd
import numpy as np
import os

In [None]:
# Locations of the JSON Lines files holding each model's test-set responses.
_test_set_dir = r"D:\Umich\2024Winter\EECS 598\Project\data\test set"
filepath_m0 = _test_set_dir + r"\responses_M0.jsonl"
filepath_m1 = _test_set_dir + r"\responses_M1.jsonl"
filepath_m2 = _test_set_dir + r"\responses_M2.jsonl"

In [None]:
def _read_jsonl(path):
    """Return every object stored in the JSON Lines file at *path* as a list."""
    with jsonlines.open(path) as reader:
        return list(reader)


# Load the recorded responses of each model, one list of records per file.
data_m0 = _read_jsonl(filepath_m0)
data_m1 = _read_jsonl(filepath_m1)
data_m2 = _read_jsonl(filepath_m2)

In [None]:
# Order the records of all three models by prompt id. The comparison loops
# index the three lists in parallel, so this presumably relies on every file
# containing the same set of prompt ids -- TODO confirm.
def _prompt_id_of(record):
    """Sort key: the record's 'prompt_id' field."""
    return record['prompt_id']


data_m0 = sorted(data_m0, key=_prompt_id_of)
data_m1 = sorted(data_m1, key=_prompt_id_of)
data_m2 = sorted(data_m2, key=_prompt_id_of)

In [None]:
# Single-response grading template: an additive 0-5 point rubric with
# {prompt} and {response} placeholders to be filled via str.format. The judge
# is asked to reply in the form "score: <total points>" followed by a short
# justification.
# NOTE(review): this template is not referenced by any of the visible cells.
srlm_prompt = """Review the user’s question and the corresponding response using the additive 5-point
    scoring system described below. 

    The user's question is between <question> and </question>
    The response of the AI Assistant is between <response> and </response>

    Points are accumulated based on the satisfaction of each
    criterion:
    - Add 1 point if the response is relevant and provides some information related to
    the user’s inquiry, even if it is incomplete or contains some irrelevant content.
    - Add another point if the response addresses a substantial portion of the user’s question,
    but does not completely resolve the query or provide a direct answer.
    - Award a third point if the response answers the basic elements of the user’s question in a
    useful way, regardless of whether it seems to have been written by an AI Assistant or if it
    has elements typically found in blogs or search results.
    - Grant a fourth point if the response is clearly written from an AI Assistant’s perspective,
    addressing the user’s question directly and comprehensively, and is well-organized and
    helpful, even if there is slight room for improvement in clarity, conciseness or focus.
    - Bestow a fifth point for a response that is impeccably tailored to the user’s question
    by an AI Assistant, without extraneous information, reflecting expert knowledge, and
    demonstrating a high-quality, engaging, and insightful answer.
    - If the response repeats itself or is not concise and to the point, score the response 0.

    <question>{prompt}</question>
    <response>{response}</response>

    After examining the user’s instruction and the response:
    - output the score of the evaluation using this exact format: "score: <total points>", where <total points> is between 0 and 5
    - Briefly justify your total score, up to 100 words.
    """

In [None]:
# Pairwise judging template with {prompt}, {response1} and {response2}
# placeholders to be filled via str.format. The judge is asked to output 1 when
# Response1 is better and 0 when Response2 is better, in the (lowercase) format
# "judgement: <1,0>", followed by a short explanation.
slrm_prompt = '''
Task Description:
You are a virtual judge tasked with evaluating responses to some questions. Your role is to assess the quality and relevance of each answer and decide which is the better one.

Instructions:

Review the Question: Begin by carefully examining the question provided. Understand the context, elements, and any detail that may influence the answers to the questions.
Evaluate the Answers:
For each question, read the two provided answers.
Consider the following criteria in your evaluation:
Relevance: How well does the answer respond to the question?
Accuracy: Is the information in the answer correct and well-supported?
Completeness: Does the answer cover the necessary aspects of the question without being overly simplistic or unnecessarily complex?
Clarity: How clear and understandable is the explanation or argument in the answer?

Prompt: <question>{prompt}</question>

Response1: <response>{response1}</response>

Response2: <response>{response2}</response>

Output Format:
Judgement output: If the response1 is better than response2, output 1; if response2 is better than response1, output 0. Report in the exact format: "judgement: <1,0>". 
Reasoning: Briefly explain why you chose the better response, up to 50 words.

GO GPT! You can do it!
'''

## M0 V.S. M1

In [None]:
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# written into the notebook; the literal placeholder remains as a fallback and
# can still be replaced by hand.
API_KEY = os.environ.get('OPENAI_API_KEY', 'API_KEY')

In [None]:
from openai import OpenAI
from tqdm.notebook import tqdm
import time

# Forward round of the M0-vs-M1 comparison: M0's completion is shown to the
# judge as Response1 and M1's completion as Response2.
client = OpenAI(api_key=API_KEY)

compare_m0_m1 = []

for i in tqdm(range(100)):  # first 100 records of the sorted lists
    record_m0, record_m1 = data_m0[i], data_m1[i]
    for j in range(4):  # completions are keyed by the strings '0'..'3'
        answer_m0 = record_m0['completion'][str(j)]
        answer_m1 = record_m1['completion'][str(j)]
        time.sleep(0.5)  # pause between consecutive API requests
        user_input = slrm_prompt.format(prompt=record_m1['prompt'],
                                        response1=answer_m0,
                                        response2=answer_m1)
        stream = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": user_input}],
            stream=True,
        )
        # Concatenate the streamed text deltas into the judge's full reply.
        output_str = ''.join(
            chunk.choices[0].delta.content
            for chunk in stream
            if chunk.choices[0].delta.content is not None
        )
        compare_m0_m1.append({
            'prompt_id': record_m1['prompt_id'],
            'prompt': record_m1['prompt'],
            'response_m0': answer_m0,
            'response_m1': answer_m1,
            'output': output_str,
        })


In [None]:
# Reverse round of the M0-vs-M1 comparison: the presentation order is swapped,
# so M1's completion is Response1 and M0's completion is Response2.
compare_m1_m0 = []

for i in tqdm(range(100)):  # first 100 records of the sorted lists
    record_m0, record_m1 = data_m0[i], data_m1[i]
    for j in range(4):  # completions are keyed by the strings '0'..'3'
        answer_m0 = record_m0['completion'][str(j)]
        answer_m1 = record_m1['completion'][str(j)]
        time.sleep(0.5)  # pause between consecutive API requests
        user_input = slrm_prompt.format(prompt=record_m1['prompt'],
                                        response1=answer_m1,
                                        response2=answer_m0)
        stream = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": user_input}],
            stream=True,
        )
        # Concatenate the streamed text deltas into the judge's full reply.
        output_str = ''.join(
            chunk.choices[0].delta.content
            for chunk in stream
            if chunk.choices[0].delta.content is not None
        )
        compare_m1_m0.append({
            'prompt_id': record_m1['prompt_id'],
            'prompt': record_m1['prompt'],
            'response_m0': answer_m0,
            'response_m1': answer_m1,
            'output': output_str,
        })

In [None]:
# Save the reverse-order (M1 shown as Response1) judge outputs as JSON Lines.
_compare_m1_m0_path = r"D:\Umich\2024Winter\EECS 598\Project\data\test set\compare_m1_m0.jsonl"
with jsonlines.open(_compare_m1_m0_path, mode='w') as writer:
    writer.write_all(compare_m1_m0)

In [None]:
# Load the stored (M0, M1) comparison records from disk.
# NOTE(review): no visible cell writes this file; presumably it was saved by
# an earlier run of the forward round -- confirm.
_compare_m0_m1_path = r"D:\Umich\2024Winter\EECS 598\Project\data\test set\compare_m0_m1.jsonl"
with jsonlines.open(_compare_m0_m1_path) as reader:
    compare_m0_m1 = list(reader)

In [None]:
# Keep only the first 400 records; the comparison loops produce
# 100 prompts x 4 completions = 400 entries per round, and the combine step
# indexes exactly 400 pairs.
compare_m0_m1 = compare_m0_m1[:400]

In [None]:
# Inspect the first forward-order (M0, M1) comparison record.
compare_m0_m1[0]

In [None]:
# Inspect the first reverse-order (M1, M0) comparison record.
compare_m1_m0[0]

In [None]:
# Combine comparison results
import re

# The judging prompt requests the lowercase form "judgement: <1,0>", so the
# keyword must be matched case-insensitively (a case-sensitive "Judgement"
# misses compliant replies). The "judgment" spelling and the prompt's own
# "Judgement output:" label are accepted as well.
_JUDGEMENT_RE = re.compile(r'judge?ment(?:\s+output)?\s*:\s*(\d+)', re.IGNORECASE)


def _parse_judgement(text):
    """Return the judge's verdict (0 or 1) found in *text*, or None.

    None is returned when no verdict is present or when the number is not a
    valid verdict (anything other than 0 or 1).
    """
    match = _JUDGEMENT_RE.search(text)
    if match is None:
        return None
    verdict = int(match.group(1))
    return verdict if verdict in (0, 1) else None


def _combine_verdicts(forward, backward):
    """Merge the verdicts of the two presentation orders into one score.

    *forward* is the verdict with M0 as Response1 (1 means M0 preferred);
    *backward* is the verdict with M1 as Response1 (0 means M0 preferred).
    Returns 1 when M0 wins, -1 when M1 wins, 0 when the two rounds disagree,
    and None when neither round produced a usable verdict.
    """
    if forward is None and backward is None:
        return None
    if backward is None:  # only the forward round is usable
        return 1 if forward == 1 else -1
    if forward is None:  # only the reverse round is usable
        return 1 if backward == 0 else -1
    return forward - backward


m0_m1_dict = {
    'prompt_id': [],
    'prompt': [],
    'response_m0': [],
    'response_m1': [],
    'score': [],
}
for i in tqdm(range(400)):  # 100 prompts x 4 completions
    forward_record, backward_record = compare_m0_m1[i], compare_m1_m0[i]
    m0_m1_dict['prompt_id'].append(forward_record['prompt_id'])
    m0_m1_dict['prompt'].append(forward_record['prompt'])
    m0_m1_dict['response_m0'].append(forward_record['response_m0'])
    m0_m1_dict['response_m1'].append(forward_record['response_m1'])
    m0_m1_dict['score'].append(
        _combine_verdicts(_parse_judgement(forward_record['output']),
                          _parse_judgement(backward_record['output']))
    )

m0_m1_df = pd.DataFrame(m0_m1_dict)

In [None]:
# Count how often each head-to-head score occurs (value_counts excludes
# missing scores by default).
m0_m1_df['score'].value_counts()

In [None]:
# Persist the combined M0-vs-M1 scores as CSV, omitting the DataFrame index.
_m0_m1_csv_path = r"D:\Umich\2024Winter\EECS 598\Project\data\test set\m0_m1_df.csv"
m0_m1_df.to_csv(_m0_m1_csv_path, index=False)

## M1 V.S. M2

In [None]:
# First round (M1, M2): M1's completion is shown to the judge as Response1 and
# M2's completion as Response2. The judge outputs are saved as JSON Lines.
client = OpenAI(api_key=API_KEY)

compare_m1_m2 = []

for i in tqdm(range(100)):  # first 100 records of the sorted lists
    record_m1, record_m2 = data_m1[i], data_m2[i]
    for j in range(4):  # completions are keyed by the strings '0'..'3'
        answer_m1 = record_m1['completion'][str(j)]
        answer_m2 = record_m2['completion'][str(j)]
        time.sleep(0.5)  # pause between consecutive API requests
        user_input = slrm_prompt.format(prompt=record_m1['prompt'],
                                        response1=answer_m1,
                                        response2=answer_m2)
        stream = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": user_input}],
            stream=True,
        )
        # Concatenate the streamed text deltas into the judge's full reply.
        output_str = ''.join(
            chunk.choices[0].delta.content
            for chunk in stream
            if chunk.choices[0].delta.content is not None
        )
        compare_m1_m2.append({
            'prompt_id': record_m1['prompt_id'],
            'prompt': record_m1['prompt'],
            'response_m1': answer_m1,
            'response_m2': answer_m2,
            'output': output_str,
        })

with jsonlines.open(r"D:\Umich\2024Winter\EECS 598\Project\data\test set\compare_m1_m2.jsonl", mode='w') as writer:
    writer.write_all(compare_m1_m2)

In [None]:
# Second round (M2, M1): the presentation order is swapped, so M2's completion
# is Response1 and M1's completion is Response2. The judge outputs are saved
# as JSON Lines.
client = OpenAI(api_key=API_KEY)

compare_m2_m1 = []

for i in tqdm(range(100)):  # first 100 records of the sorted lists
    record_m1, record_m2 = data_m1[i], data_m2[i]
    for j in range(4):  # completions are keyed by the strings '0'..'3'
        answer_m1 = record_m1['completion'][str(j)]
        answer_m2 = record_m2['completion'][str(j)]
        time.sleep(0.5)  # pause between consecutive API requests
        user_input = slrm_prompt.format(prompt=record_m1['prompt'],
                                        response1=answer_m2,
                                        response2=answer_m1)
        stream = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": user_input}],
            stream=True,
        )
        # Concatenate the streamed text deltas into the judge's full reply.
        output_str = ''.join(
            chunk.choices[0].delta.content
            for chunk in stream
            if chunk.choices[0].delta.content is not None
        )
        compare_m2_m1.append({
            'prompt_id': record_m1['prompt_id'],
            'prompt': record_m1['prompt'],
            'response_m1': answer_m1,
            'response_m2': answer_m2,
            'output': output_str,
        })

with jsonlines.open(r"D:\Umich\2024Winter\EECS 598\Project\data\test set\compare_m2_m1.jsonl", mode='w') as writer:
    writer.write_all(compare_m2_m1)

In [None]:
# Combine comparison results
import re

# The judging prompt requests the lowercase form "judgement: <1,0>", so the
# keyword must be matched case-insensitively (a case-sensitive "Judgement"
# misses compliant replies). The "judgment" spelling and the prompt's own
# "Judgement output:" label are accepted as well.
_JUDGEMENT_RE = re.compile(r'judge?ment(?:\s+output)?\s*:\s*(\d+)', re.IGNORECASE)


def _parse_judgement(text):
    """Return the judge's verdict (0 or 1) found in *text*, or None.

    None is returned when no verdict is present or when the number is not a
    valid verdict (anything other than 0 or 1).
    """
    match = _JUDGEMENT_RE.search(text)
    if match is None:
        return None
    verdict = int(match.group(1))
    return verdict if verdict in (0, 1) else None


def _combine_verdicts(forward, backward):
    """Merge the verdicts of the two presentation orders into one score.

    *forward* is the verdict with M1 as Response1 (1 means M1 preferred);
    *backward* is the verdict with M2 as Response1 (0 means M1 preferred).
    Returns 1 when M1 wins, -1 when M2 wins, 0 when the two rounds disagree,
    and None when neither round produced a usable verdict.
    """
    if forward is None and backward is None:
        return None
    if backward is None:  # only the forward round is usable
        return 1 if forward == 1 else -1
    if forward is None:  # only the reverse round is usable
        return 1 if backward == 0 else -1
    return forward - backward


m1_m2_dict = {
    'prompt_id': [],
    'prompt': [],
    'response_m1': [],
    'response_m2': [],
    'score': [],
}
for i in tqdm(range(400)):  # 100 prompts x 4 completions
    forward_record, backward_record = compare_m1_m2[i], compare_m2_m1[i]
    m1_m2_dict['prompt_id'].append(forward_record['prompt_id'])
    m1_m2_dict['prompt'].append(forward_record['prompt'])
    m1_m2_dict['response_m1'].append(forward_record['response_m1'])
    m1_m2_dict['response_m2'].append(forward_record['response_m2'])
    m1_m2_dict['score'].append(
        _combine_verdicts(_parse_judgement(forward_record['output']),
                          _parse_judgement(backward_record['output']))
    )

m1_m2_df = pd.DataFrame(m1_m2_dict)

In [None]:
# Count how often each head-to-head score occurs (value_counts excludes
# missing scores by default).
m1_m2_df['score'].value_counts()

In [None]:
# Persist the combined M1-vs-M2 scores as CSV, omitting the DataFrame index.
_m1_m2_csv_path = r"D:\Umich\2024Winter\EECS 598\Project\data\test set\m1_m2_df.csv"
m1_m2_df.to_csv(_m1_m2_csv_path, index=False)

## M0 V.S. M2

In [None]:
# First round (M0, M2): M0's completion is shown to the judge as Response1 and
# M2's completion as Response2. The judge outputs are saved as JSON Lines.
client = OpenAI(api_key=API_KEY)

compare_m0_m2 = []

for i in tqdm(range(100)):  # first 100 records of the sorted lists
    record_m0, record_m2 = data_m0[i], data_m2[i]
    for j in range(4):  # completions are keyed by the strings '0'..'3'
        answer_m0 = record_m0['completion'][str(j)]
        answer_m2 = record_m2['completion'][str(j)]
        time.sleep(0.5)  # pause between consecutive API requests
        user_input = slrm_prompt.format(prompt=record_m0['prompt'],
                                        response1=answer_m0,
                                        response2=answer_m2)
        stream = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": user_input}],
            stream=True,
        )
        # Concatenate the streamed text deltas into the judge's full reply.
        output_str = ''.join(
            chunk.choices[0].delta.content
            for chunk in stream
            if chunk.choices[0].delta.content is not None
        )
        compare_m0_m2.append({
            'prompt_id': record_m0['prompt_id'],
            'prompt': record_m0['prompt'],
            'response_m0': answer_m0,
            'response_m2': answer_m2,
            'output': output_str,
        })

with jsonlines.open(r"D:\Umich\2024Winter\EECS 598\Project\data\test set\compare_m0_m2.jsonl", mode='w') as writer:
    writer.write_all(compare_m0_m2)

In [None]:
# Second round (M2, M0): the presentation order is swapped, so M2's completion
# is Response1 and M0's completion is Response2. The judge outputs are saved
# as JSON Lines.
client = OpenAI(api_key=API_KEY)

compare_m2_m0 = []

for i in tqdm(range(100)):  # first 100 records of the sorted lists
    record_m0, record_m2 = data_m0[i], data_m2[i]
    for j in range(4):  # completions are keyed by the strings '0'..'3'
        answer_m0 = record_m0['completion'][str(j)]
        answer_m2 = record_m2['completion'][str(j)]
        time.sleep(0.5)  # pause between consecutive API requests
        user_input = slrm_prompt.format(prompt=record_m0['prompt'],
                                        response1=answer_m2,
                                        response2=answer_m0)
        stream = client.chat.completions.create(
            # Same judge model as the forward (M0, M2) round and every other
            # comparison: judging the swapped order with a different model
            # ("gpt-4-turbo") would mix judge differences into the
            # order-swap comparison.
            model="gpt-4",
            messages=[{"role": "user", "content": user_input}],
            stream=True,
        )
        # Concatenate the streamed text deltas into the judge's full reply.
        output_str = ''.join(
            chunk.choices[0].delta.content
            for chunk in stream
            if chunk.choices[0].delta.content is not None
        )
        compare_m2_m0.append({
            'prompt_id': record_m0['prompt_id'],
            'prompt': record_m0['prompt'],
            'response_m0': answer_m0,
            'response_m2': answer_m2,
            'output': output_str,
        })

with jsonlines.open(r"D:\Umich\2024Winter\EECS 598\Project\data\test set\compare_m2_m0.jsonl", mode='w') as writer:
    writer.write_all(compare_m2_m0)

In [None]:
# Display the forward-order (M0, M2) comparison records.
compare_m0_m2