In [12]:
import os
import re
import json
import pandas as pd
from tqdm.auto import tqdm
from getpass import getpass 
from langchain.schema import HumanMessage, AIMessage, SystemMessage
from langchain.chat_models import ChatOpenAI

SETUP OPENAI KEY and EVAL MODEL

In [13]:
OPENAI_API_KEY = getpass()

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [19]:
model = ChatOpenAI(model="gpt-4-turbo-preview")
model.temperature = 0

BUILD SCORING PROMPT

In [20]:
# messages = [SystemMessage(content="""Review the user-agent conversation using the additive 5-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:
# - Add 1 point if the response is relevant and provides some information related to
# the user’s inquiry, even if it is incomplete or contains some irrelevant content.
# - Add another point if the response addresses a substantial portion of the user’s question,
# but does not completely resolve the query or provide a direct answer.
# - Award a third point if the response answers the basic elements of the user’s question in a
# useful way, regardless of whether it seems to have been written by an AI Assistant or if it
# has elements typically found in blogs or search results.
# - Grant a fourth point if the response is clearly written from an AI Assistant’s perspective,
# addressing the user’s question directly and comprehensively, and is well-organized and
# helpful, even if there is slight room for improvement in clarity, conciseness or focus.
# - Bestow a fifth point for a response that is impeccably tailored to the user’s question
# by an AI Assistant, without extraneous information, reflecting expert knowledge, and
# demonstrating a high-quality, engaging, and insightful answer.

# History: <history><HISTORY_HERE></history>
# Agent: <response><RESPONSE_TO_SCORE></response>

# After examining the user-agent conversation history and the subsequent agent's response:
# - Briefly justify your total score, up to 100 words.
# - Conclude with the score using the format: “Score: <total points>”

# Remember to assess from the AI Assistant perspective, utilizing web search knowledge as
# necessary. To evaluate the response in alignment with this additive scoring model, we’ll
# systematically attribute points based on the outlined criteria.""")]

In [None]:
messages = [SystemMessage(content="""Review the user-agent conversation using the additive 3-point scoring system described below. Points are accumulated based on the satisfaction of each criterion:
- Add 1 point if the response strictly adheres to the instruction provided by the user in the **beginning**.
- Award another point for a response that reflects on its past incorrect response/s, if any indicated by the user.
- Bestow a third point for a response that is impeccably tailored to the user’s question by an AI Assistant, without extraneous information, reflecting expert knowledge, and demonstrating a high-quality, engaging, and insightful answer.

History: <history><HISTORY_HERE></history>
Agent: <response><RESPONSE_TO_SCORE></response>

After examining the user-agent conversation history and the subsequent agent's response:
- Briefly justify your total score, up to 100 words.
- Conclude with the score using the format: “Score: <total points>”

Remember to assess from the AI Assistant perspective, utilizing web search knowledge as necessary. To evaluate the response in alignment with this additive scoring model, we’ll systematically attribute points based on the outlined criteria.""")]

In [21]:
def get_score(idx, trajectory, messages):
    history = [f"User: {ds['content'].strip()}\n" if ds['role'] == 'user' else f"Agent: {ds['content'].strip()}\n" for ds in trajectory[:idx]]
    history = '\n'.join(history).strip()
    history = f"<history>{history}</history>"
    response = f"<response>{trajectory[idx]['content']}</response>"

    messages += [HumanMessage(content=\
    """
    History: {}

    Agent: {}""".format(history, response))]
    score = model.predict_messages(messages).content
    match = re.search(r"Score:\s*(\d+)", score)
    return int(match.group(1)) if match else None

In [22]:
data = [] 
with open("data/dbbench/Llama-2-7B.jsonl", "r") as file:
    for line in file:
        json_object = json.loads(line)
        data.append(json_object)
        
all_trj_scores = []
df = pd.DataFrame()
for ds in tqdm(data):
    orig_score = 1 if ds['output']['status'] == 'completed' else 0 # for dbbench
    ds = ds`['output']['history']` is accessing the 'history' key within the 'output' dictionary of each data sample in the dataset. This key likely contains a list of dictionaries representing the history or sequence of events related to the data sample. The code snippet is iterating over this history and processing the trajectory scores for each event in the history.
    ['output']['history']
    trj_scores = []
    for idx, dss in enumerate(ds):
        if dss['role'] == 'user': 
            trj_scores.append('')
            continue
        trj_scores.append(str(get_score(idx, ds, messages)/5))
    print(sum([float(k) for k in trj_scores if k!=''])/len([k for k in trj_scores if k!='']), orig_score)
    tmp_df = pd.DataFrame(
        {
            "Agent Trajectory":[dss['content'] for dss in ds]  + ['**Total Score**'],
            "Trajectory Score":trj_scores + [sum([float(k) for k in trj_scores if k!=''])/len([k for k in trj_scores if k!=''])],
            "Original Score":[""]*len(trj_scores) + [orig_score],
        }
    )
    df = pd.concat([df, tmp_df], axis=0)
    df.to_csv("outputs/Llama-2-7B.csv", index=False)

  0%|          | 0/60 [00:00<?, ?it/s]

0.26666666666666666 0
0.3333333333333333 0
0.08235294117647059 0
0.13333333333333333 0
0.5333333333333333 0
0.37647058823529417 0
0.5 0
0.2 0
0.6666666666666666 0
1.7 0
0.26666666666666666 0
0.6 5
0.39999999999999997 0
0.8666666666666667 0
0.3333333333333333 0
0.6 0
0.8 0
0.6666666666666666 5
0.8 0
0.5333333333333333 0
0.7 0
0.5 0
0.8 0
0.8666666666666667 0
0.7 0
0.5333333333333333 0
0.5058823529411764 0
0.8 0
0.5 0
0.8 0
0.4666666666666666 5


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


0.6799999999999999 0
0.5 0
0.8000000000000002 0
0.5 0
0.6000000000000001 0
0.9 0
0.9333333333333332 0
0.9 0
0.7333333333333334 0
0.45 5
0.8 0
0.8 0
0.7 0


InvalidRequestError: This model's maximum context length is 128000 tokens. However, your messages resulted in 128810 tokens. Please reduce the length of the messages.