In [1]:
import streamlit as st
import os
import re
import json
import pandas as pd
from tqdm.auto import tqdm
from getpass import getpass 
from langchain.schema import HumanMessage, AIMessage, SystemMessage
from langchain.chat_models import ChatOpenAI

import torch
from transformers import AutoTokenizer, LlamaForCausalLM

from fastchat.conversation import get_conv_template

In [2]:
messages = {}
messages['instruction'] = """### Task Description: An instruction (might include an Input inside it), a response to evaluate and a score rubric representing a evaluation criteria are given. 
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general. 
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric. 
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)" 
4. Please do not generate any other opening, closing, and explanations. 

### Instruction to evaluate:
The instruction to evaluate:  <history><HISTORY_HERE></history>

### Response to evaluate:
Agent's response: <response><AGENT_RESPONSE_TO_SCORE></response>
User's feedback: <user><USER_RESPONSE></user>

### Score Rubric: [Evaluating the Model's Progression Towards Task Completion]
Score 1: The model completely fails to function as a multi-turn agent, as shown by it taking multiple actions in a single response.
Score 2: The model entirely fails to adhere to the correct action syntax as directed by the user, utilizing incorrect action formats.
Score 3: The model does not acknowledge its previous errors (if any) and repeats actions. Alternatively, the model executes an incorrect action.
Score 4: The model perfectly executes the correct action or accurately acknowledges its past mistakes (if any).
Score 5: The model not only executes the correct action flawlessly but also delivers engaging responses that are specifically tailored to the user's tone and level of understanding.

### Feedback:"""

In [3]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = LlamaForCausalLM.from_pretrained("kaist-ai/Prometheus-13b-v1.0", device_map="auto")

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


In [9]:
file_path, idx = "./data/os-interaction/Mistral-7B-v0.1.jsonl", 1

conversations = [] 
with open(file_path, "r") as file:
    for line in file:
        json_object = json.loads(line)
    conversations.append(json_object)
trajectory = conversations[index]['output']['history']

In [12]:
history = [f"User: {ds['content'].strip()}\n" if ds['role'] == 'user' else f"Agent: {ds['content'].strip()}\n" for ds in trajectory[:idx]]
history = '\n'.join(history).strip()
response = trajectory[idx]['content']
user = trajectory[idx+1]['content'] if idx < len(trajectory)-1 else ' '

messages['instruction'] = messages['instruction'].replace("<HISTORY_HERE>", history)
messages['instruction'] = messages['instruction'].replace("<AGENT_RESPONSE_TO_SCORE>", response)
messages['instruction'] = messages['instruction'].replace("<USER_RESPONSE>", user)

print("-"*100)
print(messages['instruction'])
print("-"*100)

conv = get_conv_template("llama-2")
conv.set_system_message("You are a fair evaluator language model.")
conv.append_message(conv.roles[0], messages['instruction'])
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, do_sample=True, temperature=1.0, top_p=0.9, max_new_tokens=512, repetition_penalty=1.03)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the decoded text
print(decoded_output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


----------------------------------------------------------------------------------------------------
### Task Description: An instruction (might include an Input inside it), a response to evaluate and a score rubric representing a evaluation criteria are given. 
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general. 
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric. 
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)" 
4. Please do not generate any other opening, closing, and explanations. 

### Instruction to evaluate:
The instruction to evaluate:  <history>User: You are an assistant that will act like a person, I'will play the role of linux(ubuntu) operating system. Your goal is to implement the operations required by me or answer to the question proposed 