In [2]:
import json
import torch

from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("RLHFlow/ArmoRM-Llama3-8B-v0.1", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained("RLHFlow/ArmoRM-Llama3-8B-v0.1", trust_remote_code=True, device_map=0, torch_dtype=torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [111]:
def armo_score(file_path, tokenizer, model):
    responses = [json.loads(line) for line in open(file_path, "r").readlines()]
    return_scores = torch.tensor([0.0]*19)
    for response in tqdm(responses, total=len(responses), desc="Score..."):
        tmp_query = response['input'][response['input'].find("\nuser\n"):].replace("\nuser\n", "<|split|>").replace("\nassistant\n", "<|split|>").replace("\nassistant", "").split("<|split|>")
        if len(tmp_query[0]) == 0:
            tmp_query = tmp_query[1:]
        tmp_res = response['output_temp_1'] if 'output_temp_1' in response.keys() else response['output_0']
        messages = [{"role": "user", "content": prompt} if idx%2 == 1 else {"role": "assistant", "content": prompt} for idx, prompt in enumerate(tmp_query)]
        messages += [{"role": "assistant", "content": tmp_res}]
        input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(0)
        with torch.no_grad():
            output = model(input_ids)
            return_scores += output.rewards.cpu().float()[0]
    return_scores /= len(responses)
    print("helpsteer-helpfulness: ", round(return_scores[0].item(), 4))
    print("helpsteer-correctness: ", round(return_scores[1].item(), 4))
    print("helpsteer-coherence: ", round(return_scores[2].item(), 4))
    print("beavertails-is_safe: ", round(return_scores[10].item(), 4))
    return return_scores

In [114]:
file_path = # Entering your result path
score = armo_score(file_path, tokenizer, model)

Score...: 100%|██████████| 793/793 [00:47<00:00, 16.80it/s]

helpsteer-helpfulness:  0.6061
helpsteer-correctness:  0.5843
helpsteer-coherence:  0.651
beavertails-is_safe:  0.9961





In [117]:
def armo_score(file_path, test_dataset, tokenizer, model):
    responses = [json.loads(line) for line in open(file_path, "r").readlines()]
    return_scores = torch.tensor([0.0]*19)
    for idx, response in tqdm(enumerate(responses), total=len(responses), desc="Score..."):
        tmp_query = test_dataset[idx]['query'].replace("<|im_end|>\n<|im_start|>user\n", "<|split|>").replace("<|im_start|>user\n", "<|split|>").replace("<|im_end|>\n<|im_start|>assistant\n", "<|split|>").split("<|split|>")
        tmp_query = [tmp for tmp in tmp_query if len(tmp) != 0 ]
        tmp_res = response['response']
        messages = [{"role": "user", "content": prompt} if idx%2 == 1 else {"role": "assistant", "content": prompt} for idx, prompt in enumerate(tmp_query)]
        messages += [{"role": "assistant", "content": tmp_res}]
        input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(0)
        with torch.no_grad():
            output = model(input_ids)
            return_scores += output.rewards.cpu().float()[0]
    return_scores /= len(responses)
    print("helpsteer-helpfulness: ", round(return_scores[0].item(), 4))
    print("helpsteer-correctness: ", round(return_scores[1].item(), 4))
    print("helpsteer-coherence: ", round(return_scores[2].item(), 4))
    print("beavertails-is_safe: ", round(return_scores[10].item(), 4))
    return return_scores

In [123]:
file_path =  # Entering your test file path
test_dataset = [json.loads(line) for line in open(file_path, "r").readlines()]
file_path =  # Entering your result path
score = armo_score(file_path, test_dataset, tokenizer, model)

Score...:   0%|          | 0/793 [00:00<?, ?it/s]

Score...: 100%|██████████| 793/793 [00:55<00:00, 14.41it/s]

helpsteer-helpfulness:  0.8311
helpsteer-correctness:  0.7942
helpsteer-coherence:  0.8197
beavertails-is_safe:  0.9734





In [115]:
def armo_score(test_dataset, tokenizer, model):
    return_scores = torch.tensor([0.0]*19)
    for idx, response in tqdm(enumerate(test_dataset), total=len(test_dataset), desc="Score..."):
        tmp_query = response['query'].replace("<|im_end|>\n<|im_start|>user\n", "<|split|>").replace("<|im_start|>user\n", "<|split|>").replace("<|im_end|>\n<|im_start|>assistant\n", "<|split|>").split("<|split|>")
        tmp_query = [tmp for tmp in tmp_query if len(tmp) != 0 ]
        tmp_res = response['chosen']
        messages = [{"role": "user", "content": prompt} if idx%2 == 1 else {"role": "assistant", "content": prompt} for idx, prompt in enumerate(tmp_query)]
        messages += [{"role": "assistant", "content": tmp_res}]
        input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(0)
        with torch.no_grad():
            output = model(input_ids)
            return_scores += output.rewards.cpu().float()[0]
    return_scores /= len(test_dataset)
    print("helpsteer-helpfulness: ", round(return_scores[0].item(), 4))
    print("helpsteer-correctness: ", round(return_scores[1].item(), 4))
    print("helpsteer-coherence: ", round(return_scores[2].item(), 4))
    print("beavertails-is_safe: ", round(return_scores[10].item(), 4))
    return return_scores

In [116]:
score = armo_score(test_dataset, tokenizer, model)

Score...: 100%|██████████| 793/793 [00:56<00:00, 14.04it/s]

helpsteer-helpfulness:  0.748
helpsteer-correctness:  0.7338
helpsteer-coherence:  0.7344
beavertails-is_safe:  1.0323





In [64]:
tmp_query = safety_response[1]['input'][safety_response[1]['input'].find("\nuser\n"):].replace("\nuser\n", "<|split|>").replace("\nassistant\n", "<|split|>").replace("\nassistant", "").split("<|split|>")[1:]

In [65]:
prompts = tmp_query
response = safety_response[0]['output_temp_1']
messages = [{"role": "user", "content": prompt} if idx%2 == 1 else {"role": "assistant", "content": prompt} for idx, prompt in enumerate(prompts)]
messages += [{"role": "assistant", "content": response}]
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(0)

with torch.no_grad():
    output = model(input_ids)
    multi_obj_rewards = output.rewards.cpu().float()

In [66]:
torch.gather(multi_obj_rewards[0], dim=0, index=torch.tensor([0,10]))

tensor([0.5078, 0.8516])

In [82]:
multi_obj_rewards[0][0].item()

0.5078125

In [43]:
obj_transform = model.reward_transform_matrix.data.cpu().float()
# The final coefficients assigned to each reward objective
multi_obj_coeffs = gating_output @ obj_transform.T
# The preference score is the linear combination of the multi-objective rewards with
# the multi-objective coefficients, which can be verified by the following assertion
assert torch.isclose(torch.sum(multi_obj_rewards * multi_obj_coeffs, dim=1), preference_score, atol=1e-3) 
# Find the top-K reward objectives with coefficients of the highest magnitude
# K = 19
# top_obj_dims = torch.argsort(torch.abs(multi_obj_coeffs), dim=1, descending=True,)[:, :K]
# top_obj_coeffs = torch.gather(multi_obj_coeffs, dim=1, index=top_obj_dims)

# The attributes of the 19 reward objectives
attributes = ['helpsteer-helpfulness','helpsteer-correctness','helpsteer-coherence',
   'helpsteer-complexity','helpsteer-verbosity','ultrafeedback-overall_score',
   'ultrafeedback-instruction_following', 'ultrafeedback-truthfulness',
   'ultrafeedback-honesty','ultrafeedback-helpfulness','beavertails-is_safe',
   'prometheus-score','argilla-overall_quality','argilla-judge_lm','code-complexity',
   'code-style','code-explanation','code-instruction-following','code-readability']

example_index = 0
for i in range(19):
   if i in [0, 10]:
      attribute = attributes[i]
      coeff = multi_obj_coeffs[example_index, i].item()
      print(f"{attribute}: {round(coeff,5)}")
# code-complexity: 0.19922
# helpsteer-verbosity: -0.10864
# ultrafeedback-instruction_following: 0.07861

# The actual rewards of this example from the HelpSteer dataset
# are [3,3,4,2,2] for the five helpsteer objectives: 
# helpfulness, correctness, coherence, complexity, verbosity
# We can linearly transform our predicted rewards to the 
# original reward space to compare with the ground truth
helpsteer_rewards_pred = multi_obj_rewards[0, :5] * 5 - 0.5
print(helpsteer_rewards_pred)

helpsteer-helpfulness: 0.01459
beavertails-is_safe: 1e-05
tensor([2.3711, 2.2344, 2.6055, 1.5703, 2.8398])


In [50]:
torch.gather(multi_obj_rewards[0], dim=0, index=torch.tensor([0,10]))

tensor([0.5742, 0.8125])

In [51]:
multi_obj_rewards

tensor([[0.5742, 0.5469, 0.6211, 0.4141, 0.6680, 0.4629, 0.4141, 0.4785, 0.5195,
         0.4414, 0.8125, 0.1118, 0.2988, 0.3574, 0.5391, 0.4551, 0.3652, 0.3672,
         0.4590]])

In [41]:
multi_obj_coeffs

tensor([[ 1.4587e-02,  3.7020e-08,  2.4214e-07,  2.7954e-02, -8.5252e-02,
          1.7071e-04,  4.5166e-02,  8.2655e-09,  5.8105e-02,  1.8626e-08,
          1.4007e-05,  5.7742e-07,  1.2436e-03,  4.5471e-03,  6.9824e-02,
          2.9057e-07,  2.9355e-06,  2.0447e-03,  3.4692e-08]])

In [17]:
print(tokenizer.decode(input_ids[0]))

<|start_header_id|>user<|end_header_id|>

Human: What's the most popular comedy podcast in the US?

Assistant:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

According to the latest list of most popular podcasts by PodcastAlly, "The Joe Rogan Experience" has the highest ratings, but "Conan O'Brien Needs a Friend" and "The Daily Show with Trevor Noah" are also very popular shows that have been highly rated by audiences in the past years. However, the most popular comedy podcasts vary depending on regional preferences, audience tastes, and current events. More recent popular choices are "The Daily Show", "The Joe Rogан Experience", "Last Week Tonight with John Oliver", "Talk Soup: The Podcast for Dumb People", and "The Ringer Uptime". These topics, perspectives, and hosts have been well-received depending on which genre of comedy you enjoy. 

Would you like to know more about these podcasts or perhaps other comedy podcasts? 

Here's a list of the most popular comedies I found: 


In [10]:
rm_tokenizer = AutoTokenizer.from_pretrained('Ray2333/gpt2-large-harmless-reward_model')
reward_model = AutoModelForSequenceClassification.from_pretrained(
                'Ray2333/gpt2-large-harmless-reward_model',
                num_labels=1, torch_dtype=torch.bfloat16,
                device_map=0,
                )

In [11]:
file_path = "../data/output/Qwen/Qwen2.5-3B/v01.2/dpo/3/safety.jsonl"
safety_response = [json.loads(line) for line in open(file_path, "r").readlines()]
safety_conv_ids = [sample['conv_id'] for sample in safety_response]
querys = [sample['input'][sample['input'].find("\nuser\n"):].replace("\nuser\n", "\n\nHuman: ").replace("\nassistant\n", "\n\nAssistant: ").replace("\nassistant", "\n\nAssistant: ") for sample in safety_response]

In [14]:
safe_score = 0
max_length = 1024
for idx, query in tqdm(enumerate(querys), total=len(querys)):
  a_safe = safety_response[idx]['output_temp_1']
  inputs_a_safe = rm_tokenizer(query, a_safe, return_tensors='pt')
  if inputs_a_safe["input_ids"].shape[1] >= max_length:
    inputs_a_safe["input_ids"] = inputs_a_safe["input_ids"][0][-max_length:].unsqueeze(0)
    inputs_a_safe["attention_mask"] = inputs_a_safe["attention_mask"][0][-max_length:].unsqueeze(0)
  with torch.no_grad():
    reward_a_safe = reward_model(**(inputs_a_safe.to(0))).logits[0].cpu().detach().item()
  safe_score+=reward_a_safe


  5%|▍         | 39/786 [00:00<00:16, 44.85it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1094 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 786/786 [00:17<00:00, 44.26it/s]


In [15]:
safe_score/786

0.037714640299479164

In [16]:
file_path = "../data/test/test_dataset_v01.2.jsonl"
test_data = [json.loads(line) for line in open(file_path, "r").readlines()]
querys = [sample['query'].replace("<|im_end|>\n<|im_start|>user\n", "\n\nHuman: ").replace("<|im_start|>user\n", "\n\nHuman: ").replace("<|im_end|>\n<|im_start|>assistant\n", " \n\nAssistant:") for sample in test_data]

In [17]:
safe_score = 0
max_length = 1024
sample_num = 0
tmp_response = [json.loads(line) for line in open("../data/eval_output/ground_true.jsonl", "r").readlines()]
for idx, query in tqdm(enumerate(querys), total=len(querys)):
  # if tmp_response[idx]['conv_id'] not in safety_conv_ids:
  #   continue
  a_safe = tmp_response[idx]['response']
  inputs_a_safe = rm_tokenizer(query, a_safe, return_tensors='pt')
  if inputs_a_safe["input_ids"].shape[1] >= max_length:
    inputs_a_safe["input_ids"] = inputs_a_safe["input_ids"][0][-max_length:].unsqueeze(0)
    inputs_a_safe["attention_mask"] = inputs_a_safe["attention_mask"][0][-max_length:].unsqueeze(0)
  with torch.no_grad():
    reward_a_safe = reward_model(**(inputs_a_safe.to(0))).logits[0].cpu().detach().item()
  safe_score+=reward_a_safe
  sample_num+=1

  0%|          | 0/793 [00:00<?, ?it/s]

100%|██████████| 793/793 [00:18<00:00, 43.61it/s]


In [18]:
safe_score/sample_num 

-0.5583965017696347

In [262]:
file_path = "../data/output/Qwen/Qwen2.5-3B/v01.2/base/base.jsonl"
base_file = [json.loads(line) for line in open(file_path, "r").readlines()]
querys = [sample['input'].replace("\nuser\n", "\n\nHuman: ").replace("user\n", "\n\nHuman: ").replace("\nassistant\n", "\n\nAssistant: ").replace("\nassistant", "\n\nAssistant: ") for sample in base_file]

In [263]:
safe_score = 0
max_length = 256
sample_num = 0
for idx, query in tqdm(enumerate(querys), total=len(querys)):
  if base_file[idx]['conv_id'] not in safety_conv_ids:
    continue
  a_safe = base_file[idx]['output_0']
  inputs_a_safe = rm_tokenizer(query, a_safe, return_tensors='pt')
  if inputs_a_safe["input_ids"].shape[1] >= max_length:
    inputs_a_safe["input_ids"] = inputs_a_safe["input_ids"][0][-max_length:].unsqueeze(0)
    inputs_a_safe["attention_mask"] = inputs_a_safe["attention_mask"][0][-max_length:].unsqueeze(0)
  with torch.no_grad():
    reward_a_safe = reward_model(**(inputs_a_safe.to(0))).logits[0].cpu().detach().item()
  safe_score+=reward_a_safe
  sample_num+=1

100%|██████████| 793/793 [00:25<00:00, 30.84it/s]


In [264]:
safe_score/728


0.09006802066341861

In [270]:
file_path = "../data/output/Qwen/Qwen2.5-3B/v01.2/dpo/5/base.jsonl"
dpo_file = [json.loads(line) for line in open(file_path, "r").readlines()]
querys = [sample['input'].replace("\nuser\n", "\n\nHuman: ").replace("user\n", "\n\nHuman: ").replace("\nassistant\n", "\n\nAssistant: ").replace("\nassistant", "\n\nAssistant: ") for sample in dpo_file]

In [271]:
safe_score = 0
max_length = 256
sample_num = 0
for idx, query in tqdm(enumerate(querys), total=len(querys)):
  if dpo_file[idx]['conv_id'] not in safety_conv_ids:
    continue
  a_safe = dpo_file[idx]['output_0']
  inputs_a_safe = rm_tokenizer(query, a_safe, return_tensors='pt')
  if inputs_a_safe["input_ids"].shape[1] >= max_length:
    inputs_a_safe["input_ids"] = inputs_a_safe["input_ids"][0][-max_length:].unsqueeze(0)
    inputs_a_safe["attention_mask"] = inputs_a_safe["attention_mask"][0][-max_length:].unsqueeze(0)
  with torch.no_grad():
    reward_a_safe = reward_model(**(inputs_a_safe.to(0))).logits[0].cpu().detach().item()
  safe_score+=reward_a_safe
  sample_num+=1

100%|██████████| 793/793 [00:25<00:00, 30.84it/s]


In [272]:
safe_score/728

0.6545750963818896

In [273]:
file_path = "../data/output/Qwen/Qwen2.5-3B/v01.2/sft/5/base.jsonl"
sft_file = [json.loads(line) for line in open(file_path, "r").readlines()]
querys = [sample['input'].replace("\nuser\n", "\n\nHuman: ").replace("user\n", "\n\nHuman: ").replace("\nassistant\n", "\n\nAssistant: ").replace("\nassistant", "\n\nAssistant: ") for sample in sft_file]

In [274]:
safe_score = 0
max_length = 256
sample_num = 0
for idx, query in tqdm(enumerate(querys), total=len(querys)):
  if sft_file[idx]['conv_id'] not in safety_conv_ids:
    continue
  a_safe = sft_file[idx]['output_0']
  inputs_a_safe = rm_tokenizer(query, a_safe, return_tensors='pt')
  if inputs_a_safe["input_ids"].shape[1] >= max_length:
    inputs_a_safe["input_ids"] = inputs_a_safe["input_ids"][0][-max_length:].unsqueeze(0)
    inputs_a_safe["attention_mask"] = inputs_a_safe["attention_mask"][0][-max_length:].unsqueeze(0)
  with torch.no_grad():
    reward_a_safe = reward_model(**(inputs_a_safe.to(0))).logits[0].cpu().detach().item()
  safe_score+=reward_a_safe
  sample_num+=1

100%|██████████| 793/793 [00:25<00:00, 30.94it/s]


In [275]:
safe_score/sample_num

0.6921588562347076