## Process LLM concept labels

In [10]:
from datasets import Dataset
import json
import pandas as pd

# Locations of the raw concept scores for the chosen labelling model.
root = "/mnt/pdata/knk25/active_pref_learning"
model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
data_path = f"{root}/datasets/ultrafeedback/labels/{model_name}"

# Load the raw concept scores: one JSON object per line.
# This used to sit inside `for i in range(1, 6)` without ever using `i`, so the
# same file was parsed five times and the extra copies were only discarded by
# the de-duplication below. Reading it once produces the same frame.
# NOTE(review): if the loop was meant to read five separate shard files, the
# path needs to include the shard index — confirm against the data layout.
with open(f"{data_path}/raw_concept_scores.jsonl", "r") as f:
    lines = [json.loads(line) for line in f]

df = pd.DataFrame(lines)

# Keep only the first record for each (idx_a, idx_b) pair.
df = df.drop_duplicates(subset=["idx_a", "idx_b"])


In [11]:
# Use the first row's score mapping as the reference list of concept names.
first_row_scores = df['concept_scores'].iloc[0]
concept_names = tuple(first_row_scores.keys())

print(concept_names)

('helpfulness', 'correctness', 'coherence', 'complexity', 'verbosity', 'instruction_following', 'truthfulness', 'honesty', 'safety', 'readability')


In [12]:
import numpy as np 

# Flatten each row's concept-score mapping into an array of its values.
# NOTE(review): this relies on every row listing its concepts in the same order
# as `concept_names` (taken from the first row) — confirm for the raw data.
df['relative_concept_labels'] = df['concept_scores'].apply(
    lambda x: np.array(list(x.values()))
)

# Build the output frame with `assign`, which returns a new DataFrame, rather
# than adding a column to a slice of `df` (that pattern triggers pandas'
# SettingWithCopyWarning). Every row carries the same tuple of concept names.
labels_df = df[['idx_a', 'idx_b', 'relative_concept_labels']].assign(
    concept_names=[concept_names] * len(df)
)

display(labels_df)

# Convert to a Hugging Face dataset and write it to disk.
labels_ds = Dataset.from_pandas(labels_df)
labels_ds.save_to_disk(f"{root}/datasets/ultrafeedback/concept_labels/{model_name}")

Unnamed: 0,idx_a,idx_b,relative_concept_labels,concept_names
0,0,1,"[1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 1.0, ...","(helpfulness, correctness, coherence, complexi..."
1,0,2,"[1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 1.0, ...","(helpfulness, correctness, coherence, complexi..."
2,0,3,"[0.9, 0.9, 0.9, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, ...","(helpfulness, correctness, coherence, complexi..."
3,1,2,"[0.7, 0.7, 0.8, 0.6, 0.8, 0.5, 0.7, 0.7, 0.5, ...","(helpfulness, correctness, coherence, complexi..."
4,1,3,"[0.8, 0.8, 0.8, 0.8, 0.9, 0.2, 0.8, 0.8, 0.5, ...","(helpfulness, correctness, coherence, complexi..."
...,...,...,...,...
45400,30358,30360,"[0.5, 0.5, 0.5, 0.4, 0.6, 0.5, 0.5, 0.5, 0.5, ...","(helpfulness, correctness, coherence, complexi..."
45401,30359,30360,"[0.8, 0.9, 0.8, 0.7, 0.6, 0.9, 0.9, 0.9, 0.9, ...","(helpfulness, correctness, coherence, complexi..."
45402,30361,30362,"[0.7, 0.5, 0.8, 0.6, 0.9, 0.8, 0.5, 0.5, 0.5, ...","(helpfulness, correctness, coherence, complexi..."
45403,30361,30363,"[0.7, 0.6, 0.8, 0.5, 0.9, 0.8, 0.6, 0.6, 0.5, ...","(helpfulness, correctness, coherence, complexi..."


Saving the dataset (1/1 shards): 100%|██████████| 45405/45405 [00:00<00:00, 489975.93 examples/s]
