## Step 1: Get the datasets of prompts and responses

In [75]:
from datasets import Dataset, load_from_disk, load_dataset
import pandas as pd
import numpy as np 

# Project root on the shared data volume; the dataset paths in later cells are built from it.
root = "/mnt/pdata/knk25/active_pref_learning"

# Seed NumPy's legacy global RNG so np.random calls in later cells are repeatable.
seed = 42
np.random.seed(seed)

In [76]:
# from datasets import Dataset
import json
# import pandas as pd
import glob

# `root` comes from the configuration cell at the top of the notebook; it is not
# redefined here so the two cells cannot drift apart.
model_name = "gpt4o-eus2-202407"
data_path = f"{root}/datasets/ultrafeedback/concept_labels/{model_name}"

# Collect every shard named scores-<i>-of-<n>.jsonl; sorting keeps the load order deterministic.
jsonl_files = sorted(glob.glob(f"{data_path}/scores-*-of-*.jsonl"))
if not jsonl_files:
    # Fail here with the offending path; otherwise the empty frame only surfaces
    # as a KeyError on idx_a/idx_b in drop_duplicates below.
    raise FileNotFoundError(f"No scores-*-of-*.jsonl files found in {data_path}")

# Parse one JSON record per non-empty line across all shards.
lines = []
for file_path in jsonl_files:
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of crashing in json.loads.
                continue
            lines.append(json.loads(line))

# One row per record.
df_scores = pd.DataFrame(lines)

# A pair (idx_a, idx_b) may appear in more than one record; keep its first occurrence.
df_scores = df_scores.drop_duplicates(subset=["idx_a", "idx_b"])

display(df_scores)


Unnamed: 0,idx_a,idx_b,scores
0,0,1,"{'helpfulness': 1.0, 'correctness': 1.0, 'cohe..."
1,4,7,"{'helpfulness': 0.8, 'correctness': 0.9, 'cohe..."
2,9,11,"{'helpfulness': 0.8, 'correctness': 0.9, 'cohe..."
3,16,17,"{'helpfulness': 0.3, 'correctness': 0.9, 'cohe..."
4,20,23,"{'helpfulness': 0.1, 'correctness': 0.1, 'cohe..."
...,...,...,...
378702,253899,253900,"{'helpfulness': 0, 'correctness': 0.2, 'cohere..."
378703,253903,253906,"{'helpfulness': 0.3, 'correctness': 0.4, 'cohe..."
378704,253908,253910,"{'helpfulness': 0.1, 'correctness': 0.1, 'cohe..."
378705,253915,253916,"{'helpfulness': 0.8, 'correctness': 0.9, 'cohe..."


In [77]:
# Take the concept names, and their canonical order, from the first row's score dict.
# If the first row is not a dict (e.g. this cell was already run), concept_names is None.
first_scores = df_scores['scores'].iloc[0]
concept_names = list(first_scores.keys()) if isinstance(first_scores, dict) else None


def _scores_in_concept_order(entry):
    """Convert one row's score dict to a list aligned with `concept_names`.

    Values are looked up by concept name rather than taken in the row's own
    key order, so a row whose keys are ordered differently from the first row
    still lines up with `concept_names`. A concept that is absent (or null) in
    the row becomes NaN; keys not present in `concept_names` are ignored.
    Non-dict entries are returned unchanged.
    """
    if not isinstance(entry, dict):
        return entry
    if concept_names is None:
        # No reference order available: fall back to the row's own order.
        return list(entry.values())
    ordered = []
    for name in concept_names:
        value = entry.get(name)
        ordered.append(np.nan if value is None else value)
    return ordered


# Replace each score dict by its list of values in concept order.
df_scores['scores'] = df_scores['scores'].apply(_scores_in_concept_order)

display(df_scores)


Unnamed: 0,idx_a,idx_b,scores
0,0,1,"[1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 0.5, ..."
1,4,7,"[0.8, 0.9, 0.8, 0.7, 0.7, 0.9, 0.9, 0.8, 0.5, ..."
2,9,11,"[0.8, 0.9, 0.85, 0.6, 0.1, 0.7, 0.85, 0.85, 0...."
3,16,17,"[0.3, 0.9, 0.5, 0.4, 0.7, 0.8, 0.9, 0.7, 0.5, ..."
4,20,23,"[0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, ..."
...,...,...,...
378702,253899,253900,"[0, 0.2, 0, 0, 0, 0, 0.2, 0.2, 0.5, 0]"
378703,253903,253906,"[0.3, 0.4, 0.3, 0.6, 0.4, 0.2, 0.5, 0.5, 0.5, ..."
378704,253908,253910,"[0.1, 0.1, 0.1, 0.8, 0.1, 0.0, 0.1, 0.5, 0.5, ..."
378705,253915,253916,"[0.8, 0.9, 0.7, 0.9, 0.9, 0.9, 0.8, 0.7, 0.5, ..."


## Step 3: Concept Labels

In [78]:
# Stack the per-row score lists into a single array (one row per pair).
# NaN entries are replaced by the sentinel -1.0, which marks "no concept label";
# anything consuming these labels has to mask that sentinel.
# NOTE(review): no rescaling is performed in this cell; values are stored as loaded.
score_matrix = np.stack(df_scores['scores'].values)
is_missing = np.isnan(score_matrix)
rel_concept_labels = np.where(is_missing, -1.0, score_matrix)
df_scores['relative_concept_labels'] = list(rel_concept_labels)

In [80]:
def check_invalid(scores):
    """Return True if any score is below 0, above 1, or missing (NaN)."""
    values = pd.Series(scores)
    out_of_range = (values < 0) | (values > 1)
    return (out_of_range | values.isna()).any()

# Flag every pair whose label vector fails the validity check.
df_scores["has_invalid"] = df_scores["relative_concept_labels"].map(check_invalid)

# Pull the flagged rows out for inspection.
invalid_mask = df_scores["has_invalid"]
invalid_rows = df_scores.loc[invalid_mask]

invalid_rows

Unnamed: 0,idx_a,idx_b,scores,relative_concept_labels,has_invalid
7,36,39,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",True
125,669,671,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",True
173,929,930,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",True
200,1084,1086,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",True
256,1390,1392,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",True
...,...,...,...,...,...
378148,250942,250943,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",True
378551,253091,253092,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",True
378598,253342,253343,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",True
378635,253539,253540,"[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1....",True


In [70]:
# Write the concept labels out as a Hugging Face dataset.
# Each row carries the pair indices, its label vector, and the shared list of concept names.
concept_labels_dir = f"{root}/datasets/ultrafeedback/concept_labels/{model_name}"
labels_df = df_scores.loc[:, ['idx_a', 'idx_b', 'relative_concept_labels']].copy()
labels_df['concept_names'] = [concept_names for _ in range(len(labels_df))]
display(labels_df.head())
labels_ds = Dataset.from_pandas(labels_df)
labels_ds.save_to_disk(concept_labels_dir)

Unnamed: 0,idx_a,idx_b,relative_concept_labels,concept_names
0,0,1,"[1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 0.5, ...","[helpfulness, correctness, coherence, complexi..."
1,4,7,"[0.8, 0.9, 0.8, 0.7, 0.7, 0.9, 0.9, 0.8, 0.5, ...","[helpfulness, correctness, coherence, complexi..."
2,9,11,"[0.8, 0.9, 0.85, 0.6, 0.1, 0.7, 0.85, 0.85, 0....","[helpfulness, correctness, coherence, complexi..."
3,16,17,"[0.3, 0.9, 0.5, 0.4, 0.7, 0.8, 0.9, 0.7, 0.5, ...","[helpfulness, correctness, coherence, complexi..."
4,20,23,"[0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, ...","[helpfulness, correctness, coherence, complexi..."


Saving the dataset (1/1 shards): 100%|██████████| 378707/378707 [00:00<00:00, 1372302.14 examples/s]


## Step 4: Preference labels

In [None]:
# Reward of a pair = mean of its valid concept scores (all concepts, equally weighted).
# Missing labels (NaN or the negative sentinel -1.0) are masked out, not averaged in.
def reward(scores):
    """Return the mean of the valid concept scores for one pair.

    Parameters
    ----------
    scores : array-like of float
        Concept scores for one pair. NaN entries and negative entries (the
        -1.0 "no label" sentinel) are treated as missing.

    Returns
    -------
    float
        Mean over the non-missing entries, or NaN when every entry is missing.
        Note that NaN compares False against any threshold.
    """
    scores = np.asarray(scores, dtype=float)
    # Drop NaN first so the sign test below never compares against NaN.
    valid = scores[~np.isnan(scores)]
    valid = valid[valid >= 0]
    if valid.size == 0:
        # Nothing to average: report "no reward" rather than a fabricated value.
        return np.nan
    return valid.mean()

def get_preference_label(reward):
    """Map a scalar reward to a preference label using 0.5 as the threshold.

    Returns 1 when the reward is above 0.5, 0 when it is below 0.5, and the
    soft label 0.5 otherwise. "Otherwise" covers an exact 0.5 and also a NaN
    reward, since both comparisons are False for NaN.
    """
    if reward > 0.5:
        return 1
    if reward < 0.5:
        return 0
    return 0.5  # soft label applied to draws

# Score every pair from its concept-label vector.
rewards = labels_df['relative_concept_labels'].apply(reward)

# Translate each reward into its preference label, row by row.
labels_df['preference_label'] = [get_preference_label(r) for r in rewards]
labels_df

Unnamed: 0,idx_a,idx_b,relative_concept_labels,concept_names,preference_label
0,0,1,"[1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 0.5, ...","[helpfulness, correctness, coherence, complexi...",1.0
1,4,7,"[0.8, 0.9, 0.8, 0.7, 0.7, 0.9, 0.9, 0.8, 0.5, ...","[helpfulness, correctness, coherence, complexi...",1.0
2,9,11,"[0.8, 0.9, 0.85, 0.6, 0.1, 0.7, 0.85, 0.85, 0....","[helpfulness, correctness, coherence, complexi...",1.0
3,16,17,"[0.3, 0.9, 0.5, 0.4, 0.7, 0.8, 0.9, 0.7, 0.5, ...","[helpfulness, correctness, coherence, complexi...",1.0
4,20,23,"[0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, ...","[helpfulness, correctness, coherence, complexi...",0.0
...,...,...,...,...,...
378702,253899,253900,"[0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.2, 0.2, 0.5, ...","[helpfulness, correctness, coherence, complexi...",0.0
378703,253903,253906,"[0.3, 0.4, 0.3, 0.6, 0.4, 0.2, 0.5, 0.5, 0.5, ...","[helpfulness, correctness, coherence, complexi...",0.0
378704,253908,253910,"[0.1, 0.1, 0.1, 0.8, 0.1, 0.0, 0.1, 0.5, 0.5, ...","[helpfulness, correctness, coherence, complexi...",0.0
378705,253915,253916,"[0.8, 0.9, 0.7, 0.9, 0.9, 0.9, 0.8, 0.7, 0.5, ...","[helpfulness, correctness, coherence, complexi...",1.0


In [None]:
# Write the preference labels out as a Hugging Face dataset.
# labels_df is narrowed to the three saved columns here, so the labelling cell
# (which reads 'relative_concept_labels' from labels_df) cannot be re-run after this one.
preference_labels_dir = f"{root}/datasets/ultrafeedback/preference_labels/{model_name}"
labels_df = labels_df.loc[:, ['idx_a', 'idx_b', 'preference_label']]
labels_ds = Dataset.from_pandas(labels_df)
labels_ds.save_to_disk(preference_labels_dir)

Saving the dataset (1/1 shards): 100%|██████████| 378707/378707 [00:00<00:00, 3740943.95 examples/s]


In [73]:
# Display the current labels_df for a final visual check.
labels_df

Unnamed: 0,idx_a,idx_b,preference_label
0,0,1,1.0
1,4,7,1.0
2,9,11,1.0
3,16,17,1.0
4,20,23,0.0
...,...,...,...
378702,253899,253900,0.0
378703,253903,253906,0.0
378704,253908,253910,0.0
378705,253915,253916,1.0
