In [1]:
import json
import random
import os
from pathlib import Path
random.seed(42)
from itertools import combinations
# Names of the sub-datasets to merge; each one is looked up in its own
# sub-directory of `data_dir`.
mix_datasets = ['hh_rlhf', "summary_from_feedback", "webgpt", "synthetic-instruct-gptj-pairwise", "chatbot_arena_conv", "ultra_feedback_clean", "nectar"]
data_dir = Path("./sub_datasets")

# Maps a split name ("train" / "val") to the concatenation of that split's
# examples across every sub-dataset that provides it.
unified_data = {}
for set_name in ['train', 'val']:
    total_data = []
    print(f"Loading {set_name}")
    for dataset in mix_datasets:
        file_name = data_dir / dataset / f"{set_name}_data_prepared.json"
        # A sub-dataset without a file for this split is skipped without any
        # message, so the "Loaded" lines below are the only record of what
        # actually went into each split.
        if not file_name.exists():
            continue
        # Read as UTF-8 explicitly instead of relying on the platform's locale
        # encoding, which may not be able to decode non-ASCII text.
        with open(file_name, "r", encoding="utf-8") as f:
            data = json.load(f)
        print(f"Loaded #{len(data)} from {dataset}")
        total_data.extend(data)
    unified_data[set_name] = total_data



Loading train
Loaded #160800 from hh_rlhf
Loaded #92858 from summary_from_feedback
Loaded #19578 from webgpt
Loaded #33143 from synthetic-instruct-gptj-pairwise
Loaded #29481 from chatbot_arena_conv
Loaded #59917 from ultra_feedback_clean
Loaded #364908 from nectar
Loading val
Loaded #3276 from chatbot_arena_conv
Loaded #1000 from ultra_feedback_clean
Loaded #1000 from nectar


In [2]:
import random
# Shuffle the training pool with a fixed seed, then move the last 5000 shuffled
# examples over to the validation split.
# NOTE: this cell is not idempotent -- each re-run shuffles again and moves
# another 5000 examples from train to val. Run it once per fresh kernel.
random.seed(42)
train_pool = unified_data['train']
random.shuffle(train_pool)

held_out, remaining = train_pool[-5000:], train_pool[:-5000]
unified_data['val'].extend(held_out)
unified_data['train'] = remaining

In [3]:
# Save every split to ./<split>_data_unified.json in the working directory.
for set_name in unified_data:
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file is
    # opened as UTF-8 explicitly; the platform's default encoding may not be
    # able to encode them.
    with open(f"./{set_name}_data_unified.json", "w", encoding="utf-8") as f:
        json.dump(unified_data[set_name], f, indent=4, ensure_ascii=False)
        print(f"Saved #{len(unified_data[set_name])} at {set_name}_data_unified.json")

Saved #755685 at train_data_unified.json
Saved #10276 at val_data_unified.json


In [5]:
# release binary data
import json
import os
# Convert one unified split into flat chosen/rejected records and write them to
# <split>_unified_release_data.json.
set_name = "train"
file = f"./{set_name}_data_unified.json"

# Read as UTF-8 explicitly rather than with the platform's locale encoding.
with open(file, 'r', encoding="utf-8") as f:
    data = json.load(f)


def _to_release_item(item):
    """Flatten one unified example into a chosen/rejected release record.

    Only the first two entries of ``item["candidates"]`` are compared. The
    candidate whose ``scores["human_preference"]`` is strictly higher becomes
    the chosen one; on a tie the second candidate is chosen.

    Args:
        item: dict with "id", "instruction", "input" and "candidates" keys,
            where each compared candidate has "text", "model" and
            ``scores["human_preference"]``.

    Returns:
        dict with the id, the prompt (instruction and input joined by a
        newline) and the text / model / rating of the chosen and rejected
        candidates.

    Raises:
        KeyError: if a compared candidate has no "model" field, or any other
            field read here is missing.
        IndexError: if the item has fewer than two candidates.
    """
    candidates = item["candidates"]
    first, second = candidates[0], candidates[1]
    # Fail loudly on a malformed example. Stopping the loop quietly here would
    # leave a truncated release file on disk with no indication of the problem.
    for cand in (first, second):
        if "model" not in cand:
            raise KeyError(f"candidate without a 'model' field in item {item['id']!r}")

    if first["scores"]["human_preference"] > second["scores"]["human_preference"]:
        chosen, rejected = first, second
    else:
        # Also reached on equal ratings: the second candidate wins ties.
        chosen, rejected = second, first

    return {
        "id": item["id"],
        "prompt": item["instruction"] + "\n" + item["input"],
        "chosen_text": chosen["text"],
        "chosen_model": chosen["model"],
        "chosen_rating": chosen["scores"]["human_preference"],
        "rejected_text": rejected["text"],
        "rejected_model": rejected["model"],
        "rejected_rating": rejected["scores"]["human_preference"],
    }


release_data = []
for item in data:
    release_data.append(_to_release_item(item))

with open(f"{set_name}_unified_release_data.json", 'w', encoding="utf-8") as f:
    json.dump(release_data, f, indent=4)