In [7]:
from datasets import load_dataset

# Fetch the Chatbot Arena conversations from the Hugging Face Hub.
# For a gated/private dataset, authenticate first with `huggingface-cli login`.
dataset = load_dataset("lmsys/chatbot_arena_conversations")
# Materialise the train split as a plain Python list of records.
data = list(dataset["train"])


In [2]:
from collections import Counter
# Histogram of conversation lengths: how many records have each number of
# entries in conversation_a.
Counter(len(record['conversation_a']) for record in data)

Counter({2: 29126,
         4: 2557,
         6: 785,
         8: 289,
         10: 108,
         12: 56,
         14: 31,
         16: 18,
         18: 13,
         20: 8,
         26: 2,
         24: 2,
         22: 2,
         30: 1,
         50: 1,
         38: 1})

In [19]:
# Convert arena battles into ranking examples (one prompt, two candidates) and
# write a 90/10 train/validation split to JSON.
import json

# Longest conversation kept, counted in messages (user + assistant entries), so
# 8 messages = 4 turns. A bare `len(...)` truthiness test would keep every
# non-empty conversation; a cut-off of 8 reproduces the 32757 examples recorded
# in this cell's output (29126 + 2557 + 785 + 289 in the length histogram).
MAX_MESSAGES = 8
# Share of examples written to the training file; the remainder is validation.
TRAIN_FRACTION = 0.9
# Emitted verbatim into every example. The spelling "coversation" is left
# untouched on purpose. NOTE(review): consumers of these files may depend on
# the exact prompt text -- confirm before correcting it.
INSTRUCTION = "Finish the following coversation in each i-th turn by filling in <Response i> with your response."


def _render_prompt(conversation):
    """Return the user messages (even indices), each followed by a numbered
    ``<Response k>`` placeholder standing in for the assistant reply."""
    return "\n".join(
        "USER: " + conversation[pos]['content'] + f"\nAssistant: <Response {pos//2+1}>"
        for pos in range(0, len(conversation), 2)
    )


def _render_responses(conversation):
    """Return the replies at odd indices, each prefixed with ``<Response k>: ``."""
    return "\n".join(
        f"<Response {pos//2+1}>: " + conversation[pos]['content']
        for pos in range(1, len(conversation), 2)
    )


def _build_candidate(x, side):
    """Build the candidate entry for one side (``'a'`` or ``'b'``) of a battle.

    ``human_preference`` is 1.0 only when ``x['winner']`` equals this side's
    ``model_<side>`` label; any other winner value yields 0.0.
    """
    return {
        "text": _render_responses(x[f'conversation_{side}']),
        "model": x[f'model_{side}'],
        "decoding_method": "unknown",
        "scores": {
            "human_preference": 1.0 if x['winner'] == f'model_{side}' else 0.0,
        }
    }


# Filter into a separate list so `data` stays intact for other cells and this
# cell gives the same result every time it is re-run.
kept_conversations = [
    x for x in data if 0 < len(x['conversation_a']) <= MAX_MESSAGES
]

new_data = [
    {
        "id": f"chatbot_arena_conversations_{x['question_id']}",
        "instruction": INSTRUCTION,
        # The prompt is built from the user messages stored in conversation_a.
        "input": _render_prompt(x['conversation_a']),
        "candidates": [_build_candidate(x, 'a'), _build_candidate(x, 'b')],
    }
    for x in kept_conversations
]
print(len(new_data))

# Sequential (unshuffled) split: the first TRAIN_FRACTION of examples train,
# the rest validate.
split_at = int(len(new_data) * TRAIN_FRACTION)
train_data = new_data[:split_at]
val_data = new_data[split_at:]

# ensure_ascii=False writes non-ASCII characters as-is, so pin the file
# encoding to UTF-8 instead of relying on the platform default.
with open("train_data_prepared.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, indent=4, ensure_ascii=False)
with open("val_data_prepared.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, indent=4, ensure_ascii=False)

32757


## Some analysis

In [20]:
from transformers import AutoTokenizer
from tqdm import tqdm
# Measure the tokenized length of the prompt and of both candidate texts for
# every training example, using the pair-ranker tokenizer.
tokenizer = AutoTokenizer.from_pretrained("llm-blender/pair-ranker")


def _token_counts(texts):
    """Return the encoded length of each string in ``texts``, showing a progress bar."""
    return [len(tokenizer.encode(text)) for text in tqdm(texts)]


input_lens = _token_counts([example['input'] for example in train_data])
cand1_lens = _token_counts([example['candidates'][0]['text'] for example in train_data])
cand2_lens = _token_counts([example['candidates'][1]['text'] for example in train_data])


100%|██████████| 29481/29481 [00:05<00:00, 5369.38it/s]
100%|██████████| 29481/29481 [00:17<00:00, 1706.96it/s]
100%|██████████| 29481/29481 [00:17<00:00, 1724.72it/s]


In [21]:
import numpy as np
# Percentile of the token-length distributions to report, in the order
# (prompt, candidate 1, candidate 2).
percentage = 99
tuple(
    np.percentile(lengths, percentage)
    for lengths in (input_lens, cand1_lens, cand2_lens)
)

(533.4000000000015, 846.4000000000015, 845.4000000000015)

In [7]:
# For each entry index 0..50, report whether conversation_a and conversation_b
# hold an identical entry at that index in every record long enough to have one.
# Records shorter than the index count as matching.
for i in range(51):
    same_at_index = [
        len(record['conversation_a']) <= i
        or record['conversation_a'][i] == record['conversation_b'][i]
        for record in data
    ]
    print(f"Equal {i}", all(same_at_index))

Equal 0 True
Equal 1 False
Equal 2 True
Equal 3 False
Equal 4 True
Equal 5 False
Equal 6 True
Equal 7 False
Equal 8 True
Equal 9 False
Equal 10 True
Equal 11 False
Equal 12 True
Equal 13 False
Equal 14 True
Equal 15 False
Equal 16 True
Equal 17 False
Equal 18 True
Equal 19 False
Equal 20 True
Equal 21 False
Equal 22 True
Equal 23 False
Equal 24 True
Equal 25 False
Equal 26 True
Equal 27 False
Equal 28 True
Equal 29 False
Equal 30 True
Equal 31 False
Equal 32 True
Equal 33 False
Equal 34 True
Equal 35 False
Equal 36 True
Equal 37 False
Equal 38 True
Equal 39 False
Equal 40 True
Equal 41 False
Equal 42 True
Equal 43 False
Equal 44 True
Equal 45 False
Equal 46 True
Equal 47 False
Equal 48 True
Equal 49 False
Equal 50 True
