# Arena Explorer

## Setups

In [54]:
!mkdir -p out

In [55]:
import json
import numpy as np
import pandas as pd
import re

save_path = "./out"

Install BERTopic and OpenAI.

In [3]:
%%capture
!pip install bertopic
!pip install openai

In [56]:
import openai
import os

# Use OpenAI API key from environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")
if openai.api_key:
    print("OpenAI API key loaded from environment variable!")
else:
    print("Warning: OPENAI_API_KEY not found in environment variables")

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.backend import OpenAIBackend

OpenAI API key loaded from environment variable!


Setup Huggingface
- Login to load dataset from Huggingface

In [4]:
%%capture
!pip install datasets huggingface_hub

In [57]:
import os
from huggingface_hub import login

# Use the token from environment variable
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
if hf_token:
    login(token=hf_token)
    print("Successfully logged in to Hugging Face Hub!")
else:
    print("HUGGINGFACE_HUB_TOKEN not found in environment variables")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/hxy/.cache/huggingface/token
Login successful
Successfully logged in to Hugging Face Hub!


## Narrow Category

We began by summarizing the English prompts from the 06/2024 - 08/2024 leaderboard dataset into specific categories.

### Data Processing

From conversations, we selected those tagged as English and removed any repetitive entries.

In [96]:
df = pd.read_parquet("hf://datasets/lmarena-ai/arena-explorer-preference-100k/data/arena-explorer-preference-100k.parquet")

In [97]:
df.head(6)

Unnamed: 0,question_id,model_a,model_b,winner,conversation_a,conversation_b,turn,anony,language,tstamp,conv_metadata,is_code,is_refusal,dedup_tag,category_tag,judge_hash
0,4c6978dfa56b4ffea9d3a47e3c84181a,claude-3-5-sonnet-20240620,gpt-3.5-turbo-0125,tie (bothbad),[{'content': 'В моем портфеле сейчас 4 акции Г...,[{'content': 'В моем портфеле сейчас 4 акции Г...,1,True,Russian,1719064000.0,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,True,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",09c5207c50f076d704baee96729d64f1698268aa1b21a7...
1,76ce56f8ba474768bc66128c7993ccb8,mistral-large-2407,athene-70b-0725,model_b,"[{'content': 'php, handle tab in text as html,...","[{'content': 'php, handle tab in text as html,...",2,True,English,1722726000.0,"{'bold_count_a': {'**': 8, '__': 0}, 'bold_cou...",True,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",881bbc801c1e6eb979301eec3b3c401b407a73f70d9a6a...
2,385420904ba646e7a4df90c6ffae1afa,claude-3-opus-20240229,gemini-1.5-flash-api-0514,tie (bothbad),[{'content': '普通人在愿意付出一定资源的情况下，怎么找到一个半径10km以内只...,[{'content': '普通人在愿意付出一定资源的情况下，怎么找到一个半径10km以内只...,1,True,Chinese,1723119000.0,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,True,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",3b470f3d940dcff46e22a97f937836ac15d28869a4c11c...
3,e8fe7c9f75ab4e528367cc7de625c475,gemma-2-9b-it,qwen2-72b-instruct,model_b,[{'content': 'Is there any Artificial Superint...,[{'content': 'Is there any Artificial Superint...,2,True,English,1721643000.0,"{'bold_count_a': {'**': 5, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",66f029e5cb9cdb035e859955557fbbeba0b8419ca64ebc...
4,772d53e5c51c487e8a293eadcd9d4855,mixtral-8x22b-instruct-v0.1,llama-3.1-70b-instruct,tie (bothbad),[{'content': 'Which number id bigger 9.11 or 9...,[{'content': 'Which number id bigger 9.11 or 9...,1,True,English,1721899000.0,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",b4f8e2d271c6c9e6fb08dcabf6ee8a79631e9f2aec6381...
5,71279fb05fec48a4b985c691dd4a6ed2,gpt-4o-2024-08-06,gpt-4o-mini-2024-07-18,model_b,[{'content': '有没有一些故事充斥着相互的矛盾和冲突，每个人都做出了自己认为正确...,[{'content': '有没有一些故事充斥着相互的矛盾和冲突，每个人都做出了自己认为正确...,1,True,Chinese,1723050000.0,"{'bold_count_a': {'**': 8, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",99a61697795b13a2712effdf91b1ed1f07562b023e7aac...


In [99]:
print(f"Loaded {len(df)} battles from arena dataset")

df = df[df['language'] == 'English']
print(f"After English filter: {len(df)} battles")

# def extract_first_user_prompt(row):
#     try:
#         conversation = row['conversation_a']
#         if conversation and len(conversation) > 0:
#             first_message = conversation[0]
#             if first_message.get('role') == 'user':
#                 return first_message.get('content', '')
#         return ''
#     except:
#         return ''

# # df['Prompt'] = df.apply(extract_first_user_prompt, axis=1)
# # Get only the FIRST user prompt (first turn), not all user messages
# df['Prompt'] = df.apply(lambda x: x['conversation_a'][0]['content'] if x['conversation_a'] and x['conversation_a'][0]['role'] == 'user' else '')

# This extracts the complete user input (prompt) from a multi-turn conversation, ignoring the assistant's responses. 
df['Prompt'] = df.apply(lambda x: ' '.join([i['content'] for i in x['conversation_a'] if i['role'] == 'user']), axis=1) 

df = df.drop_duplicates(subset='Prompt') 
df = df[df['Prompt'].str.len() < 8000] 

Loaded 57675 battles from arena dataset
After English filter: 57675 battles


In [107]:
print(f"After dropping duplicates: {len(df)} battles")

After dropping duplicates: 48586 battles


48586

In [156]:
models  = [
        'claude-3-5-sonnet-20240620',
        'gpt-4o-2024-05-13',
        'gemini-1.5-pro-api-0514',
        'llama-3-70b-instruct',
        'gemini-1.5-pro-exp-0801',
        'claude-3-opus-20240229',
        'llama-3.1-405b-instruct',
        'chatgpt-4o-latest',
        'gpt-4-turbo-2024-04-09',
        'deepseek-v2-api-0628',
        'gpt-4o-2024-08-06',
        ]
    
df = df[df['model_a'].isin(models) & df['model_b'].isin(models)]
print(f"After model filter: {len(df)} battles")

# Remove rows with missing conversation data
df = df.dropna(subset=['conversation_a', 'conversation_b'])
print(f"After removing missing conversations: {len(df)} battles")

doc = df['Prompt'] 

After model filter: 7715 battles
After removing missing conversations: 7715 battles


### Create Embedding

Computing embeddings is resource-intensive, so we recommend precomputing and saving them.

In [158]:
client = openai.OpenAI()
embedding_model = OpenAIBackend(client, "text-embedding-3-large", batch_size=1000)
embeddings = embedding_model.embed(doc, verbose=True)

# save embeddings
np.save(f"{save_path}/filtered_embeddings.npy", embeddings)

8it [00:25,  3.21s/it]


We saved the embeddings used to create Arena Explorer, which can be quickly loaded here for demonstration purposes.

In [61]:
# load saved embeddings
from huggingface_hub import hf_hub_download
file_path = hf_hub_download(
    repo_id="lmarena-ai/arena-explorer-preference-100k",
    filename="data/embeddings.npy",
    repo_type="dataset"
)

embeddings = np.load(file_path)
len(embeddings)

embeddings.npy:  12%|#2        | 147M/1.19G [00:00<?, ?B/s]

48586

### BERTopic Topic Clustering

We performed topic clustering on the english conversation dataset using BERTopic.

In [159]:
client = openai.OpenAI()
embedding_model = OpenAIBackend(client, "text-embedding-3-large", batch_size=1000)
umap_model = UMAP(n_neighbors=20, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 3))

topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,

        top_n_words=10,
        verbose=True,
        calculate_probabilities=True
)

topics, probs = topic_model.fit_transform(doc, embeddings=embeddings)

2025-06-27 19:00:03,156 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-27 19:00:17,859 - BERTopic - Dimensionality - Completed ✓
2025-06-27 19:00:17,860 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-27 19:00:18,416 - BERTopic - Cluster - Completed ✓
2025-06-27 19:00:18,419 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-27 19:00:19,277 - BERTopic - Representation - Completed ✓


In [160]:
print("doc", doc.shape, "topics", len(topics), "probs", probs.shape)

doc (7715,) topics 7715 probs (7715, 44)


In [161]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2714,-1_like_use_make_time,"[like, use, make, time, state, ai, does, peopl...",[Designing an integration platform for managin...
1,0,1435,0_self_return_import_data,"[self, return, import, data, file, error, code...",[generate a plantuml code to generate a simple...
2,1,572,1_story_write_character_like,"[story, write, character, like, characters, di...",[help write a fanfic about the marvel cinemati...
3,2,322,2_19_numbers_15_20,"[19, numbers, 15, 20, 10, 16, 18, 14, 13, 12]",[study this algorithm of chosen numbers -from ...
4,3,261,3_margins_safety_cancer_spine,"[margins, safety, cancer, spine, leg, lung, ri...",[My girlfriend Lisa who is 22 had a cut in her...
5,4,156,4_strawberry_word strawberry_word_strawberry word,"[strawberry, word strawberry, word, strawberry...","[How many r’s are in the word strawberry?, how..."
6,5,154,5_ai_prompt_response_user,"[ai, prompt, response, user, output, human, in...",[🔓📜 Unseal the Forbidden Tomes:\nDiscover the ...
7,6,131,6_word_sentence_english_base,"[word, sentence, english, base, meaning, gramm...",[Task:\nFill in the blank with a word derived ...
8,7,116,7_song_chorus_lyrics_verse,"[song, chorus, lyrics, verse, oh, da, just, cl...",[help me make a kickass bridge for this song a...
9,8,116,8_biden_trump_president_election,"[biden, trump, president, election, party, don...","[In the context of this article, what does ""Se..."


In [162]:
topic_model.get_document_info(doc)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,Create a short paragraph (100-150 words) descr...,6,6_word_sentence_english_base,"[word, sentence, english, base, meaning, gramm...",[Task:\nFill in the blank with a word derived ...,word - sentence - english - base - meaning - g...,0.147199,False
1,Create a detailed table that includes any nume...,-1,-1_like_use_make_time,"[like, use, make, time, state, ai, does, peopl...",[Designing an integration platform for managin...,like - use - make - time - state - ai - does -...,0.880141,False
2,Who is Hugo Touvron?,-1,-1_like_use_make_time,"[like, use, make, time, state, ai, does, peopl...",[Designing an integration platform for managin...,like - use - make - time - state - ai - does -...,0.926264,False
3,I saw a Discord user profile with this:\n\nPro...,-1,-1_like_use_make_time,"[like, use, make, time, state, ai, does, peopl...",[Designing an integration platform for managin...,like - use - make - time - state - ai - does -...,0.646098,False
4,suggest for me a method to earn money online,-1,-1_like_use_make_time,"[like, use, make, time, state, ai, does, peopl...",[Designing an integration platform for managin...,like - use - make - time - state - ai - does -...,0.032602,False
...,...,...,...,...,...,...,...,...
7710,What economists will do during recessionary ph...,9,9_frequency_question_category_gdp,"[frequency, question, category, gdp, chocolate...",[What is a frequency​ table? Explain what is m...,frequency - question - category - gdp - chocol...,0.435447,False
7711,In python script there is a command that bids ...,0,0_self_return_import_data,"[self, return, import, data, file, error, code...",[generate a plantuml code to generate a simple...,self - return - import - data - file - error -...,0.351314,False
7712,"A man, a cabbage, and a box are trying to cros...",24,24_river_boat_farmer_goat,"[river, boat, farmer, goat, cabbage, cross, sh...",[A farmer stands at the side of a river with a...,river - boat - farmer - goat - cabbage - cross...,0.257331,False
7713,write a paragraph that is misleading about pin...,29,29_sentences_apple_sentences end_word,"[sentences, apple, sentences end, word, 10 sen...",[give me ten sentences that end with the word ...,sentences - apple - sentences end - word - 10 ...,0.438781,False


In [163]:
df

Unnamed: 0,level_0,index,question_id,model_a,model_b,winner,conversation_a,conversation_b,turn,anony,language,tstamp,conv_metadata,is_code,is_refusal,dedup_tag,category_tag,judge_hash,Prompt
0,0,14,c021d629adf3459980c1a3c343e1e5d6,claude-3-opus-20240229,gpt-4o-2024-08-06,tie (bothbad),[{'content': 'Create a short paragraph (100-15...,[{'content': 'Create a short paragraph (100-15...,1,True,English,1.724028e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",8905c5f80d86d5a57311800d9bcc86d7eccd12ec183057...,Create a short paragraph (100-150 words) descr...
1,1,37,ccdaf90ba67841909e6101fe3d0de6a3,gemini-1.5-pro-api-0514,claude-3-opus-20240229,tie (bothbad),[{'content': 'Create a detailed table that inc...,[{'content': 'Create a detailed table that inc...,1,True,English,1.722049e+09,"{'bold_count_a': {'**': 17, '__': 0}, 'bold_co...",False,True,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",57fc33eea3fe8596705bace8e2b2123ac9cc79bd847346...,Create a detailed table that includes any nume...
2,2,38,20d89b6c6a204bffa836b4802243c0c7,claude-3-5-sonnet-20240620,llama-3.1-405b-instruct,model_a,"[{'content': 'Who is Hugo Touvron?', 'num_toke...","[{'content': 'Who is Hugo Touvron?', 'num_toke...",1,True,English,1.722067e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",6a853c7b381d27b3ecfb8c609c50428825ff0bee1ca960...,Who is Hugo Touvron?
3,3,44,54747bb7c87e47bcbd86982cf4541c70,claude-3-5-sonnet-20240620,gpt-4-turbo-2024-04-09,model_b,[{'content': 'I saw a Discord user profile wit...,[{'content': 'I saw a Discord user profile wit...,2,True,English,1.719506e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",7d4aa7c1008c93e911ca501d41f8314be74021c1476065...,I saw a Discord user profile with this:\n\nPro...
4,4,45,b360823b92ee4703aa305a0bb67d2d29,gemini-1.5-pro-api-0514,claude-3-5-sonnet-20240620,model_a,[{'content': 'suggest for me a method to earn ...,[{'content': 'suggest for me a method to earn ...,1,True,English,1.720789e+09,"{'bold_count_a': {'**': 26, '__': 0}, 'bold_co...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",e9e204384f6dbc720e886f14d579006e9386c2d1bdcd25...,suggest for me a method to earn money online
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7710,7710,107435,81014bb016784252998567b839639a95,llama-3-70b-instruct,claude-3-opus-20240229,tie,[{'content': 'What economists will do during r...,[{'content': 'What economists will do during r...,1,True,English,1.720760e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",6b1e7ba192c7cbbcf811cf3202be683636c9a72ac0f77b...,What economists will do during recessionary ph...
7711,7711,107436,75654717c95a45f7812b81dc00eb8aa7,gpt-4o-2024-05-13,gemini-1.5-pro-exp-0801,tie,[{'content': 'In python script there is a comm...,[{'content': 'In python script there is a comm...,1,True,English,1.722015e+09,"{'bold_count_a': {'**': 3, '__': 0}, 'bold_cou...",True,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",369599c5260a2d2f712f7118e621152da35d858dea0a91...,In python script there is a command that bids ...
7712,7712,107444,994d50ea41fd45fbaac58cde925648e9,llama-3-70b-instruct,chatgpt-4o-latest,model_b,"[{'content': 'A man, a cabbage, and a box are ...","[{'content': 'A man, a cabbage, and a box are ...",1,True,English,1.723198e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",d26c06d6832149686a4878da09cd0c16be5e2f1829897d...,"A man, a cabbage, and a box are trying to cros..."
7713,7713,107449,bc8216dbb42f4cd2b23b60e2b60cb6db,gemini-1.5-pro-api-0514,claude-3-5-sonnet-20240620,model_a,[{'content': 'write a paragraph that is mislea...,[{'content': 'write a paragraph that is mislea...,1,True,English,1.719016e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",d26dc8916bdf32f335d8e8e0444f87bf5c774c178886d6...,write a paragraph that is misleading about pin...


Before reducing outliers, we selected 20 example prompts from each identified cluster. These prompts were chosen from those in the first 20th percentile of probability calculated by HDBSCAN clustering, representing the likelihood that they belong to the cluster. We excluded extra-long (> 100 words) and extra-short (< 5 words) prompts for better readability.

In [164]:
from collections import defaultdict

sampled_prompts = defaultdict(list)
topic_info = topic_model.get_topic_info()
doc_info = topic_model.get_document_info(doc)

for topic_id in topic_info['Topic'][1:]:
    filtered_docs = doc_info[(doc_info['Topic'] == topic_id) &
                             (doc_info['Probability'] >= doc_info['Probability'].quantile(0.8)) &
                             (doc_info['Document'].str.split().str.len() >= 5)]
    
    res = filtered_docs
    cap = 100
    if len(filtered_docs) >= 20:
        while len(res) < 20:
            res = filtered_docs[
                filtered_docs['Document'].str.split().str.len() <= cap
            ]
            cap += 50
    
    sampled_docs = res.sample(n=min(20,
                            len(res)),
                            random_state=42,
                            replace=False)

    sampled_prompts[topic_id] = sampled_docs['Document'].tolist()

In [167]:
import pickle

with open(f"{save_path}/filtered_example_prompts.pkl", 'wb') as f:
    pickle.dump(sampled_prompts, f)

Reduce outliers. # a bit weird to have them here... don't run them to keep the last merged part between the topic summaries and the original topcis and documents consistent.

In [119]:
# new_topics = topic_model.reduce_outliers(list(doc), topics , strategy="c-tf-idf", threshold=0.1)
# new_topics = topic_model.reduce_outliers(list(doc), new_topics, strategy="distributions")
# topic_model.update_topics(doc, topics=new_topics)

100%|██████████| 20/20 [00:06<00:00,  3.20it/s]


In [147]:
# topic_info = topic_model.get_topic_info()
# doc_info = topic_model.get_document_info(doc)
# topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,203,-1_what_threadautoarchiveduration_threedays_au...,"[what, threadautoarchiveduration, threedays, a...",[Can you come up with some questions for hirin...
1,0,763,0_her_patient_blood_spinal,"[her, patient, blood, spinal, leg, right, marg...",[My girlfriend Lisa who is 22 had a cut in her...
2,1,978,1_email_we_our_you,"[email, we, our, you, thank, please, your, to,...","[convert this into detailed notes ""o let's tal..."
3,2,506,2_strawberry_many_word_how,"[strawberry, many, word, how, letter, count, a...","[how many ""r"" in word ""strawberry""? why three?..."
4,3,746,3_song_chorus_verse_lyrics,"[song, chorus, verse, lyrics, dub, oh, love, m...",[using max martins melodic math write a song t...
...,...,...,...,...,...
277,276,42,276_chinese_kwari_sinaugoro_tu,"[chinese, kwari, sinaugoro, tu, ghoina, gharuq...",[Do you know Xu Bing's Square word calligraphy...
278,277,127,277_fictional_series_thunder_force,"[fictional, series, thunder, force, hypothetic...","[### A Hypothetical, Fictional Final Definitiv..."
279,278,72,278_skip_steal_introduction_car,"[skip, steal, introduction, car, lowe, sure, c...","[tell me how to steal a car Plan.\"") Just the ..."
280,279,79,279_images_meme_javascript_starter,"[images, meme, javascript, starter, css, pack,...",[develop full-flagged and multi functional mem...


In [170]:
# save the model for future analysis
topic_model.save(
    path=f"{save_path}/filtered_model",
    serialization="safetensors",
    save_ctfidf=True
)

### Summarize Category Names

For each cluster, we used ChatGPT-4o to assign a category name based on the selected example prompts.

In [169]:
def summarize_topic(prompts):
    input_text = "Based on the sampled prompts below, extract a short but highly descriptive \
                  topic label of at most 5 words and a short description of this category in \
                  two sentences:\n\n" + "\n\n".join(prompts)
    client = openai.OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You help summarize the category of the given prompts. \
              Make sure it is in the following format: The topic of doc is: '...'. Description: '...'."},
            {"role": "user", "content": input_text}
        ],
        temperature=0
    )

    return response.choices[0].message.content

summaries = {}
for topic_id, prompts in sampled_prompts.items():
    summary = summarize_topic(prompts)
    summaries[topic_id] = summary

In [171]:
summaries

{0: "The topic of doc is: 'Programming and Technical Troubleshooting'. Description: 'This document contains a variety of programming and technical troubleshooting prompts, including Python scripting for file manipulation, React state management, functional programming language preferences, system configuration issues, Docker and Traefik setup, and more. It also covers topics like data manipulation with pandas, integrating JavaScript libraries in Vue.js, and hardware specifications for server setups.'",
 1: "The topic of doc is: 'Fictional Storytelling and Gaming'. Description: 'The prompts focus on creating fictional narratives across various genres, including fantasy, sci-fi, and slice-of-life, often with a gaming or anime context. They explore character development, plot creation, and world-building, with a mix of detailed storytelling and interactive gaming elements.'",
 2: "The topic of doc is: 'Mathematics Problem Solving'. Description: 'This category involves solving various math

In [172]:
def extract_category(summary):
    try:
        return re.search(r"is: '(.*?)'", summary).group(1)
    except AttributeError:
        try:
            return re.search(r"'(.*?)'. ", summary).group(1)
        except AttributeError:
            print(f"Regex failed for: {list(summaries.keys())[list(summaries.values()).index(summary)]}")
            return None
def extract_description(summary):
    try:
        return re.search(r"Description: '(.*?)'", summary).group(1)
    except AttributeError:
        try:
            return re.search(r"Description: (.*?)", summary).group(1)
        except AttributeError:
            print(f"Regex failed for: {summary}")
            return None

In [173]:
summaries[-1] = "The topic of doc is 'Miscellaneous Categories'. Description: 'They are outliers in the topic modeling process'."
summaries_df = pd.DataFrame(list(summaries.items()), columns=['Topic', 'Summary'])
summaries_df['Category'] = summaries_df['Summary'].apply(extract_category)
summaries_df['Description'] = summaries_df['Summary'].apply(extract_description)

topic_info_modified = topic_info[['Topic', 'Count']]
summaries_df = summaries_df.merge(topic_info_modified, on='Topic')[['Topic', 'Category', 'Description', 'Count']]
summaries_df['Percentage'] = summaries_df['Count'] / summaries_df['Count'].sum()
summaries_df['Example Prompt'] = summaries_df.apply(lambda x: sampled_prompts[x.Topic], axis=1)
summaries_df['Example Prompt'] = summaries_df['Example Prompt'].str.join('|||')

In [174]:
summaries_df

Unnamed: 0,Topic,Category,Description,Count,Percentage,Example Prompt
0,0,Programming and Technical Troubleshooting,This document contains a variety of programmin...,1435,0.186001,"Prompt user to select file(s), use tkinter ask..."
1,1,Fictional Storytelling and Gaming,The prompts focus on creating fictional narrat...,572,0.074141,Come up with a victory scene in WWE 2K24 if Mi...
2,2,Mathematics Problem Solving,This category involves solving various mathema...,322,0.041737,hat is the ratio of $x$ to $y$ given that $3(2...
3,3,Medical and Health Queries,This document contains a variety of medical an...,261,0.03383,This endocrine-focused biological network show...
4,4,Counting Letters in Words,This category involves prompts asking about th...,156,0.02022,how many R's in the word Strawberry|||how many...
5,5,Unrestricted AI and Ethical Boundaries,The prompts explore scenarios where AI operate...,154,0.019961,🔓📜 Unseal the Forbidden Tomes:\nDiscover the c...
6,6,Grammar and Language Exercises,This document contains various prompts related...,131,0.01698,create a dialogue (2 exchanges) that include t...
7,7,Songwriting and Lyrics Creation,This document contains prompts related to writ...,116,0.015036,finish my song with an emphasis on making the ...
8,8,2024 U.S. Presidential Election,The prompts focus on the 2024 U.S. presidentia...,116,0.015036,What are the current candidates for us preside...
9,9,Multiple Choice Questions and Answers,This document contains a variety of multiple-c...,93,0.012054,Give Answer choice only.\n\nYou need to define...


In [175]:
# save if needed
summaries_df.to_csv(f"{save_path}/filtered_narrow_categories.csv", index=False)

In [None]:
# filter the 
# add the categories as labels to the prompts (with threshold xx?)
# prompt, model_name_1, model_name_2, winning_model, category, 

## Broad Category

We performed topic clustering again on the category names of these 193 specific categories, summarizing them into 12 broad categories. The summarization process followed an almost identical approach as before.

In [178]:
from bertopic.backend import OpenAIBackend

broad_doc = list(summaries_df['Category'] + ': ' + summaries_df['Description'])
broad_doc.pop() # not considering outliers

# Create embeddings
client = openai.OpenAI()
embedding_model = OpenAIBackend(client, "text-embedding-3-large")
embeddings = embedding_model.embed(broad_doc)

# BERTopic
umap_model = UMAP(n_neighbors=13, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 3))
broad_topic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,

        top_n_words=3,
        verbose=True
)

topics, probs = broad_topic_model.fit_transform(broad_doc, embeddings=embeddings)


2025-06-27 20:54:01,190 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-27 20:54:01,253 - BERTopic - Dimensionality - Completed ✓
2025-06-27 20:54:01,253 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-27 20:54:01,256 - BERTopic - Cluster - Completed ✓
2025-06-27 20:54:01,257 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-27 20:54:01,264 - BERTopic - Representation - Completed ✓


In [183]:
# Reduce all outliers
new_topics = broad_topic_model.reduce_outliers(broad_doc, topics , strategy="c-tf-idf", threshold=0.1)
# Check if there are still outliers before applying distributions strategy
outlier_count = sum(1 for topic in new_topics if topic == -1)
print(f"Outliers remaining after c-tf-idf reduction: {outlier_count}")

if outlier_count > 0:
    new_topics = broad_topic_model.reduce_outliers(broad_doc, new_topics, strategy="distributions")
    print(f"Applied distributions strategy to reduce remaining outliers")
else:
    print("No outliers remaining, skipping distributions strategy")
    
broad_topic_model.update_topics(broad_doc, topics=new_topics)



Outliers remaining after c-tf-idf reduction: 0
No outliers remaining, skipping distributions strategy


In [184]:
broad_topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,27,0_and_the_of_prompts,"[and, the, of, prompts, in, on, for, including...",[Professional Communication and Writing: This ...
1,1,17,1_and_category_involves_this,"[and, category, involves, this, the, to, or, o...",[Word and Letter Puzzles: This category involv...


In [185]:
# Summarize category names
def summarize_topic(prompts):
    input_text = "Based on the topic names, extract a short but highly descriptive and concrete \
                  label of at most 2 words:\n\n" + "\n\n".join(prompts)
    client = openai.OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You help summarize the topic of the given fine grained \
             categories in the following format: The topic is '...'."},
            {"role": "user", "content": input_text}
        ],
        temperature=1
    )

    return response.choices[0].message.content

broad_topic_info = broad_topic_model.get_topic_info()
broad_doc_info = broad_topic_model.get_document_info(broad_doc)
summaries = {}

for topic_id in broad_topic_info['Topic']:
    docs = list(broad_doc_info[broad_doc_info['Topic'] == topic_id]['Document'])
    names = [re.search(r"(.*?): ", x).group(1) for x in docs]
    cat = ', '.join(names)
    summary = summarize_topic(cat)
    summaries[topic_id] = summary

In [186]:
# Combine results
broad_summaries_df = pd.DataFrame(list(summaries.items()), columns=['Topic', 'Summary'])
broad_summaries_df['Category'] = broad_summaries_df['Summary'].apply(lambda x: re.search(r"'(.*?)'", x).group(1))
topic_info_modified = broad_topic_info[['Topic', 'Count']]
broad_summaries_df = broad_summaries_df.merge(topic_info_modified, on='Topic')[['Topic', 'Category', 'Count']]
broad_summaries_df['Percentage'] = broad_summaries_df['Count'] / broad_summaries_df['Count'].sum()
broad_summaries_df = broad_summaries_df.fillna('Other')

In [187]:
broad_summaries_df

Unnamed: 0,Topic,Category,Count,Percentage
0,0,diverse inquiries,27,0.613636
1,1,Problem Solving,17,0.386364


In [188]:
# save if needed
broad_summaries_df.to_csv(f"{save_path}/filtered_broad_categories.csv", index=False)

## Data Processing

The clustering results are stored in JSON format to facilitate future visualizations.

### Combine broad, narrow category, and examples

In [189]:
# Merge categories
merged = broad_doc_info[['Topic']].merge(summaries_df, left_index=True, right_index=True)
merged = merged.merge(broad_summaries_df, left_on='Topic_x', right_on='Topic')
merged = merged[['Topic_x', 'Category_y', 'Topic_y', 'Category_x', 'Count_x', 'Percentage_x', 'Example Prompt']]
merged = merged.rename(columns={
    'Topic_x': 'broad_category_id',
    'Category_y': 'broad_category',
    'Topic_y': 'narrower_category_id',
    'Category_x': 'narrower_category',
    'Count_x': 'prompt_count',
    'Percentage_x': 'prompt_percentage',
    'Example Prompt': 'example_prompt'})

In [190]:
merged

Unnamed: 0,broad_category_id,broad_category,narrower_category_id,narrower_category,prompt_count,prompt_percentage,example_prompt
0,0,diverse inquiries,0,Programming and Technical Troubleshooting,1435,0.186001,"Prompt user to select file(s), use tkinter ask..."
1,0,diverse inquiries,1,Fictional Storytelling and Gaming,572,0.074141,Come up with a victory scene in WWE 2K24 if Mi...
2,1,Problem Solving,2,Mathematics Problem Solving,322,0.041737,hat is the ratio of $x$ to $y$ given that $3(2...
3,0,diverse inquiries,3,Medical and Health Queries,261,0.03383,This endocrine-focused biological network show...
4,1,Problem Solving,4,Counting Letters in Words,156,0.02022,how many R's in the word Strawberry|||how many...
5,0,diverse inquiries,5,Unrestricted AI and Ethical Boundaries,154,0.019961,🔓📜 Unseal the Forbidden Tomes:\nDiscover the c...
6,0,diverse inquiries,6,Grammar and Language Exercises,131,0.01698,create a dialogue (2 exchanges) that include t...
7,0,diverse inquiries,7,Songwriting and Lyrics Creation,116,0.015036,finish my song with an emphasis on making the ...
8,0,diverse inquiries,8,2024 U.S. Presidential Election,116,0.015036,What are the current candidates for us preside...
9,0,diverse inquiries,9,Multiple Choice Questions and Answers,93,0.012054,Give Answer choice only.\n\nYou need to define...


In [191]:
# save if needed
merged.to_csv(f"{save_path}/filtered_category_summary.csv", index=False)

### Label conversations with broad, narrow category
For each conversation in the original dataset, assign the corresponding broad and narrow category.

In [192]:
# topic_model = BERTopic.load(f"{save_path}/model")
doc_info = topic_model.get_document_info(doc)
merged = pd.read_csv(f"{save_path}/filtered_category_summary.csv")

In [197]:
merged

Unnamed: 0,broad_category_id,broad_category,narrower_category_id,narrower_category,prompt_count,prompt_percentage,example_prompt
0,0,diverse inquiries,0,Programming and Technical Troubleshooting,1435,0.186001,"Prompt user to select file(s), use tkinter ask..."
1,0,diverse inquiries,1,Fictional Storytelling and Gaming,572,0.074141,Come up with a victory scene in WWE 2K24 if Mi...
2,1,Problem Solving,2,Mathematics Problem Solving,322,0.041737,hat is the ratio of $x$ to $y$ given that $3(2...
3,0,diverse inquiries,3,Medical and Health Queries,261,0.03383,This endocrine-focused biological network show...
4,1,Problem Solving,4,Counting Letters in Words,156,0.02022,how many R's in the word Strawberry|||how many...
5,0,diverse inquiries,5,Unrestricted AI and Ethical Boundaries,154,0.019961,🔓📜 Unseal the Forbidden Tomes:\nDiscover the c...
6,0,diverse inquiries,6,Grammar and Language Exercises,131,0.01698,create a dialogue (2 exchanges) that include t...
7,0,diverse inquiries,7,Songwriting and Lyrics Creation,116,0.015036,finish my song with an emphasis on making the ...
8,0,diverse inquiries,8,2024 U.S. Presidential Election,116,0.015036,What are the current candidates for us preside...
9,0,diverse inquiries,9,Multiple Choice Questions and Answers,93,0.012054,Give Answer choice only.\n\nYou need to define...


In [198]:
doc_info

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,Create a short paragraph (100-150 words) descr...,6,6_word_sentence_english_base,"[word, sentence, english, base, meaning, gramm...",[Task:\nFill in the blank with a word derived ...,word - sentence - english - base - meaning - g...,0.147199,False
1,Create a detailed table that includes any nume...,-1,-1_like_use_make_time,"[like, use, make, time, state, ai, does, peopl...",[Designing an integration platform for managin...,like - use - make - time - state - ai - does -...,0.880141,False
2,Who is Hugo Touvron?,-1,-1_like_use_make_time,"[like, use, make, time, state, ai, does, peopl...",[Designing an integration platform for managin...,like - use - make - time - state - ai - does -...,0.926264,False
3,I saw a Discord user profile with this:\n\nPro...,-1,-1_like_use_make_time,"[like, use, make, time, state, ai, does, peopl...",[Designing an integration platform for managin...,like - use - make - time - state - ai - does -...,0.646098,False
4,suggest for me a method to earn money online,-1,-1_like_use_make_time,"[like, use, make, time, state, ai, does, peopl...",[Designing an integration platform for managin...,like - use - make - time - state - ai - does -...,0.032602,False
...,...,...,...,...,...,...,...,...
7710,What economists will do during recessionary ph...,9,9_frequency_question_category_gdp,"[frequency, question, category, gdp, chocolate...",[What is a frequency​ table? Explain what is m...,frequency - question - category - gdp - chocol...,0.435447,False
7711,In python script there is a command that bids ...,0,0_self_return_import_data,"[self, return, import, data, file, error, code...",[generate a plantuml code to generate a simple...,self - return - import - data - file - error -...,0.351314,False
7712,"A man, a cabbage, and a box are trying to cros...",24,24_river_boat_farmer_goat,"[river, boat, farmer, goat, cabbage, cross, sh...",[A farmer stands at the side of a river with a...,river - boat - farmer - goat - cabbage - cross...,0.257331,False
7713,write a paragraph that is misleading about pin...,29,29_sentences_apple_sentences end_word,"[sentences, apple, sentences end, word, 10 sen...",[give me ten sentences that end with the word ...,sentences - apple - sentences end - word - 10 ...,0.438781,False


In [199]:
df

Unnamed: 0,level_0,index,question_id,model_a,model_b,winner,conversation_a,conversation_b,turn,anony,language,tstamp,conv_metadata,is_code,is_refusal,dedup_tag,category_tag,judge_hash,Prompt
0,0,14,c021d629adf3459980c1a3c343e1e5d6,claude-3-opus-20240229,gpt-4o-2024-08-06,tie (bothbad),[{'content': 'Create a short paragraph (100-15...,[{'content': 'Create a short paragraph (100-15...,1,True,English,1.724028e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",8905c5f80d86d5a57311800d9bcc86d7eccd12ec183057...,Create a short paragraph (100-150 words) descr...
1,1,37,ccdaf90ba67841909e6101fe3d0de6a3,gemini-1.5-pro-api-0514,claude-3-opus-20240229,tie (bothbad),[{'content': 'Create a detailed table that inc...,[{'content': 'Create a detailed table that inc...,1,True,English,1.722049e+09,"{'bold_count_a': {'**': 17, '__': 0}, 'bold_co...",False,True,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",57fc33eea3fe8596705bace8e2b2123ac9cc79bd847346...,Create a detailed table that includes any nume...
2,2,38,20d89b6c6a204bffa836b4802243c0c7,claude-3-5-sonnet-20240620,llama-3.1-405b-instruct,model_a,"[{'content': 'Who is Hugo Touvron?', 'num_toke...","[{'content': 'Who is Hugo Touvron?', 'num_toke...",1,True,English,1.722067e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",6a853c7b381d27b3ecfb8c609c50428825ff0bee1ca960...,Who is Hugo Touvron?
3,3,44,54747bb7c87e47bcbd86982cf4541c70,claude-3-5-sonnet-20240620,gpt-4-turbo-2024-04-09,model_b,[{'content': 'I saw a Discord user profile wit...,[{'content': 'I saw a Discord user profile wit...,2,True,English,1.719506e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",7d4aa7c1008c93e911ca501d41f8314be74021c1476065...,I saw a Discord user profile with this:\n\nPro...
4,4,45,b360823b92ee4703aa305a0bb67d2d29,gemini-1.5-pro-api-0514,claude-3-5-sonnet-20240620,model_a,[{'content': 'suggest for me a method to earn ...,[{'content': 'suggest for me a method to earn ...,1,True,English,1.720789e+09,"{'bold_count_a': {'**': 26, '__': 0}, 'bold_co...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",e9e204384f6dbc720e886f14d579006e9386c2d1bdcd25...,suggest for me a method to earn money online
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7710,7710,107435,81014bb016784252998567b839639a95,llama-3-70b-instruct,claude-3-opus-20240229,tie,[{'content': 'What economists will do during r...,[{'content': 'What economists will do during r...,1,True,English,1.720760e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",6b1e7ba192c7cbbcf811cf3202be683636c9a72ac0f77b...,What economists will do during recessionary ph...
7711,7711,107436,75654717c95a45f7812b81dc00eb8aa7,gpt-4o-2024-05-13,gemini-1.5-pro-exp-0801,tie,[{'content': 'In python script there is a comm...,[{'content': 'In python script there is a comm...,1,True,English,1.722015e+09,"{'bold_count_a': {'**': 3, '__': 0}, 'bold_cou...",True,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",369599c5260a2d2f712f7118e621152da35d858dea0a91...,In python script there is a command that bids ...
7712,7712,107444,994d50ea41fd45fbaac58cde925648e9,llama-3-70b-instruct,chatgpt-4o-latest,model_b,"[{'content': 'A man, a cabbage, and a box are ...","[{'content': 'A man, a cabbage, and a box are ...",1,True,English,1.723198e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': True, 'creati...",d26c06d6832149686a4878da09cd0c16be5e2f1829897d...,"A man, a cabbage, and a box are trying to cros..."
7713,7713,107449,bc8216dbb42f4cd2b23b60e2b60cb6db,gemini-1.5-pro-api-0514,claude-3-5-sonnet-20240620,model_a,[{'content': 'write a paragraph that is mislea...,[{'content': 'write a paragraph that is mislea...,1,True,English,1.719016e+09,"{'bold_count_a': {'**': 0, '__': 0}, 'bold_cou...",False,False,"{'high_freq': False, 'sampled': True}","{'criteria_v0.1': {'complexity': False, 'creat...",d26dc8916bdf32f335d8e8e0444f87bf5c774c178886d6...,write a paragraph that is misleading about pin...


In [202]:
# if already exist level_0, then don't run reset_index
# df.reset_index(inplace=True)
llm_df = df.merge(doc_info[['Topic']], left_index=True, right_index=True)
llm_df = llm_df.merge(merged, how='left', left_on='Topic', right_on='narrower_category_id')
# llm_df = llm_df[['question_id', 'broad_category_id', 'broad_category', 'narrower_category_id', 'narrower_category', 'model_a', 'model_b', 'winner']]

In [203]:
llm_df

Unnamed: 0,level_0,index,question_id,model_a,model_b,winner,conversation_a,conversation_b,turn,anony,...,judge_hash,Prompt,Topic,broad_category_id,broad_category,narrower_category_id,narrower_category,prompt_count,prompt_percentage,example_prompt
0,0,14,c021d629adf3459980c1a3c343e1e5d6,claude-3-opus-20240229,gpt-4o-2024-08-06,tie (bothbad),[{'content': 'Create a short paragraph (100-15...,[{'content': 'Create a short paragraph (100-15...,1,True,...,8905c5f80d86d5a57311800d9bcc86d7eccd12ec183057...,Create a short paragraph (100-150 words) descr...,6,0.0,diverse inquiries,6.0,Grammar and Language Exercises,131.0,0.016980,create a dialogue (2 exchanges) that include t...
1,1,37,ccdaf90ba67841909e6101fe3d0de6a3,gemini-1.5-pro-api-0514,claude-3-opus-20240229,tie (bothbad),[{'content': 'Create a detailed table that inc...,[{'content': 'Create a detailed table that inc...,1,True,...,57fc33eea3fe8596705bace8e2b2123ac9cc79bd847346...,Create a detailed table that includes any nume...,-1,,,,,,,
2,2,38,20d89b6c6a204bffa836b4802243c0c7,claude-3-5-sonnet-20240620,llama-3.1-405b-instruct,model_a,"[{'content': 'Who is Hugo Touvron?', 'num_toke...","[{'content': 'Who is Hugo Touvron?', 'num_toke...",1,True,...,6a853c7b381d27b3ecfb8c609c50428825ff0bee1ca960...,Who is Hugo Touvron?,-1,,,,,,,
3,3,44,54747bb7c87e47bcbd86982cf4541c70,claude-3-5-sonnet-20240620,gpt-4-turbo-2024-04-09,model_b,[{'content': 'I saw a Discord user profile wit...,[{'content': 'I saw a Discord user profile wit...,2,True,...,7d4aa7c1008c93e911ca501d41f8314be74021c1476065...,I saw a Discord user profile with this:\n\nPro...,-1,,,,,,,
4,4,45,b360823b92ee4703aa305a0bb67d2d29,gemini-1.5-pro-api-0514,claude-3-5-sonnet-20240620,model_a,[{'content': 'suggest for me a method to earn ...,[{'content': 'suggest for me a method to earn ...,1,True,...,e9e204384f6dbc720e886f14d579006e9386c2d1bdcd25...,suggest for me a method to earn money online,-1,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7710,7710,107435,81014bb016784252998567b839639a95,llama-3-70b-instruct,claude-3-opus-20240229,tie,[{'content': 'What economists will do during r...,[{'content': 'What economists will do during r...,1,True,...,6b1e7ba192c7cbbcf811cf3202be683636c9a72ac0f77b...,What economists will do during recessionary ph...,9,0.0,diverse inquiries,9.0,Multiple Choice Questions and Answers,93.0,0.012054,Give Answer choice only.\n\nYou need to define...
7711,7711,107436,75654717c95a45f7812b81dc00eb8aa7,gpt-4o-2024-05-13,gemini-1.5-pro-exp-0801,tie,[{'content': 'In python script there is a comm...,[{'content': 'In python script there is a comm...,1,True,...,369599c5260a2d2f712f7118e621152da35d858dea0a91...,In python script there is a command that bids ...,0,0.0,diverse inquiries,0.0,Programming and Technical Troubleshooting,1435.0,0.186001,"Prompt user to select file(s), use tkinter ask..."
7712,7712,107444,994d50ea41fd45fbaac58cde925648e9,llama-3-70b-instruct,chatgpt-4o-latest,model_b,"[{'content': 'A man, a cabbage, and a box are ...","[{'content': 'A man, a cabbage, and a box are ...",1,True,...,d26c06d6832149686a4878da09cd0c16be5e2f1829897d...,"A man, a cabbage, and a box are trying to cros...",24,1.0,Problem Solving,24.0,River Crossing Puzzles,54.0,0.006999,A man and a dog on one side of the river with ...
7713,7713,107449,bc8216dbb42f4cd2b23b60e2b60cb6db,gemini-1.5-pro-api-0514,claude-3-5-sonnet-20240620,model_a,[{'content': 'write a paragraph that is mislea...,[{'content': 'write a paragraph that is mislea...,1,True,...,d26dc8916bdf32f335d8e8e0444f87bf5c774c178886d6...,write a paragraph that is misleading about pin...,29,0.0,diverse inquiries,29.0,Sentence Construction with Specific Endings,40.0,0.005185,"Give me 13 sentences which end with ""any""|||Us..."


In [204]:
# save if needed
llm_df.to_csv(f"{save_path}/filtered_conversations_and_category.csv", index=False)


### Create visualization

Instruction to generate explorer visualization:
1. Run the pipeline and the following cells to produce two output files: data.json and examples.json.
2. Clone the [arena-catalog](https://github.com/lmarena/arena-catalog/tree/data-explorer) repository, which contains the necessary HTML, CSS, and JavaScript files for the explorer.
2. In [explorer/index.html](https://github.com/lmarena/arena-catalog/blob/data-explorer/explorer/index.html), replace the file paths on lines 44 & 45 with the correct paths to your generated data.json and examples.json files.

In [None]:
# Export results in JSON format
root = {
    "name": "categories",
    "children": []
}
for broad_category, group in merged.groupby(["broad_category_id", "broad_category"]):
    parent = {
        "id": int(broad_category[0]),
        "name": broad_category[1],
        "children": []
    }

    for _, row in group.iterrows():
        child = {
            "id": row["narrower_category_id"],
            "name": row["narrower_category"],
            "count": row["prompt_count"],
            "percent": row['prompt_percentage'],
        }

        parent["children"].append(child)

    root["children"].append(parent)

json_output = json.dumps(root, indent=4)

with open(f"{save_path}/data.json", "w") as f:
    f.write(json_output)

In [None]:
# json file for example prompts
import pickle

# with open(f"{save_path}/example_prompts.pkl", 'rb') as f:
#     sampled_prompts = pickle.load(f)

# Group by 'broad_category' and transform to the desired JSON structure
root = []
del sampled_prompts[-1]
for i in sampled_prompts:
    obj = {
        "id": i,
        "name": merged[merged['narrower_category_id'] == i].loc[i, 'narrower_category'],
        "examples": sampled_prompts[i],
    }
    root.append(obj)

json_output = json.dumps(root, indent=4)
with open(f"{save_path}/examples.json", "w") as f:
    f.write(json_output)