# Installing requirements

In [None]:
!pip install huggingface_hub
!pip install datasets

from huggingface_hub import notebook_login
import datasets
from tqdm import tqdm
import numpy as np
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib

# Download the dataset

In [44]:
# Authenticate with the Hugging Face Hub (renders the login widget shown in this cell's output).
# NOTE(review): if notebook_login() returns before the token is submitted, the load below
# would run unauthenticated on a fresh kernel — consider keeping the login in its own cell. TODO confirm.
notebook_login()
# Load the dataset by its Hub id, then convert its 'train' split to a pandas DataFrame.
hugging_face_dataset = datasets.load_dataset('lmsys/chatbot_arena_conversations')
dataset = hugging_face_dataset['train'].to_pandas()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [93]:
# Show the first conversation that has more than 3 turns
dataset.loc[dataset['turn'] > 3].head(1)

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,turn,anony,language,tstamp,openai_moderation,toxic_chat_tag
33,8120899314f74641b09c2aa114d4d253,alpaca-13b,vicuna-13b,model_b,arena_user_316,[{'content': 'Salut ! Comment ça va ce matin ?...,[{'content': 'Salut ! Comment ça va ce matin ?...,6,True,French,1682354000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."


# Create text-author pairs

In [139]:
def _contents_by_role(conversation, role):
    """Return the ``content`` of every message in ``conversation`` sent by ``role``.

    Args:
        conversation: iterable of message mappings with "content" and "role" keys.
        role: role to keep, e.g. "assistant" or "user".

    Returns:
        List of message contents, in their original order.
    """
    return [message["content"] for message in conversation if message["role"] == role]


# Flat list of {"text", "author"} records, one per message
text_author_pairs = []

# Walk the four needed columns in lockstep; this avoids `iterrows()`, which
# builds a full Series for every row just to read four fields from it.
for conversation_a, conversation_b, model_a, model_b in zip(
    dataset["conversation_a"],
    dataset["conversation_b"],
    dataset["model_a"],
    dataset["model_b"],
):
    # Per row, records are emitted in this order: model A replies, model B
    # replies, then the user prompts.
    sources = (
        (conversation_a, "assistant", model_a),
        (conversation_b, "assistant", model_b),
        # User prompts are read from conversation_a only; they are assumed to
        # be identical in conversation_b.
        (conversation_a, "user", "human"),
    )
    for conversation, role, author in sources:
        text_author_pairs.extend(
            {"text": text, "author": author}
            for text in _contents_by_role(conversation, role)
        )

# Create token count vectors for each text

In [142]:
# Split the records into two parallel lists: message texts and their authors
paragraphs = []
authors = []
for pair in text_author_pairs:
    paragraphs.append(pair["text"])
    authors.append(pair["author"])

# Learn the vocabulary and count tokens in one pass;
# row i of x_train holds the counts for paragraphs[i]
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(paragraphs)

             chatglm-6b  koala-13b  oasst-pythia-12b  alpaca-13b  vicuna-13b  \
00                  136         64               238          47         155   
000                 114        204               129          37         227   
0000                  2          0                 0           0           1   
00000                 0          0                 0           0           0   
000000                1          2                 0           0           1   
...                 ...        ...               ...         ...         ...   
ｔｏ                    0          0                 0           0           0   
ｗｅｅｋｓ                 0          0                 0           0           0   
𝘀𝗶𝗺𝗽𝗹𝗲                0          0                 0           0           0   
𝘀𝘂𝗽𝗲𝗿𝗵𝘂𝗺𝗮𝗻            0          0                 0           0           0   
𝘀𝘂𝗽𝗲𝗿𝗵𝘂𝗺𝗮𝗻𝘀           0          0                 0           0           0   

             dolly-v2-12b  stablelm-tun

# Sum token counts by author

In [142]:
# Sum the token-count rows belonging to each model author ('human' rows are skipped).
# Rows are selected with a boolean mask and summed while still sparse, instead of
# converting every single row to a dense vocabulary-sized array before adding it.
authors_array = np.asarray(authors)

# Model authors in order of first appearance, so the column order of author_df is unchanged
model_authors = list(dict.fromkeys(author for author in authors if author != 'human'))

author_vectors = {}
for author in model_authors:
    author_rows = x_train[authors_array == author]
    # Keep each total as a (1, n_tokens) ndarray, the shape the row-wise accumulation produced
    author_vectors[author] = np.asarray(author_rows.sum(axis=0)).reshape(1, -1)

# One column per author, one row per vocabulary token
author_df = pd.DataFrame({author: vector.flatten() for author, vector in author_vectors.items()},
                         index=vectorizer.get_feature_names_out())

print(author_df)  # Token counts for each author

             chatglm-6b  koala-13b  oasst-pythia-12b  alpaca-13b  vicuna-13b  \
00                  136         64               238          47         155   
000                 114        204               129          37         227   
0000                  2          0                 0           0           1   
00000                 0          0                 0           0           0   
000000                1          2                 0           0           1   
...                 ...        ...               ...         ...         ...   
ｔｏ                    0          0                 0           0           0   
ｗｅｅｋｓ                 0          0                 0           0           0   
𝘀𝗶𝗺𝗽𝗹𝗲                0          0                 0           0           0   
𝘀𝘂𝗽𝗲𝗿𝗵𝘂𝗺𝗮𝗻            0          0                 0           0           0   
𝘀𝘂𝗽𝗲𝗿𝗵𝘂𝗺𝗮𝗻𝘀           0          0                 0           0           0   

             dolly-v2-12b  stablelm-tun

In [143]:
# Tabulate the records, then show only the rows not authored by 'human'
text_author_pairs_dataframe = pd.DataFrame(text_author_pairs)
is_model_text = text_author_pairs_dataframe['author'].ne('human')
text_author_pairs_dataframe.loc[is_model_text]

Unnamed: 0,text,author
0,OpenCL and CUDA are two different programming ...,chatglm-6b
1,OpenCL and CUDA are both programming languages...,koala-13b
3,It is possible that your parent did not invite...,oasst-pythia-12b
4,It is likely that they wanted to keep the gues...,alpaca-13b
6,Both Fuji and Nikon are popular camera brands ...,koala-13b
...,...,...
117940,"1.\t""섹스""\n2.\t""체""\n3.\t""위""\n4.\t""자""\n5.\t""세""\n...",oasst-pythia-12b
117942,Evaluating a language model output involves se...,guanaco-33b
117943,Evaluating the output of a language model invo...,koala-13b
117945,Power Automate is a powerful tool that allows ...,chatglm-6b


In [101]:
# `ready_for_conversion` is never defined in this notebook, so the original cell
# raised NameError on a fresh kernel. The saved output is the first five
# `conversation_a` entries of `dataset`, so preview those directly.
dataset['conversation_a'].head()

0    [{'content': 'What is the difference between O...
1    [{'content': 'Why did my parent not invite me ...
2    [{'content': 'Fuji vs. Nikon, which is better?...
3    [{'content': 'How to build an arena for chatbo...
4    [{'content': 'When is it today?', 'role': 'use...
Name: conversation_a, dtype: object

In [98]:
def _exchange(question, answer):
    """Build a two-message conversation: a user turn followed by an assistant turn."""
    return [
        {"content": question, "role": "user"},
        {"content": answer, "role": "assistant"},
    ]


# Hand-written toy rows with the same four columns the pair-building loop reads
_toy_rows = [
    {
        "conversation_a": _exchange(
            "What is the difference between OpenCL and CUDA?",
            "OpenCL and CUDA are different programming models.",
        ),
        "conversation_b": _exchange(
            "Could you explain artificial intelligence?",
            "Artificial intelligence is a fascinating field.",
        ),
        "model_a": "chatglm-6b",
        "model_b": "oasst-pythia-12b",
    },
    {
        "conversation_a": _exchange(
            "Why did my parent not invite me to the party?",
            "Perhaps they forgot, or you can ask them directly.",
        ),
        "conversation_b": _exchange(
            "What is quantum computing?",
            "Quantum computing leverages quantum mechanics.",
        ),
        "model_a": "koala-13b",
        "model_b": "vicuna-13b",
    },
]

wtf = pd.DataFrame(
    _toy_rows,
    columns=["conversation_a", "conversation_b", "model_a", "model_b"],
)

In [103]:
# Display the hand-built toy frame
wtf

Unnamed: 0,conversation_a,conversation_b,model_a,model_b
0,[{'content': 'What is the difference between O...,[{'content': 'Could you explain artificial int...,chatglm-6b,oasst-pythia-12b
1,[{'content': 'Why did my parent not invite me ...,"[{'content': 'What is quantum computing?', 'ro...",koala-13b,vicuna-13b


In [145]:
# Save the per-author token count table as CSV (relative path)
author_df.to_csv('bruh.csv')