In [1]:
from datasets import load_dataset, Dataset
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, classification_report

W1106 21:24:11.970000 63781 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
ds = load_dataset("lmarena-ai/arena-human-preference-140k")

Downloading readme: 0.00B [00:00, ?B/s]

In [3]:
train_ds = ds['train']
print(train_ds)

Dataset({
    features: ['id', 'model_a', 'model_b', 'winner', 'evaluation_session_id', 'evaluation_order', 'conversation_a', 'conversation_b', 'full_conversation', 'conv_metadata', 'category_tag', 'language', 'is_code', 'timestamp'],
    num_rows: 135634
})


In [4]:
def reorganize(dataset):
    df = dataset.to_pandas()
    
    # Create model_a rows
    df_a = df.rename(columns={
        'model_a': 'model_name',
        'conversation_a': 'conversation',
        'model_b': 'other_model'
    })[['model_name', 'conversation', 'other_model', 'winner', 'evaluation_session_id', 
        'evaluation_order', 'full_conversation', 'conv_metadata', 'category_tag', 
        'language', 'is_code', 'timestamp', 'id']]
    # Create model_b rows
    df_b = df.rename(columns={
        'model_b': 'model_name',
        'conversation_b': 'conversation',
        'model_a': 'other_model'
    })[['model_name', 'conversation', 'other_model', 'winner', 'evaluation_session_id', 
        'evaluation_order', 'full_conversation', 'conv_metadata', 'category_tag', 
        'language', 'is_code', 'timestamp', 'id']]
    # Combine and convert back
    df_combined = pd.concat([df_a, df_b], ignore_index=True)
    return df_combined

reorganized = reorganize(train_ds)
print(reorganized.head())

                              model_name  \
0                         gemini-2.5-pro   
1              claude-3-5-haiku-20241022   
2                                o3-mini   
3  claude-sonnet-4-20250514-thinking-32k   
4             claude-3-5-sonnet-20241022   

                                        conversation  \
0  [{'role': 'user', 'content': [{'type': 'text',...   
1  [{'role': 'user', 'content': [{'type': 'text',...   
2  [{'role': 'user', 'content': [{'type': 'text',...   
3  [{'role': 'user', 'content': [{'type': 'text',...   
4  [{'role': 'user', 'content': [{'type': 'text',...   

                               other_model    winner  \
0  claude-3-7-sonnet-20250219-thinking-32k   model_a   
1               claude-3-5-sonnet-20241022       tie   
2                          gemma-3n-e4b-it  both_bad   
3                           gemini-2.5-pro   model_a   
4                      mistral-medium-2505   model_b   

                  evaluation_session_id  evaluation_order  \


In [5]:
num_unique_models = len(set(reorganized['model_name']))
print(num_unique_models)

53


In [6]:
unique_models = sorted(set(reorganized['model_name']))
model_to_label = {model: idx for idx, model in enumerate(unique_models)}
label_to_model = {idx: model for model, idx in model_to_label.items()}
reorganized['label'] = reorganized['model_name'].map(model_to_label)

print(f"Model to label mapping: {model_to_label}")
print(reorganized.head())

Model to label mapping: {'amazon-nova-experimental-chat-05-14': 0, 'amazon.nova-pro-v1:0': 1, 'chatgpt-4o-latest-20250326': 2, 'claude-3-5-haiku-20241022': 3, 'claude-3-5-sonnet-20241022': 4, 'claude-3-7-sonnet-20250219': 5, 'claude-3-7-sonnet-20250219-thinking-32k': 6, 'claude-opus-4-20250514': 7, 'claude-opus-4-20250514-thinking-16k': 8, 'claude-sonnet-4-20250514': 9, 'claude-sonnet-4-20250514-thinking-32k': 10, 'command-a-03-2025': 11, 'deepseek-r1-0528': 12, 'deepseek-v3-0324': 13, 'gemini-2.0-flash-001': 14, 'gemini-2.0-flash-thinking-exp-01-21': 15, 'gemini-2.5-flash': 16, 'gemini-2.5-flash-lite-preview-06-17-thinking': 17, 'gemini-2.5-flash-preview-04-17': 18, 'gemini-2.5-pro': 19, 'gemini-2.5-pro-preview-03-25': 20, 'gemini-2.5-pro-preview-05-06': 21, 'gemma-3-27b-it': 22, 'gemma-3n-e4b-it': 23, 'gpt-4.1-2025-04-14': 24, 'gpt-4.1-mini-2025-04-14': 25, 'gpt-4o-2024-11-20': 26, 'gpt-4o-mini-2024-07-18': 27, 'grok-3-mini-beta': 28, 'grok-3-mini-high': 29, 'grok-3-preview-02-24': 3

In [7]:
def clean_conversation(conversation_list):
    """
    Convert conversation list format to a single text string
    """
    
    text_parts = []
    for turn in conversation_list:
        if isinstance(turn, dict) and 'content' in turn and 'role' in turn:
            role = turn['role']
            # Extract text from content list
            for content_item in turn['content']:
                if isinstance(content_item, dict) and content_item.get('text'):
                    text = content_item['text']
                    text_parts.append(f"{role}: {text}.")
    return " ".join(text_parts) if text_parts else None

In [8]:

print(clean_conversation(reorganized['conversation'][0]))

user: Compare Tormenta20 with DnD5e. assistant: Excellent question! Comparing **Tormenta20 (T20)** and **Dungeons & Dragons 5th Edition (D&D 5e)** is a fantastic way to understand different approaches to the d20 system. While they share a common ancestor (D&D 3rd Edition), they have evolved in vastly different directions.

Here is a detailed comparison, broken down by key aspects of the games.

---

### **High-Level Philosophy**

*   **D&D 5e:** Aims for **accessibility, streamlined play, and heroic fantasy**. It simplifies many of the complex rules of older editions to be welcoming to new players. Its design philosophy is "rulings, not rules," encouraging Dungeon Masters to make calls on the fly. It is built to be setting-agnostic, though the Forgotten Realms is its default.
*   **Tormenta20:** Aims for **deep character customization, high-power fantasy, and an "anime/JRPG" feel**. It offers players a vast menu of options to build a unique character from level 1. The power level escal

In [9]:
reorganized['conversation_text'] = reorganized['conversation'].apply(clean_conversation)
print(reorganized['conversation_text'])

0         user: Compare Tormenta20 with DnD5e. assistant...
1         user: jak w prompcie określić precyzyjnie para...
2         user: Solve the following game of Freecell. Ca...
3         user: What about 1 year in Sweden with proof e...
4         user: Gibt es bei Kleidungs Reinigung verschie...
                                ...                        
271263    user: co się stało na placu tiananmen?. assist...
271264    user: rtss не выводит 1% фпс. assistant: Пробл...
271265    user: czy odpowiadasz na pytania zadane po pol...
271266    user: Проверь этот код что он делает?\n\n     ...
271267    user: What does it mean to say God's transcend...
Name: conversation_text, Length: 271268, dtype: object


In [10]:
reorganized = reorganized[reorganized['conversation_text'].notna()]
reorganized = reorganized[reorganized['conversation_text'].str.len() > 10]

print(f"Rows after cleaning: {len(reorganized)}")
print(max(reorganized['conversation_text'].str.len()))

Rows after cleaning: 271268
1905943


In [11]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def count_tokens(text):
    """Count tokens using BERT tokenizer"""
    if text is None:
        return 0
    tokens = tokenizer.tokenize(text)
    return len(tokens)

num_tokens = reorganized['conversation_text'].apply(count_tokens)

print(num_tokens.head())
print(f"\nToken count statistics:")
print(num_tokens.describe())

Token indices sequence length is longer than the specified maximum sequence length for this model (1982 > 512). Running this sequence through the model will result in indexing errors


0    1982
1     777
2    2167
3     368
4     605
Name: conversation_text, dtype: int64

Token count statistics:
count    271268.000000
mean       1532.890835
std        2533.892882
min           4.000000
25%         408.000000
50%         935.000000
75%        1790.000000
max      235390.000000
Name: conversation_text, dtype: float64


In [12]:
data_short = reorganized[num_tokens < 500]
print(len(data_short))
print(count_tokens(data_short.iloc[0]['conversation_text']))

81334
368


In [13]:
dataset = Dataset.from_pandas(data_short[['conversation_text', 'label']])

In [14]:
# Split into train/validation/test sets
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Further split train into train/validation
train_val_split = train_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

print(f"Train set size: {len(train_dataset)}")
print(f"Val set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Train set size: 58560
Val set size: 6507
Test set size: 16267


In [15]:
print(dataset.column_names)

['conversation_text', 'label', '__index_level_0__']


In [23]:
dataset.save_to_disk('./LMarena_short')

Saving the dataset (0/1 shards):   0%|          | 0/81334 [00:00<?, ? examples/s]

In [16]:
def tokenize_function(examples):
    return tokenizer(
        examples['conversation_text'],
        padding='max_length',
        truncation=True,
        max_length=512  # Adjust based on your conversation lengths
    )

In [17]:
# Tokenize all datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove the original text column (we only need input_ids, attention_mask, and labels)
train_dataset = train_dataset.remove_columns(['conversation_text'])
val_dataset = val_dataset.remove_columns(['conversation_text'])
test_dataset = test_dataset.remove_columns(['conversation_text'])

# Set format for PyTorch
train_dataset.set_format('torch')
val_dataset.set_format('torch')
test_dataset.set_format('torch')

Map:   0%|          | 0/58560 [00:00<?, ? examples/s]

Map:   0%|          | 0/6507 [00:00<?, ? examples/s]

Map:   0%|          | 0/16267 [00:00<?, ? examples/s]

In [18]:
print(train_dataset.column_names)

['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']


In [19]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=53
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy='epoch',  # Evaluate at the end of each epoch
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    learning_rate=2e-5,
    seed=42,
    push_to_hub=False,  # Set to True if you want to push to Hugging Face Hub
)


In [21]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    
    return {
        'accuracy': accuracy,
        'f1': f1,
    }

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("Starting training...")
trainer.train()

Starting training...




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 