In [1]:
import os
import polars as pl
from getpass import getpass

# Single shared location for everything downloaded from the Hugging Face Hub,
# defined once so the hub cache and the datasets cache cannot drift apart.
HF_CACHE_DIR = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"

# Reuse a token that is already exported in the environment so the notebook can
# run unattended; otherwise prompt for it (input is not echoed). The explicit
# prompt replaces getpass's default "Password: ", which does not say what is asked.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# Set before the Hugging Face libraries are imported in the following cell, so
# that they see these values when they load.
os.environ["HF_HUB_CACHE"] = HF_CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = HF_CACHE_DIR
os.environ["HF_TOKEN"] = hf_token

In [2]:
import datasets

# Fetch the "train" split of the compar:IA reactions dataset from the Hugging
# Face Hub, storing the downloaded files in the shared project cache directory.
comparia = datasets.load_dataset(
    path="ministere-culture/comparia-reactions",
    split="train",
    cache_dir="/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/",
)

In [3]:
# `comparia` is rebound here from the Hugging Face dataset object to a polars
# DataFrame. The type guard keeps the cell re-runnable: on a second execution
# `comparia` is already a DataFrame, which has no `to_polars` method.
if not isinstance(comparia, pl.DataFrame):
    comparia: pl.DataFrame = comparia.to_polars()  # type: ignore

In [4]:
def _count_rows_by_model(frame: pl.DataFrame, model_column: str) -> pl.DataFrame:
    """Count the rows of ``frame`` for each distinct value of ``model_column``.

    Args:
        frame: Table containing the column named by ``model_column``.
        model_column: Name of the column holding the model identifier.

    Returns:
        A frame with the columns ``len`` (number of rows) and ``model_name``
        (the value taken from ``model_column``), sorted by ``len`` descending.
    """
    return (
        frame.group_by(model_column)
        .len()
        .sort("len", descending=True)
        # Explicit pl.col: a bare string passed as a keyword value is parsed by
        # polars as a column reference, which is easy to misread as a literal.
        .select("len", model_name=pl.col(model_column))
    )


# Same counting applied to each side of the comparison.
comparia_model_a = _count_rows_by_model(comparia, "model_a_name")
comparia_model_b = _count_rows_by_model(comparia, "model_b_name")

# A model can appear on either side, so stack both tables and add up the two
# per-side counts of each model.
number_by_model = (
    pl.concat([comparia_model_a, comparia_model_b])
    .group_by("model_name")
    .sum()
    .sort("len", descending=True)
)

In [5]:
# Display the number of rows in which each model appears (sides A and B
# combined), most frequent model first.
number_by_model

model_name,len
str,u32
"""gpt-4o-2024-08-06""",3894
"""deepseek-v3-chat""",3857
"""gpt-4o-mini-2024-07-18""",3816
"""claude-3-5-sonnet-v2""",3514
"""llama-3.1-405b""",3409
…,…
"""gemma-3-12b""",408
"""mistral-small-3.1-24b""",383
"""gemma-3-4b""",381
"""gemma-2-27b-it-q8""",296


In [6]:
from pathlib import Path
from rank_comparia.data_transformation import get_matches_with_score, get_winners, get_winrates

# Build the scored-match table from the raw reactions. The preview displayed by
# the next cell shows its columns: model_a_name, model_b_name,
# conversation_pair_id, score_a and score_b. How the scores are computed is
# defined in rank_comparia.data_transformation, not in this notebook.
matches = get_matches_with_score(comparia)

In [7]:
# Preview the first five scored matches.
matches.head(5)

model_a_name,model_b_name,conversation_pair_id,score_a,score_b
str,str,str,i64,i64
"""phi-3.5-mini-instruct""","""llama-3.1-70b""","""407f67a39f0f4128b1ea0a4e130028…",0,2
"""llama-3.3-70b""","""gpt-4o-2024-08-06""","""9868b2a12eb442568206f0a32cf2cf…",3,3
"""aya-expanse-8b""","""gemini-1.5-pro-002""","""05da273de21542138e9d8cc47c648d…",3,0
"""qwen2.5-7b-instruct""","""gpt-4o-2024-08-06""","""16a15b220d88442b89bc8adb72323e…",-1,0
"""gemini-2.0-flash-exp""","""lfm-40b""","""c867c9c56c32463480eac7d1e6ede2…",0,-1


In [8]:
# Derive the winners from the scored matches; the decision rule lives in
# rank_comparia.data_transformation.get_winners and is not visible here.
winners = get_winners(matches)

In [9]:
# Aggregate the winners into one row per model. The displayed table has the
# columns model_name, len, wins and winrate, where winrate is a percentage
# (e.g. 647 wins out of 856 -> 75.58).
winrates = get_winrates(winners)
# Show the models from highest to lowest win rate (display only: the sorted
# frame is not assigned back to `winrates`).
winrates.sort("winrate", descending=True)

model_name,len,wins,winrate
str,u32,u32,f64
"""gemini-2.0-flash-exp""",856,647,75.584112
"""gemma-3-27b""",275,202,73.454545
"""deepseek-v3-chat""",1511,1065,70.483124
"""gemini-2.0-flash-001""",434,301,69.354839
"""command-a""",208,141,67.788462
…,…,…,…
"""mixtral-8x7b-instruct-v0.1""",585,222,37.948718
"""lfm-40b""",887,321,36.189402
"""mixtral-8x22b-instruct-v0.1""",1459,445,30.500343
"""mistral-nemo-2407""",1440,430,29.861111


In [10]:
from rank_comparia.elo import ELORanker
from rank_comparia.match import Match, MatchScore
import random


def compute_match_score(score_a: int, score_b: int) -> MatchScore:
    """Turn the two per-model scores of a match into its outcome.

    Args:
        score_a: Score obtained by model A.
        score_b: Score obtained by model B.

    Returns:
        ``MatchScore.A`` when A has the strictly higher score, ``MatchScore.B``
        when B has the strictly higher score, ``MatchScore.Draw`` otherwise.
    """
    if score_b > score_a:
        return MatchScore.B
    if score_a > score_b:
        return MatchScore.A
    # Neither side is strictly ahead.
    return MatchScore.Draw


def get_shuffled_results(matches: list[Match], model_names: list[str], seed: int = 0, k_factor: int = 40):
    """Replay every match once, in a seeded random order, through a fresh ELO ranker.

    Args:
        matches: Matches to replay; the list itself is left untouched.
        model_names: Names of all players to register before replaying.
        seed: Seed of the shuffle; the same seed always yields the same order.
        k_factor: Value passed to ``ELORanker`` as ``K`` (defaults to the
            previously hard-coded 40).

    Returns:
        The ``players`` attribute of the ranker after all matches were processed.
    """
    # A private generator gives the same permutation as seeding the module-level
    # one, without resetting the global `random` state for the rest of the notebook.
    # NOTE(review): assumes ELORanker itself draws no random numbers - confirm.
    rng = random.Random(seed)
    matches_shuffle = rng.sample(matches, k=len(matches))

    ranker_shuffle = ELORanker(K=k_factor)
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_ranks(matches=matches_shuffle)
    return ranker_shuffle.players

In [None]:
# `matches` enters this cell as the polars DataFrame of scored matches and is
# rebound to a list of Match objects, the name later cells rely on. The type
# guard keeps the cell re-runnable: on a second execution `matches` is already
# the list and `model_names` is already defined.
if isinstance(matches, pl.DataFrame):
    # Sorted list instead of a set: it matches the `list[str]` expected by
    # get_shuffled_results and gives an iteration order that does not depend on
    # string hashing.
    model_names = sorted(set(matches["model_a_name"].unique()) | set(matches["model_b_name"].unique()))
    matches = [  # type: ignore
        Match(
            match_dict["model_a_name"],
            match_dict["model_b_name"],
            compute_match_score(match_dict["score_a"], match_dict["score_b"]),
        )
        for match_dict in matches.to_dicts()
    ]

# One full replay of all matches per seed (seeds 0..99), keyed by seed.
player_results = {
    shuffle_seed: get_shuffled_results(matches=matches, model_names=model_names, seed=shuffle_seed)  # type: ignore
    for shuffle_seed in range(100)
}

In [12]:
# Average, for every model, the value stored under its name across all shuffled
# replays. Dividing by the actual number of replays (instead of a literal 100)
# keeps the mean correct if the number of seeds is changed.
players_avg_ranking = {
    player_name: sum(results[player_name] for results in player_results.values()) / len(player_results)
    for player_name in model_names
}

In [13]:
# Print one "name : average" line per model, highest average first.
for player, ranking in sorted(players_avg_ranking.items(), key=lambda item: item[1], reverse=True):
    print(f"{player} : {ranking}")

gemini-2.0-flash-exp : 1157.028270838856
gemma-3-27b : 1155.3129448418474
deepseek-v3-chat : 1127.7370483265513
gemini-2.0-flash-001 : 1119.3969136393855
command-a : 1110.4961845967168
llama-3.1-nemotron-70b-instruct : 1081.6852126662395
deepseek-r1 : 1069.1204646445353
gemma-3-12b : 1068.2495396221402
gemma-3-4b : 1064.6241428516964
gemini-1.5-pro-002 : 1059.5959973931642
gemini-1.5-pro-001 : 1056.9972654113524
mistral-small-3.1-24b : 1046.5567065114808
mistral-large-2411 : 1039.9056029570474
llama-3.1-405b : 1027.1693649023744
gpt-4o-mini-2024-07-18 : 1016.6777315210252
o3-mini : 1012.7184184770366
gpt-4o-2024-08-06 : 1009.289900364873
claude-3-5-sonnet-v2 : 1008.2581872267095
jamba-1.5-large : 999.6233574330689
llama-3.3-70b : 998.0678896749833
mistral-small-24b-instruct-2501 : 990.7475928390303
phi-4 : 986.3984552127437
gemma-2-27b-it-q8 : 984.7081894887895
deepseek-r1-distill-llama-70b : 981.1814187776363
llama-3.1-70b : 980.7062058969495
gemma-2-9b-it : 976.3340476903418
ministra

In [14]:
from random import sample, seed

# Single replay of all matches in one shuffled order (seed 42) through a fresh
# ELO ranker with K=40, to look at the scores of one particular ordering.
ranker_shuffle = ELORanker(K=40)

# Seeds the module-level `random` generator, so the shuffle below is reproducible.
seed(42)
# sample() with k=len(matches) returns a shuffled copy; `matches` itself is not modified.
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_ranks(matches=matches_shuffle)
# Last expression of the cell: displays the scores as a dict keyed by model name.
ranker_shuffle.get_scores()

{'gemini-2.0-flash-exp': 1182.655813210524,
 'gemma-3-27b': 1137.7936889773757,
 'gemini-2.0-flash-001': 1132.8096894254127,
 'command-a': 1119.0145969398302,
 'gemini-1.5-pro-001': 1112.7388218655956,
 'deepseek-v3-chat': 1107.2829890748374,
 'deepseek-r1': 1086.2569897594703,
 'mistral-small-3.1-24b': 1077.9125231734731,
 'gemma-3-4b': 1075.5107270904402,
 'gemma-3-12b': 1046.6431029677765,
 'gpt-4o-2024-08-06': 1045.563105673389,
 'aya-expanse-8b': 1039.034797080991,
 'llama-3.1-nemotron-70b-instruct': 1021.9468619696204,
 'qwq-32b': 1021.3041798059635,
 'gpt-4o-mini-2024-07-18': 1020.8563395704401,
 'llama-3.1-405b': 1020.6523063070479,
 'o3-mini': 1019.2265179250985,
 'mistral-large-2411': 1016.3749877999796,
 'claude-3-5-sonnet-v2': 1012.1734602193051,
 'gemini-1.5-pro-002': 1007.393517141211,
 'jamba-1.5-large': 1005.9618854374053,
 'gemma-2-27b-it-q8': 996.4120727334465,
 'ministral-8b-instruct-2410': 993.0275637064254,
 'mistral-small-24b-instruct-2501': 991.9651155785541,
 'g

In [15]:
# Same single replay as the seed-42 cell, with a different shuffle (seed 1337).
# The displayed scores differ from the seed-42 output, i.e. the result depends
# on the order in which the matches are processed.
# Relies on `seed` and `sample` imported from `random` in the previous cell.
ranker_shuffle = ELORanker(K=40)

seed(1337)
# sample() with k=len(matches) returns a shuffled copy; `matches` itself is not modified.
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_ranks(matches=matches_shuffle)
# Last expression of the cell: displays the scores as a dict keyed by model name.
ranker_shuffle.get_scores()

{'gemini-2.0-flash-exp': 1182.7390199203767,
 'gemma-3-27b': 1172.766533344677,
 'command-a': 1147.7567901030293,
 'llama-3.1-nemotron-70b-instruct': 1126.527917793558,
 'gemini-2.0-flash-001': 1122.711787366888,
 'llama-3.1-405b': 1112.5545696469314,
 'deepseek-v3-chat': 1091.5538878101115,
 'gemini-1.5-pro-002': 1083.7210389698582,
 'mistral-small-3.1-24b': 1076.9213791792906,
 'gemma-3-12b': 1069.0676682530045,
 'gpt-4o-2024-08-06': 1051.2384489958413,
 'deepseek-r1': 1045.5131987432121,
 'phi-4': 1043.4095425272337,
 'llama-3.3-70b': 1042.681128173848,
 'gemini-1.5-pro-001': 1035.1120001878635,
 'gemma-2-9b-it': 1027.1483984916058,
 'gemma-3-4b': 1015.5574347684051,
 'claude-3-5-sonnet-v2': 1001.4204178114753,
 'llama-3.1-70b': 1001.1786671594692,
 'aya-expanse-8b': 991.5167601928323,
 'gpt-4o-mini-2024-07-18': 990.854755650781,
 'jamba-1.5-large': 984.091114434604,
 'mistral-large-2411': 983.7083239297039,
 'mistral-small-24b-instruct-2501': 980.3636992482191,
 'hermes-3-llama-3.1