In [1]:
import os
import polars as pl
from getpass import getpass

# Single location used for both the Hub cache and the datasets cache.
HF_CACHE_DIR = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"

# Prompt for the Hugging Face token interactively so it is never stored in the notebook.
hf_token = getpass()

# Export the cache locations and the token through the environment.
os.environ.update(
    {
        "HF_HUB_CACHE": HF_CACHE_DIR,
        "HF_DATASETS_CACHE": HF_CACHE_DIR,
        "HF_TOKEN": hf_token,
    }
)

In [2]:
import datasets

# Load the "train" split of the comparia reactions dataset from the Hugging Face Hub.
# The cache directory is read from HF_DATASETS_CACHE, which the first cell sets, rather
# than repeating the absolute path here: the location is then configured in one place.
comparia = datasets.load_dataset(
    "ministere-culture/comparia-reactions",
    cache_dir=os.environ["HF_DATASETS_CACHE"],
    split="train",
)

In [3]:
# Convert the loaded Hugging Face dataset into a polars DataFrame.
# `comparia` is rebound in place, so the conversion is guarded: without the check,
# re-running this cell would call `.to_polars()` on the already-converted frame.
if not isinstance(comparia, pl.DataFrame):
    comparia: pl.DataFrame = comparia.to_polars()  # type: ignore

In [4]:
# Number of rows in which each model appears on side A, most frequent first.
side_a_counts = comparia.group_by("model_a_name").len().sort("len", descending=True)
comparia_model_a = side_a_counts.select("len", pl.col("model_a_name").alias("model_name"))

# Same count for side B, exposed under the same `model_name` column.
side_b_counts = comparia.group_by("model_b_name").len().sort("len", descending=True)
comparia_model_b = side_b_counts.select("len", pl.col("model_b_name").alias("model_name"))

# Stack both sides and add up the two counts of every model.
stacked_counts = pl.concat([comparia_model_a, comparia_model_b])
number_by_model = stacked_counts.group_by("model_name").sum().sort("len", descending=True)

In [5]:
# Display the combined (side A + side B) row count per model, largest first.
number_by_model

model_name,len
str,u32
"""gpt-4o-2024-08-06""",3894
"""deepseek-v3-chat""",3857
"""gpt-4o-mini-2024-07-18""",3816
"""claude-3-5-sonnet-v2""",3514
"""llama-3.1-405b""",3409
…,…
"""gemma-3-12b""",408
"""mistral-small-3.1-24b""",383
"""gemma-3-4b""",381
"""gemma-2-27b-it-q8""",296


In [6]:
from rank_comparia.data_transformation import get_matches_with_score, get_winners, get_winrates

# Build the scored-match table from the polars frame. Per the preview in the next cell it
# carries model_a_name, model_b_name, conversation_pair_id, score_a and score_b.
# NOTE(review): how the scores are derived lives in rank_comparia and is not visible here.
matches = get_matches_with_score(comparia)

In [7]:
# Preview the first five scored matches.
matches.head(5)

model_a_name,model_b_name,conversation_pair_id,score_a,score_b
str,str,str,i64,i64
"""claude-3-5-sonnet-v2""","""deepseek-v3-chat""","""b5864da64a9f481f9286dfa6c4d6d1…",0,-2
"""llama-3.1-8b""","""deepseek-v3-chat""","""7784132de47e4ee0b4db402891f5f5…",3,3
"""mixtral-8x22b-instruct-v0.1""","""deepseek-v3-chat""","""f234054c39434419b2846efd06e3c5…",0,1
"""claude-3-5-sonnet-v2""","""mistral-nemo-2407""","""ff5e42399c2d4e87961c41bd96bfff…",1,-2
"""llama-3.1-70b""","""mixtral-8x22b-instruct-v0.1""","""2261be2af79a4445946294ae189408…",-2,-2


In [8]:
# Derive the winners from the scored matches.
# NOTE(review): the exact win/draw rule is implemented in get_winners and is not visible here.
winners = get_winners(matches)

In [9]:
# Aggregate the winners into per-model statistics (the output shows len, wins and winrate).
winrates = get_winrates(winners)
# Display best win rate first; the sorted result is only shown, it is not assigned back.
winrates.sort("winrate", descending=True)

model_name,len,wins,winrate
str,u32,u32,f64
"""gemini-2.0-flash-exp""",856,647,75.584112
"""gemma-3-27b""",275,202,73.454545
"""deepseek-v3-chat""",1511,1065,70.483124
"""gemini-2.0-flash-001""",434,301,69.354839
"""command-a""",208,141,67.788462
…,…,…,…
"""mixtral-8x7b-instruct-v0.1""",585,222,37.948718
"""lfm-40b""",887,321,36.189402
"""mixtral-8x22b-instruct-v0.1""",1459,445,30.500343
"""mistral-nemo-2407""",1440,430,29.861111


In [10]:
from rank_comparia.elo import ELORanker
from rank_comparia.ranker import Match, MatchScore
import random


def compute_match_score(score_a: int, score_b: int) -> MatchScore:
    """Map the two per-model scores of a match to a MatchScore outcome.

    Args:
        score_a: Score of model A for the match.
        score_b: Score of model B for the match.

    Returns:
        MatchScore.A when A scored strictly higher, MatchScore.B when B scored
        strictly higher, MatchScore.Draw otherwise.
    """
    margin_for_a = score_a - score_b
    if margin_for_a > 0:
        return MatchScore.A
    if margin_for_a < 0:
        return MatchScore.B
    return MatchScore.Draw


def get_shuffled_results(matches: list[Match], model_names: list[str], seed: int = 0, k_factor: int = 40):
    """Run an ELO ranking over a seeded shuffle of the matches.

    Args:
        matches: Matches to rank; the list itself is not modified.
        model_names: Names of the players to register with the ranker.
        seed: Seed controlling the shuffle order.
        k_factor: Value passed to ELORanker as ``K`` (defaults to the
            previously hard-coded 40).

    Returns:
        The ``players`` attribute of the ranker after all matches were processed.
    """
    # A dedicated generator produces the same permutation as seeding the module-level
    # RNG with `seed`, without resetting the global random state as a side effect.
    # NOTE(review): assumes ELORanker does not itself draw from the global `random`
    # state -- confirm.
    rng = random.Random(seed)
    ranker_shuffle = ELORanker(K=k_factor)
    # Sampling len(matches) items without replacement yields a shuffled copy.
    matches_shuffle = rng.sample(matches, k=len(matches))
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_scores(matches=matches_shuffle)
    return ranker_shuffle.players

In [11]:
# Number of seeded shuffles used to average out the order dependence of the ELO scores.
N_SHUFFLES = 100

# `matches` is rebound below from the polars frame to a plain list of Match objects,
# and later cells consume that list. The conversion is guarded so this cell can be
# re-run: on a second run `matches` is already the list and `model_names` is already
# defined, so indexing it by column name would fail.
if isinstance(matches, pl.DataFrame):
    # Every model that appears on either side of at least one match.
    model_names = set(matches["model_a_name"].unique()) | set(matches["model_b_name"].unique())
    matches = [
        Match(
            match_dict["model_a_name"],
            match_dict["model_b_name"],
            compute_match_score(match_dict["score_a"], match_dict["score_b"]),
        )
        for match_dict in matches.to_dicts()
    ]

# One ELO run per seed, keyed by the seed that produced the shuffle.
player_results = {
    seed: get_shuffled_results(matches=matches, model_names=model_names, seed=seed) for seed in range(N_SHUFFLES)  # type: ignore
}

In [12]:
# Average each model's value over all shuffled runs. The divisor is taken from
# `player_results` itself so it stays correct if the number of seeds changes.
players_avg_ranking = {
    player_name: sum(results[player_name] for results in player_results.values()) / len(player_results)
    for player_name in model_names
}

In [13]:
# Order the models from highest to lowest average value, then print one per line.
by_average_desc = sorted(players_avg_ranking.items(), key=lambda entry: -entry[1])
for player, ranking in by_average_desc:
    print(f"{player} : {ranking}")

gemini-2.0-flash-exp : 1157.2941479234191
gemma-3-27b : 1151.017731303702
gemini-2.0-flash-001 : 1128.1208818091804
deepseek-v3-chat : 1127.2467881834598
command-a : 1111.9863135497003
llama-3.1-nemotron-70b-instruct : 1085.704619881789
gemma-3-12b : 1074.0127135388204
deepseek-r1 : 1067.0753052677067
gemma-3-4b : 1058.942950664265
gemini-1.5-pro-002 : 1055.109890657114
gemini-1.5-pro-001 : 1051.7473561303123
mistral-small-3.1-24b : 1041.177806252562
mistral-large-2411 : 1034.2766890373898
claude-3-5-sonnet-v2 : 1020.8692907378431
o3-mini : 1016.0008992360407
llama-3.1-405b : 1014.7368456157051
gpt-4o-2024-08-06 : 1009.6175522181197
gpt-4o-mini-2024-07-18 : 1009.0453037875014
llama-3.3-70b : 1003.7986825017117
jamba-1.5-large : 996.5424515898704
mistral-small-24b-instruct-2501 : 996.0728320125398
gemma-2-27b-it-q8 : 993.7873247351531
phi-4 : 991.143395068032
deepseek-r1-distill-llama-70b : 981.7745660237254
llama-3.1-70b : 981.035165377253
gemma-2-9b-it : 974.2934151360159
ministral-8b

In [14]:
from random import sample, seed

# Single ELO run on one shuffle of the matches (seed 42).
ranker_shuffle = ELORanker(K=40)

# Seed the global RNG right before sampling so the shuffle order is reproducible.
seed(42)
# Sampling len(matches) items without replacement gives a shuffled copy; `matches` is untouched.
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
# Last expression of the cell: the resulting scores are displayed as the cell output.
ranker_shuffle.get_scores()

{'gemma-3-27b': 1218.9697449404073,
 'gemini-2.0-flash-exp': 1150.858189361456,
 'deepseek-v3-chat': 1119.5783278500196,
 'command-a': 1108.4126271866912,
 'gemini-2.0-flash-001': 1094.5899437475196,
 'gemma-3-4b': 1080.4268240046715,
 'llama-3.1-nemotron-70b-instruct': 1079.3609075985867,
 'gemini-1.5-pro-002': 1078.108923148095,
 'gemini-1.5-pro-001': 1075.8876201577034,
 'gpt-4o-mini-2024-07-18': 1069.0936258148813,
 'claude-3-5-sonnet-v2': 1065.5645975332131,
 'gemma-3-12b': 1051.502508837704,
 'llama-3.1-8b': 1035.6399625301644,
 'mistral-large-2411': 1024.9240656873892,
 'mistral-small-3.1-24b': 1022.4594850892795,
 'qwen2.5-coder-32b-instruct': 1020.2050230041795,
 'deepseek-r1': 1013.5370037672889,
 'aya-expanse-8b': 1011.9924843896729,
 'mistral-small-24b-instruct-2501': 1007.8120967265646,
 'o3-mini': 1007.3257540112074,
 'jamba-1.5-large': 997.1705166960566,
 'deepseek-r1-distill-llama-70b': 990.0498018645136,
 'phi-4': 987.0139024387507,
 'llama-3.1-70b': 985.2419780549873,

In [15]:
# Same single ELO run as the previous cell, with a different shuffle (seed 1337).
# `seed` and `sample` come from the `from random import ...` line of the previous cell.
ranker_shuffle = ELORanker(K=40)

# Seed the global RNG right before sampling so the shuffle order is reproducible.
seed(1337)
# Sampling len(matches) items without replacement gives a shuffled copy; `matches` is untouched.
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
# Last expression of the cell: the resulting scores are displayed as the cell output.
ranker_shuffle.get_scores()

{'gemini-2.0-flash-exp': 1183.2249238191819,
 'gemma-3-27b': 1170.8246373266186,
 'command-a': 1158.7196218466388,
 'gemini-2.0-flash-001': 1148.9368106015984,
 'deepseek-r1': 1126.5284608238528,
 'llama-3.1-nemotron-70b-instruct': 1102.656687428728,
 'gemma-3-4b': 1098.5158362420539,
 'gemini-1.5-pro-002': 1095.7595031299986,
 'deepseek-v3-chat': 1094.4597103322528,
 'gemma-3-12b': 1082.8651112931598,
 'gpt-4o-mini-2024-07-18': 1071.8016781715187,
 'gpt-4o-2024-08-06': 1049.544044269285,
 'mistral-small-3.1-24b': 1038.17783454564,
 'o3-mini': 1031.1784329344102,
 'deepseek-r1-distill-llama-70b': 1019.7467208322944,
 'gemini-1.5-pro-001': 1013.0913784387037,
 'llama-3.1-70b': 1006.7479287837886,
 'mistral-small-24b-instruct-2501': 1001.7586732700793,
 'mistral-large-2411': 998.3664288496796,
 'jamba-1.5-large': 989.0282367273422,
 'claude-3-5-sonnet-v2': 988.1648726866101,
 'llama-3.1-405b': 985.6521349099373,
 'qwen2.5-coder-32b-instruct': 982.2354056192605,
 'qwq-32b': 972.9671604248

In [16]:
from rank_comparia.maximum_likelihood import MaximumLikelihoodRanker

# Score the same match list with the maximum-likelihood ranker; no shuffling is applied here.
ranker = MaximumLikelihoodRanker()
ranker.compute_scores(matches=matches)
# Last expression of the cell: the scores are displayed (as np.float64 values per the output).
ranker.get_scores()

{'gemini-2.0-flash-exp': np.float64(1154.613547646744),
 'gemma-3-27b': np.float64(1151.996701599601),
 'gemini-2.0-flash-001': np.float64(1124.5616750643296),
 'deepseek-v3-chat': np.float64(1123.9831406383748),
 'command-a': np.float64(1106.6338381476846),
 'llama-3.1-nemotron-70b-instruct': np.float64(1079.4198358059523),
 'gemma-3-12b': np.float64(1072.030188153528),
 'deepseek-r1': np.float64(1065.1122962748934),
 'gemma-3-4b': np.float64(1062.6939030753736),
 'gemini-1.5-pro-002': np.float64(1059.1824058503007),
 'gemini-1.5-pro-001': np.float64(1051.8732400312688),
 'mistral-small-3.1-24b': np.float64(1040.97347256112),
 'mistral-large-2411': np.float64(1037.7388644150883),
 'llama-3.1-405b': np.float64(1018.244845660219),
 'claude-3-5-sonnet-v2': np.float64(1013.7515686846236),
 'gpt-4o-mini-2024-07-18': np.float64(1010.5428022782845),
 'o3-mini': np.float64(1009.7305011924993),
 'gpt-4o-2024-08-06': np.float64(1005.1432127620177),
 'llama-3.3-70b': np.float64(1000.796407404923