In [1]:
import os
import polars as pl
from getpass import getpass

# Single source of truth for the shared Hugging Face cache location.
HF_CACHE_DIR = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"

# Ask for the token interactively so it is never stored in the notebook; the explicit
# prompt replaces getpass's generic "Password: " label.
hf_token = getpass("Hugging Face token: ")

# The hub cache and the datasets cache both point at the same shared directory.
os.environ["HF_HUB_CACHE"] = HF_CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = HF_CACHE_DIR
os.environ["HF_TOKEN"] = hf_token

In [2]:
import datasets

# Load the "train" split of the comparia reactions dataset from the Hugging Face hub.
# `cache_dir` repeats the directory already exported through HF_HUB_CACHE /
# HF_DATASETS_CACHE in the first cell.
comparia = datasets.load_dataset(
    "ministere-culture/comparia-reactions",
    cache_dir="/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/",
    split="train",
)

In [3]:
# Convert the loaded dataset to a polars DataFrame. The name `comparia` is rebound, so
# the guard keeps this cell safe to re-run: once it already holds a DataFrame there is
# nothing left to convert (an unguarded second call to `.to_polars()` would fail).
if not isinstance(comparia, pl.DataFrame):
    comparia = comparia.to_polars()  # type: ignore

In [4]:
def _count_matches_by_model(frame: pl.DataFrame, name_column: str) -> pl.DataFrame:
    """Count the rows of ``frame`` for each distinct value of ``name_column``.

    Returns a frame with the columns ``len`` and ``model_name`` (in that order),
    sorted by descending ``len``.
    """
    return (
        frame.group_by(name_column)
        .len()
        .sort("len", descending=True)
        .select("len", pl.col(name_column).alias("model_name"))
    )


# Number of rows in which each model appears on the A side, respectively the B side.
comparia_model_a = _count_matches_by_model(comparia, "model_a_name")
comparia_model_b = _count_matches_by_model(comparia, "model_b_name")

# Stack both sides, then add up the two counts of every model.
counts_both_sides = pl.concat([comparia_model_a, comparia_model_b])
number_by_model = counts_both_sides.group_by("model_name").sum().sort("len", descending=True)

In [5]:
# Total number of appearances per model, summed over the model_a and model_b sides.
number_by_model

model_name,len
str,u32
"""gpt-4o-2024-08-06""",3894
"""deepseek-v3-chat""",3857
"""gpt-4o-mini-2024-07-18""",3816
"""claude-3-5-sonnet-v2""",3514
"""llama-3.1-405b""",3409
…,…
"""gemma-3-12b""",408
"""mistral-small-3.1-24b""",383
"""gemma-3-4b""",381
"""gemma-2-27b-it-q8""",296


In [6]:
from pathlib import Path
from rank_comparia.data_transformation import get_matches_with_score, get_winners, get_winrates

# Build the scored-matches frame from the raw reactions.
# NOTE(review): presumably one row per conversation pair with a score per side
# (see the preview in the next cell) - confirm in rank_comparia.data_transformation.
matches = get_matches_with_score(comparia)

In [7]:
# Preview the first five scored matches.
matches.head(5)

model_a_name,model_b_name,conversation_pair_id,score_a,score_b
str,str,str,i64,i64
"""command-a""","""gemini-1.5-pro-002""","""c5c35269ef1a484292a60b2fcbf801…",2,0
"""llama-3.1-nemotron-70b-instruc…","""llama-3.1-8b""","""69a81c6dfadb4445a1ec7ddf58a11d…",3,0
"""claude-3-5-sonnet-v2""","""qwen2.5-coder-32b-instruct""","""75a36e57c9614fbc9b8e904d13c788…",0,1
"""jamba-1.5-large""","""gemini-1.5-pro-002""","""7f1dd8b6e2a64ea893f71af885d7b1…",1,1
"""mistral-small-3.1-24b""","""deepseek-v3-chat""","""a4d37bb370544e8497f7b250ca0e32…",0,1


In [8]:
# Derive the winners from the scored matches.
# NOTE(review): the exact winner rule lives in rank_comparia.data_transformation - confirm there.
winners = get_winners(matches)

In [9]:
# Win rate per model, computed from the winners frame.
winrates = get_winrates(winners)
# Display only: the sorted frame is not assigned, so `winrates` keeps its original order.
winrates.sort("winrate", descending=True)

model_name,len,wins,winrate
str,u32,u32,f64
"""gemini-2.0-flash-exp""",856,647,75.584112
"""gemma-3-27b""",275,202,73.454545
"""deepseek-v3-chat""",1511,1065,70.483124
"""gemini-2.0-flash-001""",434,301,69.354839
"""command-a""",208,141,67.788462
…,…,…,…
"""mixtral-8x7b-instruct-v0.1""",585,222,37.948718
"""lfm-40b""",887,321,36.189402
"""mixtral-8x22b-instruct-v0.1""",1459,445,30.500343
"""mistral-nemo-2407""",1440,430,29.861111


In [10]:
from rank_comparia.elo import ELORanker
from rank_comparia.match import Match, MatchScore
import random


def compute_match_score(score_a: int, score_b: int) -> MatchScore:
    """Turn the two per-side scores of a match into its outcome.

    Args:
        score_a: Score obtained by model A.
        score_b: Score obtained by model B.

    Returns:
        ``MatchScore.B`` when B scored strictly higher, ``MatchScore.A`` when A
        scored strictly higher, and ``MatchScore.Draw`` when both scores are equal.
    """
    if score_b > score_a:
        return MatchScore.B
    if score_a > score_b:
        return MatchScore.A
    return MatchScore.Draw


def get_shuffled_results(matches: list[Match], model_names: list[str], seed: int = 0, K: int = 40):
    """Run one ELO ranking over a seeded random permutation of ``matches``.

    Args:
        matches: Matches to replay; the list itself is left untouched.
        model_names: Names of all players to register before replaying.
        seed: Seed of the permutation, so a given seed always gives the same order.
        K: K-factor forwarded to ``ELORanker`` (defaults to the previous fixed 40).

    Returns:
        The ranker's ``players`` attribute after all matches have been processed
        (indexed by model name in the cells that consume it).
    """
    ranker_shuffle = ELORanker(K=K)
    # A private generator produces the same permutation as seeding the module-level
    # one, without resetting the global `random` state for the rest of the notebook.
    rng = random.Random(seed)
    matches_shuffle = rng.sample(matches, k=len(matches))
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_scores(matches=matches_shuffle)
    return ranker_shuffle.players

In [11]:
# `matches` starts out as the polars DataFrame of scored matches and is rebound to a
# list of Match objects below. The guard makes the cell safe to re-run: once the
# conversion has happened, `matches` and `model_names` are reused as they are.
if isinstance(matches, pl.DataFrame):
    # Sorted list rather than a set: deterministic order across kernels, and it
    # matches the `list[str]` annotation of get_shuffled_results.
    model_names = sorted(set(matches["model_a_name"].unique()) | set(matches["model_b_name"].unique()))
    matches = [
        Match(
            match_dict["model_a_name"],
            match_dict["model_b_name"],
            compute_match_score(match_dict["score_a"], match_dict["score_b"]),
        )
        for match_dict in matches.to_dicts()
    ]

# One ELO run per seed; each seed replays the same matches in a different order.
player_results = {
    seed: get_shuffled_results(matches=matches, model_names=model_names, seed=seed) for seed in range(100)  # type: ignore
}

In [12]:
# Average each model's score over all shuffled runs. Dividing by the actual number of
# runs (rather than a literal 100) keeps the mean correct if the seed range changes.
players_avg_ranking = {
    player_name: sum(results[player_name] for results in player_results.values()) / len(player_results)
    for player_name in model_names
}

In [13]:
# Order the models from highest to lowest average score, then print one per line.
ranked_players = sorted(players_avg_ranking.items(), key=lambda entry: -entry[1])
for player, ranking in ranked_players:
    print(f"{player} : {ranking}")

gemini-2.0-flash-exp : 1159.835765649278
gemma-3-27b : 1152.618960183281
gemini-2.0-flash-001 : 1125.6259159717804
deepseek-v3-chat : 1122.7934338956784
command-a : 1109.2156052313073
llama-3.1-nemotron-70b-instruct : 1088.2077349488752
deepseek-r1 : 1066.984214234589
gemma-3-12b : 1066.4512967588537
gemini-1.5-pro-002 : 1065.5034803505066
gemma-3-4b : 1062.8008354011902
gemini-1.5-pro-001 : 1049.610792610557
mistral-small-3.1-24b : 1044.1573804933316
mistral-large-2411 : 1031.1972652931388
o3-mini : 1016.0236651676515
llama-3.1-405b : 1013.8497280412064
gpt-4o-2024-08-06 : 1012.1437886433353
claude-3-5-sonnet-v2 : 1011.9123455668218
gpt-4o-mini-2024-07-18 : 1011.1633080538566
jamba-1.5-large : 996.5450212630784
phi-4 : 994.1058393080427
llama-3.3-70b : 992.4166532884456
mistral-small-24b-instruct-2501 : 990.9888211898422
gemma-2-27b-it-q8 : 989.0215035034687
deepseek-r1-distill-llama-70b : 986.3702642005592
gemma-2-9b-it : 981.4534250743222
llama-3.1-70b : 981.016759005094
aya-expanse

In [14]:
from random import sample, seed

# Single ELO run (K=40) on one fixed permutation of the matches, to show how much the
# scores of an individual run depend on match order.
ranker_shuffle = ELORanker(K=40)

# Seeds the global `random` generator so the permutation below is reproducible.
seed(42)
# Sampling len(matches) items without replacement yields a shuffled copy; `matches` is untouched.
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
# Last expression of the cell: the resulting scores are displayed as the cell output.
ranker_shuffle.get_scores()

{'gemini-2.0-flash-exp': 1183.4867491157845,
 'gemma-3-27b': 1180.7409053885328,
 'deepseek-v3-chat': 1139.4939627497208,
 'command-a': 1117.531631093939,
 'gemini-2.0-flash-001': 1097.9617704624359,
 'llama-3.1-nemotron-70b-instruct': 1092.6948479646198,
 'deepseek-r1': 1082.5117765662246,
 'gemini-1.5-pro-001': 1055.2311933164472,
 'gemma-3-4b': 1054.5029233755392,
 'llama-3.1-405b': 1045.2969672940199,
 'mistral-large-2411': 1039.7748720202499,
 'mistral-small-3.1-24b': 1037.55214990131,
 'gemma-3-12b': 1029.9889436928504,
 'llama-3.3-70b': 1016.9411699555915,
 'claude-3-5-sonnet-v2': 1005.3881412437157,
 'deepseek-r1-distill-llama-70b': 1002.279239468485,
 'gpt-4o-2024-08-06': 999.7796306088758,
 'ministral-8b-instruct-2410': 997.3705857434142,
 'c4ai-command-r-08-2024': 981.2995658320218,
 'o3-mini': 981.053399770166,
 'mistral-small-24b-instruct-2501': 980.4589842411491,
 'gemini-1.5-pro-002': 975.9931450774732,
 'jamba-1.5-large': 971.1860296890408,
 'gpt-4o-mini-2024-07-18': 96

In [15]:
# Same single-run experiment as the previous cell with a different seed (1337), for
# comparison. `seed` and `sample` come from the `from random import ...` of that cell.
ranker_shuffle = ELORanker(K=40)

# Seeds the global `random` generator so the permutation below is reproducible.
seed(1337)
# Sampling len(matches) items without replacement yields a shuffled copy; `matches` is untouched.
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
# Last expression of the cell: the resulting scores are displayed as the cell output.
ranker_shuffle.get_scores()

{'gemini-2.0-flash-exp': 1183.4179921009365,
 'gemma-3-27b': 1182.5980532698009,
 'gemini-1.5-pro-002': 1136.4441252706813,
 'command-a': 1116.8700754601125,
 'deepseek-v3-chat': 1103.0714861195213,
 'gemini-2.0-flash-001': 1088.4041617738817,
 'llama-3.1-nemotron-70b-instruct': 1085.318580098813,
 'deepseek-r1': 1082.0732443672703,
 'mistral-small-3.1-24b': 1076.8682172908802,
 'gemma-3-4b': 1074.091100502742,
 'gemini-1.5-pro-001': 1065.5802781049345,
 'mistral-large-2411': 1064.9739292714796,
 'llama-3.3-70b': 1038.1029852149766,
 'gemma-3-12b': 1024.3882899362548,
 'gpt-4o-2024-08-06': 1020.6206349017535,
 'gemma-2-27b-it-q8': 1017.6556602981265,
 'gemma-2-9b-it': 1015.4259778307485,
 'llama-3.1-405b': 1010.3436327945215,
 'phi-4': 1009.3227755806488,
 'o3-mini': 1001.835494463063,
 'c4ai-command-r-08-2024': 1000.3276030693978,
 'claude-3-5-sonnet-v2': 991.220280300796,
 'gpt-4o-mini-2024-07-18': 990.3480848172474,
 'mistral-small-24b-instruct-2501': 987.8857128336056,
 'jamba-1.5-

In [16]:
from rank_comparia.maximum_likelihood import MaximumLikelihoodRanker

# Rank the same matches with the maximum-likelihood ranker. The matches are passed in
# their original, unshuffled order here.
ranker = MaximumLikelihoodRanker()
ranker.compute_scores(matches=matches)
# Last expression of the cell: the resulting scores are displayed as the cell output.
ranker.get_scores()

{'gemini-2.0-flash-exp': np.float64(1071.5911431488494),
 'gemma-3-27b': np.float64(1070.6149512161023),
 'deepseek-v3-chat': np.float64(1058.7552040622897),
 'gemini-2.0-flash-001': np.float64(1058.5208268807687),
 'command-a': np.float64(1050.5695593881183),
 'llama-3.1-nemotron-70b-instruct': np.float64(1038.0127107423189),
 'gemma-3-12b': np.float64(1034.5409904079895),
 'deepseek-r1': np.float64(1031.587219159151),
 'gemma-3-4b': np.float64(1030.1160721559013),
 'gemini-1.5-pro-002': np.float64(1028.6351487602299),
 'gemini-1.5-pro-001': np.float64(1025.3303833271952),
 'mistral-small-3.1-24b': np.float64(1019.7461220961129),
 'mistral-large-2411': np.float64(1018.3135817966071),
 'llama-3.1-405b': np.float64(1009.1366953720458),
 'claude-3-5-sonnet-v2': np.float64(1006.6935189502086),
 'gpt-4o-mini-2024-07-18': np.float64(1005.1632698064324),
 'o3-mini': np.float64(1004.5772402173617),
 'gpt-4o-2024-08-06': np.float64(1002.5159238018007),
 'llama-3.3-70b': np.float64(1000.5323939