In [1]:
import os
import polars as pl
from getpass import getpass

# Ask for the token interactively so it never ends up in the notebook source;
# the explicit prompt tells the user which secret is expected.
hf_token = getpass("Hugging Face token: ")

# Both cache variables point at the same directory: keep the path in one place.
HF_CACHE_DIR = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"
os.environ["HF_HUB_CACHE"] = HF_CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = HF_CACHE_DIR
os.environ["HF_TOKEN"] = hf_token

In [2]:
import datasets

# Load the "train" split of the comparia-reactions dataset, using the cache
# directory given below.
comparia = datasets.load_dataset(
    path="ministere-culture/comparia-reactions",
    split="train",
    cache_dir="/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/",
)

In [3]:
# Convert the loaded dataset into a polars DataFrame. The name `comparia` is rebound,
# so the guard keeps this cell safe to re-run once the conversion has already happened
# (a DataFrame has no `to_polars` method).
if not isinstance(comparia, pl.DataFrame):
    comparia = comparia.to_polars()  # type: ignore

In [4]:
def _count_by_side(frame: pl.DataFrame, side_column: str) -> pl.DataFrame:
    """Count the rows of ``frame`` per distinct value of ``side_column``.

    Returns a frame with the columns ``len`` (row count) and ``model_name``
    (the values of ``side_column``), sorted by decreasing count.
    """
    return (
        frame.group_by([side_column])
        .len()
        .sort("len", descending=True)
        # Explicit column reference + alias: a bare string given to `with_columns`
        # is parsed as a column name (not a literal), which is easy to misread.
        .select("len", pl.col(side_column).alias("model_name"))
    )


comparia_model_a = _count_by_side(comparia, "model_a_name")
comparia_model_b = _count_by_side(comparia, "model_b_name")

# A model can appear on either side of a comparison: stack both per-side counts
# and add them up per model name.
number_by_model = (
    pl.concat([comparia_model_a, comparia_model_b])
    .group_by("model_name")
    .sum()
    .sort("len", descending=True)
)

In [5]:
# Display the per-model counts (sides A and B summed), largest count first.
number_by_model

model_name,len
str,u32
"""gpt-4o-2024-08-06""",3894
"""deepseek-v3-chat""",3857
"""gpt-4o-mini-2024-07-18""",3816
"""claude-3-5-sonnet-v2""",3514
"""llama-3.1-405b""",3409
…,…
"""gemma-3-12b""",408
"""mistral-small-3.1-24b""",383
"""gemma-3-4b""",381
"""gemma-2-27b-it-q8""",296


In [6]:
from rank_comparia.data_transformation import get_matches_with_score, get_winners, get_winrates

# Build the match table from the raw frame. The scoring logic lives in
# rank_comparia.data_transformation (not visible here); the resulting columns are
# the ones shown by the preview in the next cell.
matches = get_matches_with_score(comparia)

In [7]:
# Preview the first five rows of the match table.
matches.head(5)

model_a_name,model_b_name,conversation_pair_id,score_a,score_b
str,str,str,i64,i64
"""llama-3.1-405b""","""ministral-8b-instruct-2410""","""720b5c23ef07404896536f1945780a…",2,0
"""mixtral-8x22b-instruct-v0.1""","""mistral-large-2411""","""3a8b72f6f778430ca466d0f056613a…",0,1
"""gemma-2-9b-it""","""mistral-nemo-2407""","""0731014c941449768290a706b4260b…",1,-1
"""mixtral-8x22b-instruct-v0.1""","""gpt-4o-mini-2024-07-18""","""45122dc4707a415e937f8195011fda…",0,4
"""gpt-4o-mini-2024-07-18""","""deepseek-v3-chat""","""e38d646e2da7432abbe6964d0ed14e…",5,1


In [8]:
# Presumably derives the winner of each match from score_a / score_b; the helper is
# defined in rank_comparia.data_transformation — confirm its exact rule there.
winners = get_winners(matches)

In [9]:
# Aggregate the winners into one row per model, then display the best win rate first.
# In the displayed table `winrate` is a percentage (wins / len * 100).
winrates = get_winrates(winners)
winrates.sort("winrate", descending=True)

model_name,len,wins,winrate
str,u32,u32,f64
"""gemini-2.0-flash-exp""",856,647,75.584112
"""gemma-3-27b""",275,202,73.454545
"""deepseek-v3-chat""",1511,1065,70.483124
"""gemini-2.0-flash-001""",434,301,69.354839
"""command-a""",208,141,67.788462
…,…,…,…
"""mixtral-8x7b-instruct-v0.1""",585,222,37.948718
"""lfm-40b""",887,321,36.189402
"""mixtral-8x22b-instruct-v0.1""",1459,445,30.500343
"""mistral-nemo-2407""",1440,430,29.861111


In [10]:
from rank_comparia.elo import ELORanker
from rank_comparia.ranker import Match, MatchScore
import random


def compute_match_score(score_a: int, score_b: int) -> MatchScore:
    """Turn the two per-model scores of a match into its outcome.

    The model with the strictly higher score wins; equal scores are a draw.
    """
    if score_a > score_b:
        return MatchScore.A
    if score_b > score_a:
        return MatchScore.B
    return MatchScore.Draw


def get_shuffled_results(matches: list[Match], model_names: list[str], seed: int = 0, k_factor: float = 40):
    """Replay ``matches`` in a seeded random order through a fresh ELO ranker.

    Args:
        matches: matches to replay; the list itself is not modified.
        model_names: names registered as players before any match is processed.
        seed: seed of the shuffle; a given seed always yields the same order.
        k_factor: value passed as ``K`` to ``ELORanker`` (defaults to 40).

    Returns:
        The ``players`` attribute of the ranker once every match has been processed.
    """
    # Private generator: yields the same permutation as seeding the module-level RNG
    # with `seed`, but does not overwrite the global random state of the caller.
    rng = random.Random(seed)
    matches_shuffle = rng.sample(matches, k=len(matches))

    ranker_shuffle = ELORanker(K=k_factor)
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_scores(matches=matches_shuffle)
    return ranker_shuffle.players

In [11]:
# `matches` enters this cell as the polars match table and is rebound to a list of
# Match objects. The guard makes the cell safe to re-run: once converted, the list
# (and the already computed `model_names`) are reused as they are.
if isinstance(matches, pl.DataFrame):
    # Every model seen on either side of a match.
    model_names = set(matches["model_a_name"].unique()) | set(matches["model_b_name"].unique())
    matches = [
        Match(
            row["model_a_name"],
            row["model_b_name"],
            compute_match_score(row["score_a"], row["score_b"]),
        )
        for row in matches.iter_rows(named=True)
    ]

# One ELO run per shuffle seed (0..99), keyed by that seed.
player_results = {
    shuffle_seed: get_shuffled_results(matches=matches, model_names=model_names, seed=shuffle_seed)  # type: ignore
    for shuffle_seed in range(100)
}

In [12]:
# Average, per model, the value stored under its name across all shuffled runs.
# Dividing by the actual number of runs keeps the mean correct if the number of
# seeds used to build `player_results` changes.
n_runs = len(player_results)
players_avg_ranking = {
    player_name: sum(results[player_name] for results in player_results.values()) / n_runs
    for player_name in model_names
}

In [13]:
# Print one "name : average" line per model, highest average first.
for player, ranking in sorted(players_avg_ranking.items(), key=lambda entry: entry[1], reverse=True):
    print(f"{player} : {ranking}")

gemini-2.0-flash-exp : 1156.1004278778405
gemma-3-27b : 1150.1716949861686
deepseek-v3-chat : 1125.495745778569
gemini-2.0-flash-001 : 1124.8876503681481
command-a : 1112.562974819661
llama-3.1-nemotron-70b-instruct : 1083.075774833557
gemma-3-12b : 1071.4604002616913
deepseek-r1 : 1065.478023218923
gemini-1.5-pro-002 : 1063.6057012365407
gemma-3-4b : 1059.9305784011249
gemini-1.5-pro-001 : 1056.5126722499294
mistral-small-3.1-24b : 1047.2671006303808
mistral-large-2411 : 1036.457386333585
claude-3-5-sonnet-v2 : 1024.2494895151578
llama-3.1-405b : 1017.5007553771991
o3-mini : 1014.4649910360977
gpt-4o-mini-2024-07-18 : 1007.8929045031135
gpt-4o-2024-08-06 : 1007.0763468989388
llama-3.3-70b : 999.6052152769726
jamba-1.5-large : 998.3568564325521
mistral-small-24b-instruct-2501 : 991.8242932923025
gemma-2-27b-it-q8 : 990.6543025310857
phi-4 : 987.2785605924513
deepseek-r1-distill-llama-70b : 979.4535948223928
llama-3.1-70b : 978.6077391509342
gemma-2-9b-it : 976.8990809209939
aya-expanse

In [14]:
from random import sample, seed

# Single ELO run on one seeded permutation of the matches (seed 42).
seed(42)
matches_shuffle = sample(matches, k=len(matches))

# Fresh ranker: register every model, replay the shuffled matches, show the scores.
ranker_shuffle = ELORanker(K=40)
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
ranker_shuffle.get_scores()

{'deepseek-v3-chat': 1159.2933220629432,
 'gemma-3-27b': 1131.4826054838509,
 'gemini-2.0-flash-exp': 1115.5434587195246,
 'command-a': 1112.228637384357,
 'deepseek-r1': 1108.0014373805302,
 'gemma-3-4b': 1106.4568188126573,
 'gemini-1.5-pro-002': 1099.3880616353367,
 'mistral-small-3.1-24b': 1090.3709302131715,
 'gemini-1.5-pro-001': 1071.5089307256758,
 'gemma-3-12b': 1064.9502700017786,
 'aya-expanse-8b': 1061.035237293724,
 'llama-3.1-nemotron-70b-instruct': 1059.6540036015351,
 'o3-mini': 1053.3097242360948,
 'gemini-2.0-flash-001': 1036.8512236236552,
 'claude-3-5-sonnet-v2': 1021.0011212701257,
 'qwq-32b': 1003.6707106743951,
 'phi-4': 1000.4463589369606,
 'llama-3.1-70b': 998.2657938646969,
 'mistral-large-2411': 998.0276305624897,
 'jamba-1.5-large': 995.0334584722015,
 'mistral-small-24b-instruct-2501': 990.2609533875658,
 'gemma-2-9b-it': 983.9516374897088,
 'llama-3.1-405b': 983.9372880182531,
 'deepseek-r1-distill-llama-70b': 980.0967373139342,
 'gemma-2-27b-it-q8': 979.2

In [15]:
# Single ELO run on another seeded permutation of the matches (seed 1337).
seed(1337)
matches_shuffle = sample(matches, k=len(matches))

# Fresh ranker: register every model, replay the shuffled matches, show the scores.
ranker_shuffle = ELORanker(K=40)
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
ranker_shuffle.get_scores()

{'deepseek-v3-chat': 1192.4519647836078,
 'gemma-3-27b': 1172.5505507546152,
 'gemini-2.0-flash-exp': 1149.1984235473246,
 'llama-3.1-nemotron-70b-instruct': 1148.5766871691976,
 'gemini-2.0-flash-001': 1147.658799637024,
 'gemma-3-12b': 1119.050781689956,
 'claude-3-5-sonnet-v2': 1083.7977330173142,
 'gemini-1.5-pro-002': 1078.9683706881383,
 'command-a': 1077.4824644476864,
 'gemma-3-4b': 1052.7087714093007,
 'deepseek-r1': 1034.5443067702472,
 'mistral-small-3.1-24b': 1033.367189127272,
 'llama-3.3-70b': 1019.0032678674577,
 'gpt-4o-mini-2024-07-18': 1008.0489077877519,
 'gemma-2-27b-it-q8': 1006.5294334292803,
 'c4ai-command-r-08-2024': 999.1302972363653,
 'mistral-small-24b-instruct-2501': 997.0482454845783,
 'jamba-1.5-large': 996.0236493428928,
 'llama-3.1-405b': 990.9570812984107,
 'gemini-1.5-pro-001': 990.4876389890597,
 'qwq-32b': 983.8871814740564,
 'qwen2.5-7b-instruct': 982.0868234985542,
 'o3-mini': 978.350235620246,
 'mistral-large-2411': 978.188433626538,
 'aya-expanse

In [16]:
from rank_comparia.maximum_likelihood import MaximumLikelihoodRanker

# Score the same match list with the maximum-likelihood ranker. The matches are
# passed in their stored order (no shuffle, unlike the ELO cells), and no players
# are registered beforehand.
ranker = MaximumLikelihoodRanker()
ranker.compute_scores(matches=matches)
ranker.get_scores()

{'gemini-2.0-flash-exp': np.float64(1071.5911431488494),
 'gemma-3-27b': np.float64(1070.6149512161023),
 'deepseek-v3-chat': np.float64(1058.7552040622897),
 'gemini-2.0-flash-001': np.float64(1058.5208268807687),
 'command-a': np.float64(1050.569559388118),
 'llama-3.1-nemotron-70b-instruct': np.float64(1038.0127107423189),
 'gemma-3-12b': np.float64(1034.5409904079895),
 'deepseek-r1': np.float64(1031.5872191591507),
 'gemma-3-4b': np.float64(1030.1160721559013),
 'gemini-1.5-pro-002': np.float64(1028.6351487602299),
 'gemini-1.5-pro-001': np.float64(1025.330383327195),
 'mistral-small-3.1-24b': np.float64(1019.7461220961129),
 'mistral-large-2411': np.float64(1018.3135817966071),
 'llama-3.1-405b': np.float64(1009.1366953720458),
 'claude-3-5-sonnet-v2': np.float64(1006.6935189502087),
 'gpt-4o-mini-2024-07-18': np.float64(1005.1632698064324),
 'o3-mini': np.float64(1004.5772402173617),
 'gpt-4o-2024-08-06': np.float64(1002.5159238018007),
 'llama-3.3-70b': np.float64(1000.53239396

## Bootstrap

In [18]:
# Display the set of all model names seen on either side of a match.
model_names

{'aya-expanse-8b',
 'c4ai-command-r-08-2024',
 'chocolatine-2-14b-instruct-v2.0.3-q8',
 'claude-3-5-sonnet-v2',
 'command-a',
 'deepseek-r1',
 'deepseek-r1-distill-llama-70b',
 'deepseek-v3-chat',
 'gemini-1.5-pro-001',
 'gemini-1.5-pro-002',
 'gemini-2.0-flash-001',
 'gemini-2.0-flash-exp',
 'gemma-2-27b-it-q8',
 'gemma-2-9b-it',
 'gemma-3-12b',
 'gemma-3-27b',
 'gemma-3-4b',
 'gpt-4o-2024-08-06',
 'gpt-4o-mini-2024-07-18',
 'hermes-3-llama-3.1-405b',
 'jamba-1.5-large',
 'lfm-40b',
 'llama-3.1-405b',
 'llama-3.1-70b',
 'llama-3.1-8b',
 'llama-3.1-nemotron-70b-instruct',
 'llama-3.3-70b',
 'ministral-8b-instruct-2410',
 'mistral-large-2411',
 'mistral-nemo-2407',
 'mistral-small-24b-instruct-2501',
 'mistral-small-3.1-24b',
 'mixtral-8x22b-instruct-v0.1',
 'mixtral-8x7b-instruct-v0.1',
 'o3-mini',
 'phi-3.5-mini-instruct',
 'phi-4',
 'qwen2.5-7b-instruct',
 'qwen2.5-coder-32b-instruct',
 'qwq-32b'}

In [19]:
# Bootstrap the ELO scores. The returned table (displayed in the next cell) has one
# row per model with `median`, `p2.5` and `p97.5` columns.
ranker = ELORanker(K=40)

# model_names is a set; the ignore presumably silences a list-typed parameter — confirm.
ranker.add_players(model_names)  # type: ignore
scores = ranker.compute_bootstrap_scores(matches=matches)

Processing bootstrap samples.: 100%|██████████| 100/100 [00:03<00:00, 31.00it/s]


In [20]:
# Display the ELO bootstrap summary computed in the previous cell.
scores

model,median,p2.5,p97.5
str,f64,f64,f64
"""gemini-2.0-flash-exp""",1151.630602,1089.012649,1214.7701
"""gemma-3-27b""",1156.796602,1065.642393,1231.398323
"""command-a""",1108.67657,1028.633901,1193.283863
"""deepseek-v3-chat""",1126.174772,1042.820364,1207.251953
"""gemini-2.0-flash-001""",1125.913368,1043.369645,1187.758266
…,…,…,…
"""qwen2.5-7b-instruct""",924.392659,849.701241,993.556433
"""qwen2.5-coder-32b-instruct""",942.755426,863.813279,1015.934088
"""mixtral-8x22b-instruct-v0.1""",869.044198,792.073653,963.989966
"""mistral-nemo-2407""",866.358855,788.742152,925.149116


In [21]:
# Same bootstrap with the maximum-likelihood ranker. NOTE: this rebinds `ranker` and
# `scores`, so the ELO bootstrap table is no longer available after this cell runs.
# In the recorded run this took about 1 min 40 s for 100 samples.
ranker = MaximumLikelihoodRanker()
scores = ranker.compute_bootstrap_scores(matches=matches)

Processing bootstrap samples.: 100%|██████████| 100/100 [01:40<00:00,  1.01s/it]


In [22]:
# Display the maximum-likelihood bootstrap summary computed in the previous cell.
scores

model,median,p2.5,p97.5
str,f64,f64,f64
"""gemma-3-27b""",1071.538146,1050.597322,1087.437122
"""command-a""",1051.795832,1026.492324,1070.682596
"""gemini-2.0-flash-001""",1058.029991,1047.162266,1070.90866
"""gemini-2.0-flash-exp""",1072.226837,1064.511494,1080.581669
"""deepseek-v3-chat""",1058.470982,1051.058605,1065.727889
…,…,…,…
"""lfm-40b""",954.996164,947.065501,963.425475
"""mixtral-8x7b-instruct-v0.1""",951.471693,938.301479,961.632723
"""mixtral-8x22b-instruct-v0.1""",939.928828,933.506206,946.224519
"""mistral-nemo-2407""",937.869592,931.327491,945.52133
