In [1]:
import os
import polars as pl
from getpass import getpass

hf_token = getpass()
os.environ["HF_HUB_CACHE"] = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"
os.environ["HF_DATASETS_CACHE"] = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"
os.environ["HF_TOKEN"] = hf_token

In [2]:
import datasets

comparia = datasets.load_dataset(
    "ministere-culture/comparia-reactions",
    cache_dir="/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/",
    split="train",
)

In [3]:
comparia: pl.DataFrame = comparia.to_polars()  # type: ignore

In [4]:
comparia_model_a = (
    comparia.group_by(["model_a_name"])
    .len()
    .sort("len", descending=True)
    .with_columns(model_name="model_a_name")
    .drop("model_a_name")
)
comparia_model_b = (
    comparia.group_by(["model_b_name"])
    .len()
    .sort("len", descending=True)
    .with_columns(model_name="model_b_name")
    .drop("model_b_name")
)
number_by_model = (
    pl.concat([comparia_model_a, comparia_model_b]).group_by("model_name").sum().sort("len", descending=True)
)

In [5]:
number_by_model

model_name,len
str,u32
"""gpt-4o-2024-08-06""",3894
"""deepseek-v3-chat""",3857
"""gpt-4o-mini-2024-07-18""",3816
"""claude-3-5-sonnet-v2""",3514
"""llama-3.1-405b""",3409
…,…
"""gemma-3-12b""",408
"""mistral-small-3.1-24b""",383
"""gemma-3-4b""",381
"""gemma-2-27b-it-q8""",296


In [6]:
from rank_comparia.data_transformation import get_matches_with_score, get_winners, get_winrates

matches = get_matches_with_score(comparia)

In [7]:
matches.head(5)

model_a_name,model_b_name,conversation_pair_id,score_a,score_b
str,str,str,i64,i64
"""mistral-small-24b-instruct-250…","""mistral-large-2411""","""71e04e857dfc45319c259493c3bd33…",0,1
"""gemini-1.5-pro-002""","""gemini-2.0-flash-exp""","""abf3a7f173a94365ac2a86766d5f8a…",-4,5
"""chocolatine-2-14b-instruct-v2.…","""command-a""","""4bd0e7f4cb2440d6ab17dc5a15e1ee…",-2,1
"""mistral-large-2411""","""claude-3-5-sonnet-v2""","""a39fc656efe44dc09304f599453c43…",-1,3
"""llama-3.1-8b""","""gemma-2-9b-it""","""8f9f1ccfddd24e5a8fb51982970909…",0,2


In [8]:
winners = get_winners(matches)

In [9]:
winrates = get_winrates(winners)
winrates.sort("winrate", descending=True)

model_name,len,wins,winrate
str,u32,u32,f64
"""gemini-2.0-flash-exp""",856,647,75.584112
"""gemma-3-27b""",275,202,73.454545
"""deepseek-v3-chat""",1511,1065,70.483124
"""gemini-2.0-flash-001""",434,301,69.354839
"""command-a""",208,141,67.788462
…,…,…,…
"""mixtral-8x7b-instruct-v0.1""",585,222,37.948718
"""lfm-40b""",887,321,36.189402
"""mixtral-8x22b-instruct-v0.1""",1459,445,30.500343
"""mistral-nemo-2407""",1440,430,29.861111


In [10]:
from rank_comparia.elo import ELORanker
from rank_comparia.ranker import Match, MatchScore
import random


def compute_match_score(score_a: int, score_b: int) -> MatchScore:
    final_score = score_b - score_a
    if final_score > 0:
        return MatchScore.B
    elif final_score < 0:
        return MatchScore.A
    else:
        return MatchScore.Draw


def get_shuffled_results(matches: list[Match], model_names: list[str], seed: int = 0):
    random.seed(seed)
    ranker_shuffle = ELORanker(K=40)
    matches_shuffle = random.sample(matches, k=len(matches))
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_scores(matches=matches_shuffle)
    return ranker_shuffle.players

In [11]:
model_names = set(matches["model_a_name"].unique()) | set(matches["model_b_name"].unique())
matches = [
    Match(
        match_dict["model_a_name"],
        match_dict["model_b_name"],
        compute_match_score(match_dict["score_a"], match_dict["score_b"]),
    )
    for match_dict in matches.to_dicts()
]

player_results = {
    seed: get_shuffled_results(matches=matches, model_names=model_names, seed=seed) for seed in range(100)  # type: ignore
}

In [12]:
players_avg_ranking = {
    player_name: sum(results[player_name] for results in player_results.values()) / 100 for player_name in model_names
}

In [13]:
for player, ranking in sorted(players_avg_ranking.items(), key=lambda x: -x[1]):
    print(f"{player} : {ranking}")

gemma-3-27b : 1156.0486512649727
gemini-2.0-flash-exp : 1150.1602912876206
deepseek-v3-chat : 1125.1919281182347
gemini-2.0-flash-001 : 1123.6845140222883
command-a : 1102.9467503281348
llama-3.1-nemotron-70b-instruct : 1079.7847337684125
gemma-3-12b : 1074.291038064285
gemini-1.5-pro-002 : 1069.8553168291314
deepseek-r1 : 1064.8458669324705
gemma-3-4b : 1058.0982386645562
gemini-1.5-pro-001 : 1053.8535513382164
mistral-small-3.1-24b : 1042.0602365840293
mistral-large-2411 : 1038.6511694741837
claude-3-5-sonnet-v2 : 1019.0033262775237
llama-3.1-405b : 1018.5229289903359
gpt-4o-mini-2024-07-18 : 1017.7388198070615
o3-mini : 1007.7727452827941
gpt-4o-2024-08-06 : 1005.4396846845273
jamba-1.5-large : 996.6073494344724
llama-3.3-70b : 996.5877927226035
mistral-small-24b-instruct-2501 : 994.1136438084618
phi-4 : 993.7654264555771
deepseek-r1-distill-llama-70b : 985.0189314843499
gemma-2-27b-it-q8 : 983.2483960577565
llama-3.1-70b : 982.5898172578559
gemma-2-9b-it : 978.6402321674244
aya-exp

In [14]:
from random import sample, seed

ranker_shuffle = ELORanker(K=40)

seed(42)
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
ranker_shuffle.get_scores()

{'gemini-2.0-flash-exp': 1154.1670879122732,
 'command-a': 1139.5466574147688,
 'gemma-3-12b': 1137.031875816478,
 'gemma-3-4b': 1136.335483342115,
 'gemini-2.0-flash-001': 1130.7334208177335,
 'gemma-3-27b': 1126.4946403630825,
 'gemini-1.5-pro-002': 1090.0302601576593,
 'deepseek-v3-chat': 1089.2898228891256,
 'mistral-small-3.1-24b': 1059.5915419240193,
 'deepseek-r1': 1042.6066504794112,
 'gemini-1.5-pro-001': 1028.501278401883,
 'llama-3.1-70b': 1027.0419065432473,
 'gpt-4o-mini-2024-07-18': 1023.1861996143211,
 'llama-3.1-nemotron-70b-instruct': 1021.3307252216715,
 'jamba-1.5-large': 1018.4061061470616,
 'claude-3-5-sonnet-v2': 1013.9789381278949,
 'llama-3.3-70b': 1007.373247030111,
 'gpt-4o-2024-08-06': 1002.870761825725,
 'qwq-32b': 1001.2230929385424,
 'o3-mini': 1000.6309714099089,
 'llama-3.1-405b': 996.4394335019336,
 'c4ai-command-r-08-2024': 991.9649408078149,
 'ministral-8b-instruct-2410': 987.3279426089935,
 'qwen2.5-7b-instruct': 987.2479906429548,
 'gemma-2-9b-it': 

In [15]:
ranker_shuffle = ELORanker(K=40)

seed(1337)
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
ranker_shuffle.get_scores()

{'gemini-2.0-flash-exp': 1221.793689083819,
 'gemma-3-12b': 1148.1826597289905,
 'gemma-3-27b': 1135.8888035423233,
 'command-a': 1121.4580982995947,
 'deepseek-v3-chat': 1121.3171590302288,
 'llama-3.1-nemotron-70b-instruct': 1098.386096698119,
 'deepseek-r1': 1070.7324007588047,
 'gemma-3-4b': 1063.279752807947,
 'mistral-large-2411': 1047.9870626308698,
 'mistral-small-3.1-24b': 1044.0944007008816,
 'claude-3-5-sonnet-v2': 1042.7785359594016,
 'gemini-1.5-pro-002': 1036.2798065845957,
 'mistral-small-24b-instruct-2501': 1034.236011889542,
 'gemini-2.0-flash-001': 1031.1886306339286,
 'gemini-1.5-pro-001': 1030.7096709576833,
 'o3-mini': 1018.6811133063709,
 'deepseek-r1-distill-llama-70b': 1017.0813047650392,
 'llama-3.3-70b': 1009.6888478013763,
 'gemma-2-9b-it': 1008.8204756820937,
 'llama-3.1-405b': 1007.491138858506,
 'llama-3.1-70b': 1006.1804391859696,
 'llama-3.1-8b': 994.1151087595802,
 'gpt-4o-mini-2024-07-18': 992.8972656379399,
 'ministral-8b-instruct-2410': 986.761978793

In [16]:
from rank_comparia.maximum_likelihood import MaximumLikelihoodRanker

ranker = MaximumLikelihoodRanker()
ranker.compute_scores(matches=matches)
ranker.get_scores()

{'gemini-2.0-flash-exp': np.float64(1071.5911431488407),
 'gemma-3-27b': np.float64(1070.6149512161712),
 'deepseek-v3-chat': np.float64(1058.7552040623111),
 'gemini-2.0-flash-001': np.float64(1058.5208268808342),
 'command-a': np.float64(1050.5695593880446),
 'llama-3.1-nemotron-70b-instruct': np.float64(1038.0127107423277),
 'gemma-3-12b': np.float64(1034.5409904078952),
 'deepseek-r1': np.float64(1031.587219159033),
 'gemma-3-4b': np.float64(1030.116072155743),
 'gemini-1.5-pro-002': np.float64(1028.6351487602242),
 'gemini-1.5-pro-001': np.float64(1025.3303833271143),
 'mistral-small-3.1-24b': np.float64(1019.7461220960782),
 'mistral-large-2411': np.float64(1018.3135817966249),
 'llama-3.1-405b': np.float64(1009.1366953721039),
 'claude-3-5-sonnet-v2': np.float64(1006.6935189502154),
 'gpt-4o-mini-2024-07-18': np.float64(1005.1632698064823),
 'o3-mini': np.float64(1004.5772402173053),
 'gpt-4o-2024-08-06': np.float64(1002.5159238018146),
 'llama-3.3-70b': np.float64(1000.53239396

## Bootstrap

In [17]:
model_names

{'aya-expanse-8b',
 'c4ai-command-r-08-2024',
 'chocolatine-2-14b-instruct-v2.0.3-q8',
 'claude-3-5-sonnet-v2',
 'command-a',
 'deepseek-r1',
 'deepseek-r1-distill-llama-70b',
 'deepseek-v3-chat',
 'gemini-1.5-pro-001',
 'gemini-1.5-pro-002',
 'gemini-2.0-flash-001',
 'gemini-2.0-flash-exp',
 'gemma-2-27b-it-q8',
 'gemma-2-9b-it',
 'gemma-3-12b',
 'gemma-3-27b',
 'gemma-3-4b',
 'gpt-4o-2024-08-06',
 'gpt-4o-mini-2024-07-18',
 'hermes-3-llama-3.1-405b',
 'jamba-1.5-large',
 'lfm-40b',
 'llama-3.1-405b',
 'llama-3.1-70b',
 'llama-3.1-8b',
 'llama-3.1-nemotron-70b-instruct',
 'llama-3.3-70b',
 'ministral-8b-instruct-2410',
 'mistral-large-2411',
 'mistral-nemo-2407',
 'mistral-small-24b-instruct-2501',
 'mistral-small-3.1-24b',
 'mixtral-8x22b-instruct-v0.1',
 'mixtral-8x7b-instruct-v0.1',
 'o3-mini',
 'phi-3.5-mini-instruct',
 'phi-4',
 'qwen2.5-7b-instruct',
 'qwen2.5-coder-32b-instruct',
 'qwq-32b'}

In [18]:
ranker = ELORanker(K=40)

ranker.add_players(model_names)  # type: ignore
scores = ranker.compute_bootstrap_scores(matches=matches)

Processing bootstrap samples: 100%|██████████| 100/100 [00:03<00:00, 29.76it/s]


In [19]:
scores

model,median,p2.5,p97.5
str,f64,f64,f64
"""gemma-3-27b""",1154.441609,1080.287601,1228.198377
"""gemini-2.0-flash-exp""",1166.362917,1088.292898,1228.393702
"""llama-3.1-nemotron-70b-instruc…",1082.746041,1016.218255,1163.795486
"""gemini-2.0-flash-001""",1127.974128,1045.808538,1217.288659
"""command-a""",1107.549563,1038.563985,1198.142902
…,…,…,…
"""qwq-32b""",955.571501,888.08977,1017.773079
"""mixtral-8x22b-instruct-v0.1""",867.894862,792.95239,938.15538
"""lfm-40b""",908.471049,836.608766,983.980811
"""mixtral-8x7b-instruct-v0.1""",894.57041,823.186301,962.992924


In [20]:
ranker = MaximumLikelihoodRanker()
scores = ranker.compute_bootstrap_scores(matches=matches)

Processing bootstrap samples: 100%|██████████| 100/100 [01:48<00:00,  1.09s/it]


In [21]:
scores

model,median,p2.5,p97.5
str,f64,f64,f64
"""gemma-3-27b""",1071.228735,1054.095027,1085.856631
"""gemini-2.0-flash-exp""",1070.70261,1062.689237,1078.417569
"""deepseek-v3-chat""",1059.31096,1052.828469,1065.3792
"""gemini-2.0-flash-001""",1059.401273,1047.658237,1071.575883
"""llama-3.1-nemotron-70b-instruc…",1037.970977,1029.47792,1048.945975
…,…,…,…
"""mixtral-8x7b-instruct-v0.1""",952.145416,940.388631,962.030301
"""phi-3.5-mini-instruct""",954.177657,936.796446,966.384446
"""mistral-nemo-2407""",937.481536,930.563496,942.746621
"""mixtral-8x22b-instruct-v0.1""",939.723636,930.659723,945.291229
