In [1]:
from pathlib import Path
import polars as pl

In [None]:
def _count_side(frame, side_column):
    """Count rows per model for one side of the comparison.

    Groups ``frame`` by ``side_column``, counts the rows of each group and
    returns a frame with the columns ``len`` and ``model_name`` (the values of
    ``side_column`` under a side-independent name), sorted by ``len`` in
    descending order.
    """
    counts = frame.group_by([side_column]).len().sort("len", descending=True)
    return counts.select("len", pl.col(side_column).alias("model_name"))


comparia = pl.read_parquet(Path(".").resolve().parent / "data" / "reactions.parquet")

# A model can appear either as model A or as model B of a row, so count each
# side on its own first.
comparia_model_a = _count_side(comparia, "model_a_name")
comparia_model_b = _count_side(comparia, "model_b_name")

# Stack both sides and add the two counts together per model.
number_by_model = (
    pl.concat([comparia_model_a, comparia_model_b])
    .group_by("model_name")
    .sum()
    .sort("len", descending=True)
)

In [3]:
number_by_model

model_name,len
str,u32
"""gpt-4o-2024-08-06""",3894
"""deepseek-v3-chat""",3206
"""llama-3.1-405b""",2920
"""gpt-4o-mini-2024-07-18""",2766
"""claude-3-5-sonnet-v2""",2760
…,…
"""gemini-2.0-flash-001""",282
"""jamba-1.5-large""",237
"""gemma-3-12b""",24
"""gemma-3-4b""",23


In [None]:
from pathlib import Path
from rank_comparia.data_transformation import get_matches_with_score, get_winners, get_winrates

# The reactions file lives in the `data` directory of the parent of the current
# working directory.
reactions_path = Path(".").resolve().parent / "data" / "reactions.parquet"
matches = get_matches_with_score(reactions_path)

In [5]:
# Derive the winners from the scored matches loaded above. The exact semantics
# live in rank_comparia.data_transformation.get_winners (not shown in this notebook).
winners = get_winners(matches)

In [6]:
# Compute the per-model winrates from `winners`.
winrates = get_winrates(winners)
# Display the table from highest to lowest winrate. The sorted result is not
# assigned, so `winrates` itself keeps the order returned by get_winrates
# (assuming a non-in-place, polars-style `sort` — TODO confirm).
winrates.sort("winrate", descending=True)

model_name,len,wins,winrate
str,u32,u32,f64
"""gemma-3-4b""",12,10,83.333333
"""gemini-2.0-flash-exp""",856,647,75.584112
"""deepseek-v3-chat""",1184,848,71.621622
"""gemini-2.0-flash-001""",127,89,70.07874
"""gemini-1.5-pro-001""",328,222,67.682927
…,…,…,…
"""mixtral-8x7b-instruct-v0.1""",585,222,37.948718
"""lfm-40b""",665,247,37.142857
"""mistral-nemo-2407""",1199,368,30.692244
"""mixtral-8x22b-instruct-v0.1""",1187,353,29.738837


In [12]:
from rank_comparia.elo import RankerELO
import random

# Every model that appears on either side of at least one match.
model_names = set(matches["model_a_name"].unique()).union(set(matches["model_b_name"].unique()))

# Flatten each match row into a (model_a_name, model_b_name, score_a, score_b)
# tuple, the record layout handed to the ranker below.
matches_list = [
    tuple(row[field] for field in ("model_a_name", "model_b_name", "score_a", "score_b"))
    for row in matches.to_dicts()
]


def get_shuffled_results(matches_list, model_names, seed=0):
    """Compute ELO results on a seeded random permutation of the matches.

    Args:
        matches_list: Sequence of match records handed to
            ``RankerELO.compute_ranks``. It is not modified; a shuffled copy
            is processed instead.
        model_names: Names registered through ``RankerELO.add_players`` before
            the matches are processed.
        seed: Seed of the shuffle. The same seed always yields the same
            permutation.

    Returns:
        The ``players`` attribute of the ranker once all matches have been
        processed (presumably a mapping from model name to rating — confirm
        against ``RankerELO``).
    """
    # Use a private generator instead of random.seed(): for a given seed the
    # permutation is the same as with the module-level functions, but the
    # global RNG state seen by the rest of the notebook is left untouched.
    rng = random.Random(seed)
    matches_shuffle = rng.sample(matches_list, k=len(matches_list))

    ranker_shuffle = RankerELO(K=40)
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_ranks(matches_list=matches_shuffle)
    return ranker_shuffle.players

In [17]:
# One full ELO computation per shuffle seed (0 through 99), keyed by that seed.
player_results = dict(
    (shuffle_seed, get_shuffled_results(matches_list=matches_list, model_names=model_names, seed=shuffle_seed))
    for shuffle_seed in range(100)
)

In [18]:
# Average each model's value over all shuffled runs. The divisor is the actual
# number of runs rather than a hard-coded 100, so the mean stays correct if the
# number of seeds used to build `player_results` changes.
players_avg_ranking = {
    player_name: sum(results[player_name] for results in player_results.values()) / len(player_results)
    for player_name in model_names
}

In [19]:
# Print the models from the highest to the lowest averaged value.
for player, ranking in sorted(players_avg_ranking.items(), key=lambda item: item[1], reverse=True):
    print(f"{player} : {ranking}")

gemini-2.0-flash-exp : 1169.781323403136
deepseek-v3-chat : 1141.1423126006355
gemini-2.0-flash-001 : 1116.575240785811
gemma-3-4b : 1104.6213522598753
llama-3.1-nemotron-70b-instruct : 1080.4550832487155
gemma-3-27b : 1067.4643333268157
gemini-1.5-pro-002 : 1065.8011522464794
gemini-1.5-pro-001 : 1060.8104081834147
mistral-large-2411 : 1050.1304917447856
gemma-3-12b : 1045.9515378774577
llama-3.1-405b : 1027.8624764354631
claude-3-5-sonnet-v2 : 1027.063795796701
gpt-4o-mini-2024-07-18 : 1025.2893501507383
llama-3.3-70b : 1010.7720856990284
gpt-4o-2024-08-06 : 1010.3616161810185
jamba-1.5-large : 1009.2998487178018
mistral-small-24b-instruct-2501 : 999.6523342483056
phi-4 : 998.6798758954953
gemma-2-27b-it-q8 : 996.4918085869062
gemma-2-9b-it : 988.2834499143715
llama-3.1-70b : 988.176783078807
aya-expanse-8b : 979.3331691215346
ministral-8b-instruct-2410 : 978.2844329390423
c4ai-command-r-08-2024 : 969.8939771834646
hermes-3-llama-3.1-405b : 950.5592849029724
llama-3.1-8b : 949.782970

In [None]:
from rank_comparia.elo import RankerELO
from random import sample, seed

# Single ELO run over one seeded permutation of the matches (seed 42).
model_names = set(matches["model_a_name"].unique()).union(set(matches["model_b_name"].unique()))
matches_list = [
    tuple(row[field] for field in ("model_a_name", "model_b_name", "score_a", "score_b"))
    for row in matches.to_dicts()
]

# Seed the module-level RNG so the permutation below is reproducible.
seed(42)
matches_suffle = sample(matches_list, len(matches_list))

ranker_shuffle = RankerELO(K=40)
ranker_shuffle.add_players(model_names)
ranker_shuffle.compute_ranks(matches_list=matches_suffle)
ranker_shuffle.get_all_rankings()

gemini-2.0-flash-exp : 1150.4700434124802
gemini-2.0-flash-001 : 1143.9304843712302
gpt-4o-mini-2024-07-18 : 1128.0470464832379
deepseek-v3-chat : 1120.819926137853
llama-3.1-nemotron-70b-instruct : 1100.979282658023
gemma-3-4b : 1082.8852329960412
gemini-1.5-pro-001 : 1081.4335874245248
gemini-1.5-pro-002 : 1080.9932600863472
gemma-3-27b : 1079.140483837468
mistral-large-2411 : 1066.2872477012352
llama-3.3-70b : 1053.6579622493116
phi-4 : 1049.9066244908154
gemma-3-12b : 1043.5684621754242
jamba-1.5-large : 1023.7193228271046
gpt-4o-2024-08-06 : 1016.9042562708933
claude-3-5-sonnet-v2 : 1015.0694564926664
c4ai-command-r-08-2024 : 989.2435047976638
llama-3.1-405b : 986.3112983835166
gemma-2-9b-it : 986.1843466884308
qwen2.5-coder-32b-instruct : 976.572124972731
mistral-small-24b-instruct-2501 : 966.149711898671
llama-3.1-70b : 963.4000275566317
aya-expanse-8b : 960.0425647311392
llama-3.1-8b : 947.226331260453
hermes-3-llama-3.1-405b : 937.501452933138
ministral-8b-instruct-2410 : 937.

In [10]:
from rank_comparia.elo import RankerELO
from random import sample, seed

# Single ELO run over one seeded permutation of the matches (seed 1337).
model_names = set(matches["model_a_name"].unique()).union(set(matches["model_b_name"].unique()))
matches_list = [
    tuple(row[field] for field in ("model_a_name", "model_b_name", "score_a", "score_b"))
    for row in matches.to_dicts()
]

# Seed the module-level RNG so the permutation below is reproducible.
seed(1337)
matches_suffle = sample(matches_list, len(matches_list))

ranker_shuffle = RankerELO(K=40)
ranker_shuffle.add_players(model_names)
ranker_shuffle.compute_ranks(matches_list=matches_suffle)
ranker_shuffle.get_all_rankings()

gemini-2.0-flash-exp : 1200.592947192445
deepseek-v3-chat : 1124.1437361967505
gemini-2.0-flash-001 : 1113.4882061588316
gemini-1.5-pro-001 : 1106.368064695688
gemma-3-4b : 1104.9780302254394
llama-3.1-nemotron-70b-instruct : 1077.4993648446539
gemma-3-27b : 1073.8445119546225
llama-3.1-70b : 1062.6915774021036
claude-3-5-sonnet-v2 : 1046.6285315942455
gpt-4o-mini-2024-07-18 : 1045.9983440073072
gemini-1.5-pro-002 : 1043.5379956858596
gemma-3-12b : 1041.4913314082753
c4ai-command-r-08-2024 : 1034.2841149353017
mistral-small-24b-instruct-2501 : 1031.9022093591066
mistral-large-2411 : 1029.6478028713036
jamba-1.5-large : 1015.123090971581
gemma-2-27b-it-q8 : 1010.1909846519338
llama-3.3-70b : 1007.0649490552951
phi-4 : 1006.7219536666561
gpt-4o-2024-08-06 : 1002.3084013563594
llama-3.1-405b : 993.675689573038
aya-expanse-8b : 983.2797636586968
gemma-2-9b-it : 980.5054275232847
qwen2.5-7b-instruct : 971.7295749095838
hermes-3-llama-3.1-405b : 942.4440351446967
ministral-8b-instruct-2410 :