# Chargement du jeu de données

In [None]:
import os
import polars as pl
from getpass import getpass

# Ask for the Hugging Face token interactively so it never lands in the notebook.
hf_token = getpass()

# Both Hugging Face cache variables point at the same shared directory.
hf_cache_dir = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"
os.environ.update(
    {
        "HF_HUB_CACHE": hf_cache_dir,
        "HF_DATASETS_CACHE": hf_cache_dir,
        "HF_TOKEN": hf_token,
    }
)

'import os\nimport polars as pl\nfrom getpass import getpass\n\nhf_token = getpass()\nos.environ["HF_HUB_CACHE"] = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"\nos.environ["HF_DATASETS_CACHE"] = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"\nos.environ["HF_TOKEN"] = hf_token'

In [None]:
import datasets

# Load the "train" split of the reactions dataset, using the shared cache directory.
dataset_repo = "ministere-culture/comparia-reactions"
dataset_cache_dir = "/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/"
comparia = datasets.load_dataset(dataset_repo, split="train", cache_dir=dataset_cache_dir)

'import datasets\n\ncomparia = datasets.load_dataset(\n    "ministere-culture/comparia-reactions",\n    cache_dir="/home/jupyterhub-users/shared/projet_comparia/huggingface_hub/",\n    split="train",\n)'

In [None]:
# Convert the loaded dataset to a polars DataFrame, rebinding the same name.
# The guard keeps the cell idempotent: once `comparia` is already a DataFrame,
# re-running the cell is a no-op instead of failing on a missing `to_polars`.
if not isinstance(comparia, pl.DataFrame):
    comparia: pl.DataFrame = comparia.to_polars()  # type: ignore

# Calcul des rankings

In [5]:
def _count_rows_for(side_column: str) -> pl.DataFrame:
    """Count rows of `comparia` per model for one side of the comparison.

    Returns a frame with columns `len` and `model_name`, sorted by `len` descending.
    """
    per_model = comparia.group_by([side_column]).len().sort("len", descending=True)
    return per_model.select("len", pl.col(side_column).alias("model_name"))


comparia_model_a = _count_rows_for("model_a_name")
comparia_model_b = _count_rows_for("model_b_name")

# Stack both sides, then add up the two counts of each model.
both_sides = pl.concat([comparia_model_a, comparia_model_b])
number_by_model = both_sides.group_by("model_name").sum().sort("len", descending=True)

In [6]:
# Display the per-model row counts (appearances as model A plus appearances as model B).
number_by_model

model_name,len
str,u32
"""gpt-4o-2024-08-06""",3894
"""deepseek-v3-chat""",3857
"""gpt-4o-mini-2024-07-18""",3816
"""claude-3-5-sonnet-v2""",3514
"""llama-3.1-405b""",3409
…,…
"""gemma-3-12b""",408
"""mistral-small-3.1-24b""",383
"""gemma-3-4b""",381
"""gemma-2-27b-it-q8""",296


In [7]:
from rank_comparia.data_transformation import get_matches_with_score, get_winners, get_winrates

# Build the match table from the reactions data; the preview below shows one row per
# conversation pair with the two model names and their scores (score_a, score_b).
matches = get_matches_with_score(comparia)

In [8]:
# Preview the first five matches.
matches.head(5)

model_a_name,model_b_name,conversation_pair_id,score_a,score_b
str,str,str,i64,i64
"""aya-expanse-8b""","""mixtral-8x22b-instruct-v0.1""","""0d343e022ffc480c904b9c72f9124e…",0,0
"""phi-3.5-mini-instruct""","""llama-3.1-70b""","""3f53ccb6592b4f44bede2a05d1dbdd…",1,0
"""llama-3.1-8b""","""gemini-2.0-flash-exp""","""6fb44fa3c1fc4e8cb785fe74ed606e…",-2,1
"""chocolatine-2-14b-instruct-v2.…","""claude-3-5-sonnet-v2""","""5270a7ab2c0f404ebf98b2ca24fd72…",-4,4
"""mistral-small-24b-instruct-250…","""llama-3.1-nemotron-70b-instruc…","""7c8e98541f724790856271617262f0…",2,3


In [9]:
# Derive the winners from the scored matches.
# NOTE(review): the shape of `winners` is not shown in this notebook — see get_winners.
winners = get_winners(matches)

In [10]:
# Per-model win rates; the output has columns model_name, len, wins and winrate.
winrates = get_winrates(winners)
# Displayed sorted by winrate, best first. The sorted frame is not assigned, so
# `winrates` itself keeps its original order.
winrates.sort("winrate", descending=True)

model_name,len,wins,winrate
str,u32,u32,f64
"""gemini-2.0-flash-exp""",856,647,75.584112
"""gemma-3-27b""",275,202,73.454545
"""deepseek-v3-chat""",1511,1065,70.483124
"""gemini-2.0-flash-001""",434,301,69.354839
"""command-a""",208,141,67.788462
…,…,…,…
"""mixtral-8x7b-instruct-v0.1""",585,222,37.948718
"""lfm-40b""",887,321,36.189402
"""mixtral-8x22b-instruct-v0.1""",1459,445,30.500343
"""mistral-nemo-2407""",1440,430,29.861111


In [11]:
from rank_comparia.elo import ELORanker
from rank_comparia.ranker import Match, MatchScore
import random


def compute_match_score(score_a: int, score_b: int) -> MatchScore:
    """Turn the two scores of a match into its outcome.

    Args:
        score_a: score of model A.
        score_b: score of model B.

    Returns:
        MatchScore.B if model B scored strictly higher, MatchScore.A if model A
        scored strictly higher, MatchScore.Draw when both scores are equal.
    """
    if score_b > score_a:
        return MatchScore.B
    if score_b < score_a:
        return MatchScore.A
    return MatchScore.Draw


def get_shuffled_results(matches: list[Match], model_names: list[str], seed: int = 0):
    """Run an ELO ranking (K=40) over a seeded random permutation of the matches.

    Args:
        matches: matches to rank; the input list is not modified.
        model_names: names of the players to register in the ranker.
        seed: seed of the permutation; the same seed gives the same match order.

    Returns:
        The ranker's `players` attribute after all shuffled matches were processed
        (used in this notebook as a mapping from model name to score).
    """
    # A private generator produces the same permutation as seeding the module-level
    # one, but does not reset the global `random` state for the rest of the notebook.
    rng = random.Random(seed)
    ranker_shuffle = ELORanker(K=40)
    matches_shuffle = rng.sample(matches, k=len(matches))
    ranker_shuffle.add_players(model_names)
    ranker_shuffle.compute_scores(matches=matches_shuffle)
    return ranker_shuffle.players

In [12]:
# `matches` arrives here as a polars DataFrame and is rebound to a list of Match
# objects. The isinstance guard makes the cell safe to re-run: on a second execution
# `matches` is already the list, so the conversion (which needs the DataFrame) is skipped
# and the previously computed `model_names` is kept.
if isinstance(matches, pl.DataFrame):
    # Every model that appears on either side of a match.
    model_names = set(matches["model_a_name"].unique()) | set(matches["model_b_name"].unique())
    matches = [
        Match(
            match_dict["model_a_name"],
            match_dict["model_b_name"],
            compute_match_score(match_dict["score_a"], match_dict["score_b"]),
        )
        for match_dict in matches.to_dicts()
    ]

# One ELO run per seed, each over a different permutation of the same matches.
player_results = {
    seed: get_shuffled_results(matches=matches, model_names=model_names, seed=seed) for seed in range(100)  # type: ignore
}

In [13]:
# Average each model's score over all shuffled runs. The divisor is the actual number
# of runs in `player_results`, so the mean stays correct if the number of seeds changes.
players_avg_ranking = {
    player_name: sum(results[player_name] for results in player_results.values()) / len(player_results)
    for player_name in model_names
}

In [14]:
# Print the averaged scores, highest first.
ranked_players = sorted(players_avg_ranking.items(), key=lambda item: item[1], reverse=True)
for player, ranking in ranked_players:
    print(f"{player} : {ranking}")

gemini-2.0-flash-exp : 1153.1917521849846
gemma-3-27b : 1150.4243228233127
deepseek-v3-chat : 1126.568574007991
gemini-2.0-flash-001 : 1120.580618263385
command-a : 1110.8499310552359
llama-3.1-nemotron-70b-instruct : 1080.0654812644425
gemma-3-12b : 1074.3541247061266
deepseek-r1 : 1065.7916869459375
gemma-3-4b : 1061.6427653019982
gemini-1.5-pro-002 : 1053.2831114378766
gemini-1.5-pro-001 : 1051.1822147050527
mistral-small-3.1-24b : 1040.1165001938648
mistral-large-2411 : 1038.283070801158
gpt-4o-mini-2024-07-18 : 1016.5955618293206
llama-3.1-405b : 1015.3381865046157
claude-3-5-sonnet-v2 : 1014.5216611944884
o3-mini : 1012.8998439472485
llama-3.3-70b : 1006.6974833398053
gpt-4o-2024-08-06 : 1004.6584967327492
mistral-small-24b-instruct-2501 : 1000.9951336813177
jamba-1.5-large : 996.1224560427638
phi-4 : 993.5682938562879
llama-3.1-70b : 987.3309540121828
gemma-2-27b-it-q8 : 985.8051154675218
deepseek-r1-distill-llama-70b : 980.7221978560152
gemma-2-9b-it : 979.1377307864168
aya-exp

In [15]:
from random import sample, seed

# Single ELO run on one permutation of the matches (seed 42). These are the same steps
# as get_shuffled_results, written inline, with get_scores() displayed at the end.
ranker_shuffle = ELORanker(K=40)

# Seed the module-level generator right before sampling so the permutation is repeatable.
seed(42)
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
ranker_shuffle.get_scores()

{'deepseek-v3-chat': 1189.7121843811944,
 'gemma-3-27b': 1184.2839363672344,
 'gemini-2.0-flash-exp': 1121.0705518542718,
 'gemini-2.0-flash-001': 1120.0818032951888,
 'gemini-1.5-pro-001': 1116.457652774753,
 'command-a': 1085.7927015314162,
 'gemini-1.5-pro-002': 1085.2308721253512,
 'llama-3.1-nemotron-70b-instruct': 1080.9896320034711,
 'deepseek-r1': 1067.4550685921402,
 'mistral-large-2411': 1067.3900016647794,
 'gemma-3-12b': 1063.6642148662647,
 'gemma-2-27b-it-q8': 1052.5065862212975,
 'gemma-3-4b': 1049.9509997819796,
 'qwq-32b': 1040.7807222085173,
 'o3-mini': 1036.1997403131288,
 'claude-3-5-sonnet-v2': 1020.1520903021325,
 'gpt-4o-mini-2024-07-18': 1018.4135685231082,
 'llama-3.1-405b': 1011.6994140035113,
 'gpt-4o-2024-08-06': 994.9352688883218,
 'mistral-small-24b-instruct-2501': 977.1611384938334,
 'mistral-small-3.1-24b': 976.9032263527009,
 'phi-3.5-mini-instruct': 974.5508536779101,
 'llama-3.3-70b': 970.2526924757079,
 'jamba-1.5-large': 966.3613565003008,
 'llama-3

In [16]:
# Same single-run ranking with another seed (1337) to show how much the scores move
# with match order. This rebinds `ranker_shuffle`; the instance created here is the one
# read later when the ELO score table is built.
ranker_shuffle = ELORanker(K=40)

# Relies on `sample` and `seed` imported from `random` in the previous cell.
seed(1337)
matches_shuffle = sample(matches, k=len(matches))
ranker_shuffle.add_players(model_names)  # type: ignore
ranker_shuffle.compute_scores(matches=matches_shuffle)
ranker_shuffle.get_scores()

{'gemini-2.0-flash-exp': 1187.2567752933778,
 'gemma-3-27b': 1130.4482957390255,
 'deepseek-v3-chat': 1109.5086040897133,
 'deepseek-r1': 1094.8254083367322,
 'gemini-2.0-flash-001': 1092.5100639601135,
 'gemma-3-12b': 1084.9237133106296,
 'command-a': 1078.3891750326056,
 'llama-3.1-405b': 1077.5172994913676,
 'mistral-large-2411': 1075.6668303885447,
 'llama-3.1-nemotron-70b-instruct': 1052.522522165146,
 'phi-4': 1052.1330917389332,
 'gemini-1.5-pro-002': 1043.367789364427,
 'gpt-4o-2024-08-06': 1040.7748146207252,
 'gemini-1.5-pro-001': 1037.701988282026,
 'llama-3.3-70b': 1029.5276520990058,
 'claude-3-5-sonnet-v2': 1029.3040448608099,
 'gpt-4o-mini-2024-07-18': 1016.0233377113445,
 'mistral-small-3.1-24b': 1014.6516071792289,
 'gemma-3-4b': 1003.9334853904393,
 'ministral-8b-instruct-2410': 995.3251445393876,
 'o3-mini': 990.2755142940731,
 'gemma-2-9b-it': 988.0128484491373,
 'jamba-1.5-large': 984.0416915670814,
 'gemma-2-27b-it-q8': 983.5179013775755,
 'mistral-small-24b-instr

# Calcul frugalité

In [17]:
# Read only the columns needed for the consumption computation and keep a single row
# per conversation pair. maintain_order=True keeps the rows in file order, so the
# preview below (and any order-sensitive step downstream) is reproducible across runs.
conv_infos = pl.read_parquet(
    "../data/conversations.parquet",
    columns=[
        "id",
        "conversation_pair_id",
        "model_a_name",
        "model_b_name",
        "model_pair_name",
        "total_conv_a_kwh",
        "total_conv_b_kwh",
    ],
).unique(subset="conversation_pair_id", keep="first", maintain_order=True)

conv_infos.head(3)

id,conversation_pair_id,model_a_name,model_b_name,model_pair_name,total_conv_a_kwh,total_conv_b_kwh
i64,str,str,str,str,f64,f64
53859,"""6673f2f88dae46498fd37cb43a6cce…","""ministral-8b-instruct-2410""","""mistral-large-2411""","""{ministral-8b-instruct-2410,mi…",0.002596,0.01268
24309,"""f615f573bf674c37a9b354cb80baa1…","""gpt-4o-2024-08-06""","""ministral-8b-instruct-2410""","""gpt-4o-2024-08-06,ministral-8b…",0.0773955,0.006029
44528,"""2c64ef7296564a8a89fc06960fe229…","""aya-expanse-8b""","""gpt-4o-mini-2024-07-18""","""{aya-expanse-8b,gpt-4o-mini-20…",0.001304,0.00093


In [18]:
from rank_comparia.frugality import calculate_frugality_score, draw_chart

# Per-model consumption computed from the conversation data and the per-model counts;
# the output has columns model_name, conso_all_conv, len and mean_conso.
# NOTE(review): presumably mean_conso = conso_all_conv / len — confirm in calculate_frugality_score.
frugal_scores = calculate_frugality_score(conv_infos, number_by_model, mean=True)

frugal_scores

model_name,conso_all_conv,len,mean_conso
str,f64,u32,f64
"""llama-3.1-405b""",2332.336888,3409,0.68417
"""gemini-1.5-pro-002""",1233.994754,3011,0.409829
"""claude-3-5-sonnet-v2""",1007.772718,3514,0.286788
"""hermes-3-llama-3.1-405b""",949.987392,2349,0.404422
"""deepseek-v3-chat""",575.585088,3857,0.149231
…,…,…,…
"""qwen2.5-7b-instruct""",9.735039,756,0.012877
"""aya-expanse-8b""",7.210222,965,0.007472
"""gemma-3-4b""",7.148588,381,0.018763
"""gemma-2-27b-it-q8""",7.010399,296,0.023684


In [19]:
# Tabulate the scores held by the most recent `ranker_shuffle`, best model first.
final_players = ranker_shuffle.players
elo_table = pl.DataFrame(
    {"model_name": list(final_players.keys()), "elo_score": list(final_players.values())},
    strict=False,
)
elo_scores = elo_table.sort("elo_score", descending=True)

elo_scores

model_name,elo_score
str,f64
"""gemini-2.0-flash-exp""",1187.256775
"""gemma-3-27b""",1130.448296
"""deepseek-v3-chat""",1109.508604
"""deepseek-r1""",1094.825408
"""gemini-2.0-flash-001""",1092.510064
…,…
"""llama-3.1-8b""",898.263064
"""lfm-40b""",891.344808
"""chocolatine-2-14b-instruct-v2.…",871.436348
"""mixtral-8x7b-instruct-v0.1""",865.656169


In [20]:
from pathlib import Path

# Model metadata (display name, organization, license) lives in the `data` folder
# next to the current working directory.
models_data_path = Path(".").resolve().parent / "data" / "models_data.json"
info_model = pl.read_json(source=models_data_path)

info_model

name,model_name,organization,license
str,str,str,str
"""GPT-4o""","""chatgpt-4o-2024-08-06""","""OpenAI""","""Proprietary"""
"""Yi-1.5 9B Chat""","""Yi-1.5-9B-Chat""","""01 AI""","""Proprietary"""
"""Claude 3.5 Sonnet V2""","""claude-3-5-sonnet-v2""","""Anthropic""","""Proprietary"""
"""GPT-4o mini""","""gpt-4o-mini-2024-07-18""","""OpenAI""","""Proprietary"""
"""Gemini 2.0 Flash""","""gemini-2.0-flash-exp""","""Google""","""Proprietary"""
…,…,…,…
"""Phi 4""","""phi-4""","""Microsoft""","""MIT"""
"""QwQ 32B""","""qwq-32b""","""Alibaba""","""Apache 2.0"""
"""Aya 8b""","""aya-23-8b""","""Cohere""","""CC-BY-NC-4.0"""
"""Llama 3 8B""","""Meta-Llama-3-8B-Instruct""","""Meta""","""llama3"""


In [None]:
# Combine metadata, ELO scores and consumption. Both joins are inner joins, so a model
# is kept only when its `model_name` is present in all three tables.
final_df = info_model.join(elo_scores, on="model_name").join(frugal_scores, on="model_name")

# Inner joins drop non-matching rows silently (for example when the metadata file spells
# a model name differently from the dataset). Report ranked models that did not make it
# into the final table instead of losing them unnoticed.
dropped_models = elo_scores.join(final_df, on="model_name", how="anti")["model_name"].to_list()
if dropped_models:
    print(f"Ranked models missing from final_df (no metadata or consumption match): {sorted(dropped_models)}")

final_df

name,model_name,organization,license,elo_score,conso_all_conv,len,mean_conso
str,str,str,str,f64,f64,u32,f64
"""Llama 3.1 405B""","""llama-3.1-405b""","""Meta""","""Llama 3.1 Community""",1077.517299,2332.336888,3409,0.68417
"""Gemini 1.5 pro 002""","""gemini-1.5-pro-002""","""Google""","""Proprietary""",1043.367789,1233.994754,3011,0.409829
"""Claude 3.5 Sonnet V2""","""claude-3-5-sonnet-v2""","""Anthropic""","""Proprietary""",1029.304045,1007.772718,3514,0.286788
"""Hermes 3""","""hermes-3-llama-3.1-405b""","""Nous Research""","""llama3""",955.211798,949.987392,2349,0.404422
"""DeepSeek-V3 Chat""","""deepseek-v3-chat""","""DeepSeek""","""DeepSeek""",1109.508604,575.585088,3857,0.149231
…,…,…,…,…,…,…,…
"""Qwen2.5-7B""","""qwen2.5-7b-instruct""","""Alibaba""","""Apache 2.0""",956.398131,9.735039,756,0.012877
"""Aya-Expanse-8B""","""aya-expanse-8b""","""Cohere""","""CC-BY-NC-4.0""",923.519774,7.210222,965,0.007472
"""Gemma 3 4B""","""gemma-3-4b""","""Google""","""Gemma license""",1003.933485,7.148588,381,0.018763
"""Gemma 2 27B q8""","""gemma-2-27b-it-q8""","""Google""","""Gemma license""",983.517901,7.010399,296,0.023684


In [22]:
# Draw the chart from the combined table, with an empty title.
# NOTE(review): frugal_scores was computed with mean=True but the chart is drawn with
# mean=False — confirm this is intended. `log=True` presumably selects a log scale; verify.
draw_chart(final_df, title="", log=True, mean=False)

In [None]:
from rank_comparia.utils import save_data, save_chart

# Export the combined table and its chart; both calls target the same folder.
output_dir = "../data"

save_data(data=final_df, title="frugality", savepath=output_dir)

save_chart(final_df, "frugality representation", log=True, savepath=output_dir)